Phase 5 — Evaluation and Security · 6 min read

Observability & Tracing with LangSmith and Phoenix

Phase 5 of 8

You've built agents and RAG systems. But how do you know they're working well? How do you debug when things go wrong? Welcome to observability - seeing exactly what your AI systems are doing.

Coming from Software Engineering? LangSmith and Phoenix are the Datadog/New Relic/Jaeger of the AI world. They provide distributed tracing for LLM calls — each trace shows the full chain of prompts, retrievals, and tool calls, just like a distributed trace shows HTTP hops across microservices. If you've used OpenTelemetry, structured logging, or APM tools, you already understand the "instrument, collect, visualize, alert" pipeline. The concepts are identical; the telemetry data just includes tokens and prompts instead of HTTP status codes.


Why Observability Matters

What you can see:

  • Every LLM call (prompts, responses, tokens, latency)
  • Tool executions
  • Retrieval operations
  • Error traces
  • Cost breakdowns

LangSmith: Production Tracing

Setup

pip install langsmith langchain langchain-openai
# script_id: day_056_langsmith_phoenix/langsmith_traced_pipeline
import os

# Configure LangSmith before any LangChain object is created: tracing is
# switched on purely via environment variables, so they must be set first.
langsmith_env = {
    "LANGCHAIN_TRACING_V2": "true",
    "LANGCHAIN_API_KEY": "your-langsmith-api-key",
    "LANGCHAIN_PROJECT": "my-ai-project",
}
os.environ.update(langsmith_env)

Basic Tracing

# script_id: day_056_langsmith_phoenix/langsmith_traced_pipeline
from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage

# With LANGCHAIN_TRACING_V2=true set, LangSmith traces every LangChain
# operation automatically -- no extra instrumentation code is needed.
llm = ChatOpenAI(model="gpt-4o-mini")

# NOTE: this performs a live OpenAI API call; the resulting trace
# (prompt, response, tokens, latency) appears in the LangSmith project.
response = llm.invoke([HumanMessage(content="What is AI?")])
print(response.content)

# Check LangSmith dashboard to see the trace!

Tracing Custom Functions

# script_id: day_056_langsmith_phoenix/langsmith_traced_pipeline
from langsmith import traceable

@traceable(name="my_rag_pipeline")
def rag_query(question: str) -> str:
    """Run the full RAG pipeline (embed -> retrieve -> generate).

    Each helper is itself @traceable, so the three steps show up as
    nested child runs under this parent trace.
    """
    question_vector = embed_question(question)
    relevant_docs = retrieve_documents(question_vector)
    return generate_answer(question, relevant_docs)

@traceable(name="embed_question")
def embed_question(question: str) -> list:
    """Turn the question into an embedding vector (stubbed out here)."""
    # Placeholder: a real implementation would call an embedding model.
    dummy_vector = [0.1, 0.2, 0.3]
    return dummy_vector

@traceable(name="retrieve_documents")
def retrieve_documents(embedding: list) -> list:
    """Fetch documents relevant to the embedded query (stubbed out here)."""
    # Placeholder: a real implementation would query a vector store.
    hits = ["doc1", "doc2"]
    return hits

@traceable(name="generate_answer")
def generate_answer(question: str, docs: list) -> str:
    """Ask the LLM to answer the question grounded in the retrieved docs."""
    model = ChatOpenAI(model="gpt-4o-mini")
    joined_docs = "\n".join(docs)
    prompt = HumanMessage(content=f"Context: {joined_docs}\n\nQuestion: {question}")
    reply = model.invoke([prompt])
    return reply.content

# When you call this, all steps are traced! Each @traceable helper
# appears as a nested child run under "my_rag_pipeline" in LangSmith.
answer = rag_query("What is machine learning?")

Adding Metadata

# script_id: day_056_langsmith_phoenix/langsmith_metadata
from langsmith import traceable

@traceable(
    name="production_query",
    tags=["production", "v2"],
    metadata={"version": "2.0", "team": "ml"}
)
def production_query(user_id: str, query: str) -> str:
    """Query with rich metadata for filtering.

    The decorator-level tags and metadata let you filter runs in the
    LangSmith UI. Stub: the body is omitted, so as written it returns
    None despite the declared ``-> str`` return type.
    """
    # ...
    pass

# Or add run-time metadata
from langsmith.run_helpers import get_current_run_tree

@traceable
def process_query(query: str):
    """Attach run-time metadata and tags to the current traced run."""
    # get_current_run_tree() yields the active run only when called inside
    # a traced context; otherwise it returns a falsy value -- hence the guard.
    run = get_current_run_tree()
    if run:
        run.metadata["query_length"] = len(query)
        # NOTE(review): assumes run.tags is already a list -- confirm the
        # installed langsmith version guarantees this (it may be None).
        run.tags.append("short" if len(query) < 50 else "long")
    # ...

Phoenix: Open-Source Tracing

Phoenix is a free, open-source alternative:

Setup

pip install arize-phoenix opentelemetry-sdk opentelemetry-exporter-otlp
# script_id: day_056_langsmith_phoenix/phoenix_instrument_openai
import phoenix as px

# Launch the Phoenix collector and web UI in-process (opens web UI).
# Traces exported from this process show up at the printed URL.
session = px.launch_app()
print(f"Phoenix UI: {session.url}")

Instrument OpenAI

# script_id: day_056_langsmith_phoenix/phoenix_instrument_openai
from phoenix.otel import register
from openinference.instrumentation.openai import OpenAIInstrumentor

# Register an OpenTelemetry tracer provider that exports spans to the
# local Phoenix instance, then patch the OpenAI client so every API call
# emits a span automatically.
tracer_provider = register(project_name="my-project")
OpenAIInstrumentor().instrument(tracer_provider=tracer_provider)

# Now all OpenAI calls are traced!
from openai import OpenAI
client = OpenAI()

# Live API call -- the span (model, messages, tokens, latency) lands in
# the Phoenix UI.
response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": "Hello!"}]
)
# Check Phoenix UI to see traces
# Check Phoenix UI to see traces

Custom Spans

# script_id: day_056_langsmith_phoenix/phoenix_custom_spans
from opentelemetry import trace

# Module-level tracer; span names below appear in the Phoenix UI.
tracer = trace.get_tracer(__name__)

def my_pipeline(query: str):
    """Run a two-step pipeline, recording each step as a nested OTel span."""
    with tracer.start_as_current_span("my_pipeline") as pipeline_span:
        pipeline_span.set_attribute("query", query)

        # Each step gets its own child span so per-step latency is visible.
        with tracer.start_as_current_span("step_1"):
            intermediate = do_step_1(query)

        with tracer.start_as_current_span("step_2"):
            final_result = do_step_2(intermediate)

        pipeline_span.set_attribute("success", True)
        return final_result

Key Metrics to Track

Building a Metrics Dashboard

# script_id: day_056_langsmith_phoenix/metrics_dashboard
from dataclasses import dataclass, field
from datetime import datetime
from collections import defaultdict
import statistics

@dataclass
class QueryMetrics:
    query_id: str
    timestamp: datetime
    latency_ms: float
    input_tokens: int
    output_tokens: int
    total_cost: float
    success: bool
    error: str = None

class MetricsCollector:
    """Collect QueryMetrics records and compute summary statistics."""

    def __init__(self):
        # All recorded metrics, in insertion (chronological) order.
        self.metrics: list[QueryMetrics] = []

    def record(self, metrics: QueryMetrics):
        """Record a single query's metrics."""
        self.metrics.append(metrics)

    def summary(self, last_n: int | None = None) -> dict:
        """Summarize success rate, latency, cost, and token usage.

        Args:
            last_n: if given, only the most recent ``last_n`` records are
                summarized; otherwise all records are used. (Fix: was
                annotated plain ``int`` while defaulting to None.)

        Returns:
            A nested dict of statistics, or ``{"error": "No data"}`` when
            nothing has been recorded.
        """
        data = self.metrics[-last_n:] if last_n else self.metrics

        if not data:
            return {"error": "No data"}

        latencies = [m.latency_ms for m in data]
        costs = [m.total_cost for m in data]
        successes = [m.success for m in data]

        return {
            "total_queries": len(data),
            # bools sum as 0/1, so this is the fraction of successes.
            "success_rate": sum(successes) / len(successes),
            "latency": {
                "mean": statistics.mean(latencies),
                "median": statistics.median(latencies),
                # Nearest-rank p95; falls back to max() for small samples
                # where a percentile estimate would be meaningless.
                "p95": sorted(latencies)[int(len(latencies) * 0.95)] if len(latencies) > 20 else max(latencies),
            },
            "cost": {
                "total": sum(costs),
                "average": statistics.mean(costs),
            },
            "tokens": {
                "input_total": sum(m.input_tokens for m in data),
                "output_total": sum(m.output_tokens for m in data),
            }
        }

# Usage
collector = MetricsCollector()

# After each query, record one QueryMetrics row; summary() then
# aggregates whatever has been recorded so far.
collector.record(QueryMetrics(
    query_id="q123",
    timestamp=datetime.now(),
    latency_ms=1500,
    input_tokens=150,
    output_tokens=200,
    total_cost=0.001,
    success=True
))

print(collector.summary())

Debugging with Traces

Common Issues to Look For

Trace Analysis Code

# script_id: day_056_langsmith_phoenix/trace_analysis
def analyze_trace(trace: dict) -> dict:
    """Scan a single trace dict and flag latency, token-usage, error,
    and retrieval-quality problems.

    Returns a report dict with the trace id, the list of issues found,
    their count, and whether any issue is critical.
    """
    found = []

    def flag(issue_type: str, severity: str, details: str) -> None:
        # Small helper so each check below reads as a single line.
        found.append({"type": issue_type, "severity": severity, "details": details})

    # Latency: warn past the 5-second threshold.
    latency = trace.get("latency_ms", 0)
    if latency > 5000:
        flag("high_latency", "warning", f"Total latency {latency}ms exceeds 5s threshold")

    # Token usage: informational note for oversized prompts.
    tokens_in = trace.get("input_tokens", 0)
    if tokens_in > 3000:
        flag("high_token_usage", "info", f"Input tokens ({tokens_in}) are high, consider summarization")

    # Errors are always critical.
    error = trace.get("error")
    if error:
        flag("error", "critical", error)

    # Retrieval quality: even the best hit scoring under 0.7 is suspect.
    scores = trace.get("retrieval_scores", [])
    if scores and max(scores) < 0.7:
        flag("low_relevance", "warning", f"Best retrieval score ({max(scores):.2f}) is low")

    return {
        "trace_id": trace.get("id"),
        "issues": found,
        "issue_count": len(found),
        "has_critical": any(issue["severity"] == "critical" for issue in found),
    }

Summary

Also worth knowing: Langfuse is an open-source alternative to LangSmith that offers tracing, evaluation, and prompt management. It's self-hostable and has a generous free tier.


Quick Reference

# script_id: day_056_langsmith_phoenix/quick_reference
# LangSmith: tracing is enabled purely via environment variables.
import os
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_API_KEY"] = "key"

# Any function decorated with @traceable is recorded as a run.
from langsmith import traceable
@traceable
def my_function(): pass

# Phoenix: launch the local collector/UI, then register a tracer provider.
import phoenix as px
px.launch_app()

from phoenix.otel import register
register(project_name="my-project")

What's Next?

Now that you can see what's happening, let's learn how to measure quality with automated evaluation!