Phase 5: Evaluation and Security · 7 min read

Evaluating Agent Trajectories

Phase 5 of 8

Did your agent take the best path? Trajectory evaluation measures not just the final answer, but how the agent got there.

Coming from Software Engineering? Trajectory evaluation is algorithmic complexity analysis applied to agent behavior. Just as you'd evaluate whether an algorithm took O(n) or O(n²) steps, you're evaluating whether the agent found the answer efficiently or wandered. Think of it like profiling a slow endpoint — you trace the execution path and ask "were all these steps necessary, or did it make redundant calls?" The metrics (step efficiency, tool selection accuracy, goal convergence) are performance metrics for reasoning.


What is Trajectory Evaluation?

Trajectory evaluation asks:

  • Did the agent take unnecessary steps?
  • Were the steps in the right order?
  • Did each step contribute to the goal?
  • Was the path efficient?

Capturing Agent Trajectories

First, record what the agent does:

# script_id: day_061_trajectory_evaluation/trajectory_evaluator
from dataclasses import dataclass, field
from typing import List, Dict, Optional
from datetime import datetime
import json

@dataclass
class AgentStep:
    """One thought/action/observation record inside a trajectory."""
    step_number: int
    thought: str
    action: str
    action_input: Dict
    observation: str
    timestamp: datetime = field(default_factory=datetime.now)

@dataclass
class AgentTrajectory:
    """Full record of an agent run: the task, every step taken, and the outcome."""
    task: str
    steps: List[AgentStep] = field(default_factory=list)
    final_answer: str = ""
    success: bool = False
    start_time: datetime = field(default_factory=datetime.now)
    end_time: Optional[datetime] = None

    def add_step(self, thought: str, action: str, action_input: Dict, observation: str):
        """Append the next step; numbering is 1-based, derived from the list length."""
        self.steps.append(AgentStep(
            step_number=len(self.steps) + 1,
            thought=thought,
            action=action,
            action_input=action_input,
            observation=observation,
        ))

    def complete(self, final_answer: str, success: bool):
        """Record the outcome and stamp the finish time."""
        self.final_answer = final_answer
        self.success = success
        self.end_time = datetime.now()

    def to_dict(self) -> Dict:
        """Serialize to plain built-ins (step timestamps are omitted)."""
        serialized = []
        for s in self.steps:
            serialized.append({
                "step": s.step_number,
                "thought": s.thought,
                "action": s.action,
                "action_input": s.action_input,
                "observation": s.observation,
            })
        return {
            "task": self.task,
            "steps": serialized,
            "final_answer": self.final_answer,
            "success": self.success,
            "total_steps": len(serialized),
        }

# Usage example: a two-step run (search, then unit conversion).
trajectory = AgentTrajectory(task="Find the weather in Tokyo and convert to Celsius")

# Step 1: look up the raw weather (tool returns Fahrenheit).
trajectory.add_step(
    thought="I need to search for Tokyo weather",
    action="search",
    action_input={"query": "Tokyo weather"},
    observation="Weather in Tokyo: 72°F, sunny"
)

# Step 2: convert to Celsius via the calculator tool.
trajectory.add_step(
    thought="I need to convert 72°F to Celsius",
    action="calculate",
    action_input={"expression": "(72-32)*5/9"},
    observation="Result: 22.2"
)

# Close out the trajectory: records the final answer and stamps end_time.
trajectory.complete(
    final_answer="The weather in Tokyo is 22°C and sunny",
    success=True
)

Trajectory Evaluation Metrics

1. Step Efficiency

# script_id: day_061_trajectory_evaluation/trajectory_evaluator
def evaluate_efficiency(trajectory: AgentTrajectory, optimal_steps: int) -> Dict:
    """Score how efficiently the agent reached its answer.

    A trajectory at or under the optimal step count earns a perfect 1.0;
    longer paths are penalized proportionally (optimal / actual).

    Args:
        trajectory: Completed trajectory whose step count is measured.
        optimal_steps: Number of steps an ideal agent would need.

    Returns:
        Dict with the actual and optimal step counts, the efficiency
        score, and how many steps were taken beyond the optimum.
    """
    actual_steps = len(trajectory.steps)

    within_budget = actual_steps <= optimal_steps
    efficiency = 1.0 if within_budget else optimal_steps / actual_steps

    return {
        "actual_steps": actual_steps,
        "optimal_steps": optimal_steps,
        "efficiency_score": efficiency,
        "excess_steps": max(0, actual_steps - optimal_steps),
    }

# Usage: the Tokyo example above used exactly 2 steps, so this scores 100%.
result = evaluate_efficiency(trajectory, optimal_steps=2)
print(f"Efficiency: {result['efficiency_score']:.2%}")

2. Step Relevance

# script_id: day_061_trajectory_evaluation/trajectory_evaluator
from openai import OpenAI

# Module-level LLM-judge client shared by the evaluators below.
# NOTE(review): assumes API credentials are configured in the environment — confirm.
client = OpenAI()

def evaluate_step_relevance(step: AgentStep, task: str) -> Dict:
    """Evaluate if a step was relevant to the task.

    Uses gpt-4o as an LLM judge with JSON-object response format.

    Args:
        step: The trajectory step to judge.
        task: The task the agent was asked to accomplish.

    Returns:
        The parsed JSON object returned by the model; the prompt requests
        keys "relevance" (0-1) and "reasoning". NOTE(review): the schema is
        enforced only by the prompt, so the model could deviate from it.
    """

    prompt = f"""Evaluate if this agent step was relevant to accomplishing the task.

Task: {task}

Step:
- Thought: {step.thought}
- Action: {step.action}
- Input: {step.action_input}
- Result: {step.observation}

Rate the relevance from 0-1 where:
- 1.0 = Directly necessary for the task
- 0.5 = Somewhat useful
- 0.0 = Completely irrelevant

Return JSON: {{"relevance": 0.0-1.0, "reasoning": "..."}}"""

    # json_object mode guarantees syntactically valid JSON, not a specific schema.
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        response_format={"type": "json_object"}
    )

    return json.loads(response.choices[0].message.content)

def evaluate_all_steps_relevance(trajectory: AgentTrajectory) -> Dict:
    """Evaluate relevance of all steps in a trajectory.

    Calls evaluate_step_relevance (an LLM judge) once per step, then
    aggregates the scores.

    Args:
        trajectory: Trajectory whose steps are judged against its task.

    Returns:
        Dict with per-step scores, the mean relevance, and the subset of
        steps scoring below 0.5.
    """
    # Bug fix: an empty trajectory previously crashed with ZeroDivisionError
    # when computing the average; return a neutral, well-formed result instead.
    if not trajectory.steps:
        return {"step_scores": [], "average_relevance": 0.0, "irrelevant_steps": []}

    step_scores = []
    for step in trajectory.steps:
        result = evaluate_step_relevance(step, trajectory.task)
        step_scores.append({
            "step": step.step_number,
            "relevance": result["relevance"],
            "reasoning": result["reasoning"]
        })

    avg_relevance = sum(s["relevance"] for s in step_scores) / len(step_scores)

    return {
        "step_scores": step_scores,
        "average_relevance": avg_relevance,
        # Threshold of 0.5 matches the prompt's "somewhat useful" midpoint.
        "irrelevant_steps": [s for s in step_scores if s["relevance"] < 0.5]
    }

3. Action Correctness

# script_id: day_061_trajectory_evaluation/trajectory_evaluator
def evaluate_action_correctness(trajectory: AgentTrajectory) -> Dict:
    """Evaluate if actions were correct for the situation.

    Sends the whole trajectory (thoughts, actions, inputs, results) plus the
    final answer to gpt-4o in a single judging call.

    Args:
        trajectory: Completed trajectory to judge.

    Returns:
        Parsed JSON from the model; the prompt requests "step_evaluations",
        "overall_correctness" (0-1), and "critical_errors". NOTE(review):
        schema is prompt-enforced only — validate before trusting keys.
    """

    prompt = f"""Evaluate the correctness of each action in this agent trajectory.

Task: {trajectory.task}

Trajectory:
{json.dumps([{
    "thought": s.thought,
    "action": s.action,
    "input": s.action_input,
    "result": s.observation
} for s in trajectory.steps], indent=2)}

Final Answer: {trajectory.final_answer}

For each step, evaluate:
1. Was the action appropriate given the thought?
2. Was the action input correct?
3. Did the action produce expected results?

Return JSON:
{{
    "step_evaluations": [
        {{"step": 1, "correct": true/false, "issue": "..."}}
    ],
    "overall_correctness": 0.0-1.0,
    "critical_errors": ["..."]
}}"""

    # Uses the module-level client defined above.
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        response_format={"type": "json_object"}
    )

    return json.loads(response.choices[0].message.content)

4. Goal Achievement

# script_id: day_061_trajectory_evaluation/trajectory_evaluator
def evaluate_goal_achievement(trajectory: AgentTrajectory) -> Dict:
    """Evaluate if the agent achieved its goal.

    Judges only the task and final answer (not the intermediate steps)
    with a single gpt-4o call.

    Args:
        trajectory: Completed trajectory; only task and final_answer are used.

    Returns:
        Parsed JSON from the model; the prompt requests "addresses_task",
        "completeness", "correctness", "overall_achievement" (all 0-1) and
        "missing_elements". NOTE(review): schema is prompt-enforced only.
    """

    prompt = f"""Evaluate if this agent successfully achieved its goal.

Task: {trajectory.task}
Final Answer: {trajectory.final_answer}

Evaluate:
1. Does the final answer address the task? (0-1)
2. Is the answer complete? (0-1)
3. Is the answer correct? (0-1)

Return JSON:
{{
    "addresses_task": 0.0-1.0,
    "completeness": 0.0-1.0,
    "correctness": 0.0-1.0,
    "overall_achievement": 0.0-1.0,
    "missing_elements": ["..."]
}}"""

    # Uses the module-level client defined above.
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        response_format={"type": "json_object"}
    )

    return json.loads(response.choices[0].message.content)

Complete Trajectory Evaluator

# script_id: day_061_trajectory_evaluation/trajectory_evaluator
class TrajectoryEvaluator:
    """Comprehensive trajectory evaluation.

    Aggregates step efficiency, per-step relevance, action correctness,
    and goal achievement into one weighted score, and can render the
    results as a human-readable report.
    """

    def __init__(self):
        # NOTE(review): this client is currently unused — the metric helpers
        # (evaluate_step_relevance, etc.) use the module-level client. Kept
        # so any caller touching evaluator.client keeps working.
        self.client = OpenAI()

    def evaluate(self, trajectory: AgentTrajectory, optimal_steps: Optional[int] = None) -> Dict:
        """Run full trajectory evaluation.

        Args:
            trajectory: Completed trajectory to score.
            optimal_steps: Known ideal step count. When provided (including 0),
                an efficiency metric is added to the results.

        Returns:
            Dict with a trajectory_summary, one entry per metric computed,
            and an overall_score in [0, 1].
        """
        results = {
            "trajectory_summary": {
                "task": trajectory.task,
                "total_steps": len(trajectory.steps),
                "success": trajectory.success,
                # Truncate long answers so the summary stays compact.
                "final_answer": trajectory.final_answer[:200]
            }
        }

        # Efficiency. Bug fix: `if optimal_steps:` silently skipped the
        # metric when optimal_steps == 0; compare against None explicitly.
        if optimal_steps is not None:
            results["efficiency"] = evaluate_efficiency(trajectory, optimal_steps)

        # Step relevance (LLM judge, one call per step).
        results["relevance"] = evaluate_all_steps_relevance(trajectory)

        # Action correctness (LLM judge, single call over the whole trajectory).
        results["correctness"] = evaluate_action_correctness(trajectory)

        # Goal achievement (LLM judge on task + final answer only).
        results["achievement"] = evaluate_goal_achievement(trajectory)

        # Weighted overall score across whichever metrics are present.
        results["overall_score"] = self._calculate_overall_score(results)

        return results

    def _calculate_overall_score(self, results: Dict) -> float:
        """Weighted average over the metrics present in *results*.

        Weights: efficiency 0.2, relevance 0.2, correctness 0.3,
        achievement 0.3 — normalized by the sum of weights actually used,
        so missing metrics don't drag the score down. Returns 0.0 when no
        metric is present.
        """
        scores = []
        weights = []

        if "efficiency" in results:
            scores.append(results["efficiency"]["efficiency_score"])
            weights.append(0.2)

        if "relevance" in results:
            scores.append(results["relevance"]["average_relevance"])
            weights.append(0.2)

        if "correctness" in results:
            scores.append(results["correctness"]["overall_correctness"])
            weights.append(0.3)

        if "achievement" in results:
            scores.append(results["achievement"]["overall_achievement"])
            weights.append(0.3)

        if not scores:
            return 0.0

        return sum(s * w for s, w in zip(scores, weights)) / sum(weights)

    def format_report(self, results: Dict) -> str:
        """Format evaluation results as a readable report.

        Sections are emitted only for metrics present in *results*;
        `trajectory_summary` and `overall_score` are required keys.
        """
        report = f"""
╔══════════════════════════════════════════════════════════════╗
║                 TRAJECTORY EVALUATION REPORT                  ║
╚══════════════════════════════════════════════════════════════╝

📋 Task: {results['trajectory_summary']['task']}

📊 OVERALL SCORE: {results['overall_score']:.1%}

───────────────────────────────────────────────────────────────
📈 METRICS BREAKDOWN
───────────────────────────────────────────────────────────────
"""

        if "efficiency" in results:
            eff = results["efficiency"]
            report += f"""
⚡ Efficiency: {eff['efficiency_score']:.1%}
   • Steps taken: {eff['actual_steps']}
   • Optimal steps: {eff['optimal_steps']}
   • Excess steps: {eff['excess_steps']}
"""

        if "relevance" in results:
            rel = results["relevance"]
            report += f"""
🎯 Relevance: {rel['average_relevance']:.1%}
   • Irrelevant steps: {len(rel['irrelevant_steps'])}
"""

        if "correctness" in results:
            cor = results["correctness"]
            report += f"""
✅ Correctness: {cor['overall_correctness']:.1%}
   • Critical errors: {len(cor.get('critical_errors', []))}
"""

        if "achievement" in results:
            ach = results["achievement"]
            report += f"""
🏆 Goal Achievement: {ach['overall_achievement']:.1%}
   • Addresses task: {ach['addresses_task']:.1%}
   • Completeness: {ach['completeness']:.1%}
   • Correctness: {ach['correctness']:.1%}
"""

        return report

# Usage: score the Tokyo-weather trajectory captured above (2 steps, optimal is 2).
evaluator = TrajectoryEvaluator()
results = evaluator.evaluate(trajectory, optimal_steps=2)
print(evaluator.format_report(results))

Comparing Trajectories

Compare different agent approaches:

# script_id: day_061_trajectory_evaluation/trajectory_evaluator
def compare_trajectories(trajectories: List[AgentTrajectory], optimal_steps: int) -> Dict:
    """Compare multiple trajectories for the same task.

    Args:
        trajectories: One or more completed trajectories to score.
        optimal_steps: Ideal step count used for the efficiency metric.

    Returns:
        Dict with per-trajectory comparisons, the 1-based id and score of
        the best trajectory, and the average score across all of them.

    Raises:
        ValueError: If *trajectories* is empty (previously this crashed
            with an opaque max()/ZeroDivisionError).
    """
    if not trajectories:
        raise ValueError("compare_trajectories requires at least one trajectory")

    evaluator = TrajectoryEvaluator()
    evaluations = []

    for i, traj in enumerate(trajectories):
        eval_result = evaluator.evaluate(traj, optimal_steps)
        evaluations.append({
            "trajectory_id": i + 1,  # 1-based id for reporting
            "steps": len(traj.steps),
            "overall_score": eval_result["overall_score"],
            "success": traj.success
        })

    # The winner is simply the highest overall score.
    best = max(evaluations, key=lambda x: x["overall_score"])

    return {
        "comparisons": evaluations,
        "best_trajectory": best["trajectory_id"],
        "best_score": best["overall_score"],
        "average_score": sum(e["overall_score"] for e in evaluations) / len(evaluations)
    }

Summary

Trajectory evaluation looks beyond the final answer to how the agent got there: capture each thought/action/observation step, score efficiency against an optimal step count, per-step relevance, action correctness, and goal achievement, then combine them into a weighted overall score you can compare across runs.

Quick Reference

# script_id: day_061_trajectory_evaluation/trajectory_evaluator
# Capture trajectory
trajectory = AgentTrajectory(task="...")
trajectory.add_step(thought="...", action="...", action_input={}, observation="...")
trajectory.complete(final_answer="...", success=True)

# Evaluate
evaluator = TrajectoryEvaluator()
results = evaluator.evaluate(trajectory, optimal_steps=3)

# Key metrics
# NOTE: results["efficiency"] exists only when optimal_steps was passed to evaluate().
print(f"Efficiency: {results['efficiency']['efficiency_score']:.1%}")
print(f"Relevance: {results['relevance']['average_relevance']:.1%}")
print(f"Overall: {results['overall_score']:.1%}")

What's Next?

Now let's learn about Security & Guardrails - protecting your agents from attacks and misuse!