Module livekit.agents.evals.evaluation
Classes
class EvaluationResult (judgments: dict[str, JudgmentResult] = &lt;factory&gt;)

```python
@dataclass
class EvaluationResult:
    """Result of evaluating a conversation with a group of judges."""

    judgments: dict[str, JudgmentResult] = field(default_factory=dict)
    """Individual judgment results keyed by judge name."""

    @property
    def score(self) -> float:
        """Score from 0.0 to 1.0. Pass=1, maybe=0.5, fail=0."""
        if not self.judgments:
            return 0.0
        total = 0.0
        for j in self.judgments.values():
            if j.passed:
                total += 1.0
            elif j.uncertain:
                total += 0.5
        return total / len(self.judgments)

    @property
    def all_passed(self) -> bool:
        """True if all judgments passed. Maybes count as not passed."""
        return all(j.passed for j in self.judgments.values())

    @property
    def any_passed(self) -> bool:
        """True if at least one judgment passed."""
        return any(j.passed for j in self.judgments.values())

    @property
    def majority_passed(self) -> bool:
        """True if more than half of the judgments passed."""
        if not self.judgments:
            return True
        passed = sum(1 for j in self.judgments.values() if j.passed)
        return passed > len(self.judgments) / 2

    @property
    def none_failed(self) -> bool:
        """True if no judgments explicitly failed. Maybes are allowed."""
        return not any(j.failed for j in self.judgments.values())
```

Result of evaluating a conversation with a group of judges.
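Since each judge is weighted equally, the aggregation can be exercised without any LLM. A minimal sketch: `JudgmentResult`'s constructor is defined elsewhere in `livekit.agents.evals`, so the stand-in below is hypothetical and only models the `passed`/`uncertain`/`failed` properties the aggregation actually reads:

```python
from dataclasses import dataclass

from livekit.agents.evals.evaluation import EvaluationResult


@dataclass
class FakeJudgment:
    # Hypothetical stand-in for JudgmentResult: only the boolean
    # properties read by EvaluationResult are modeled here.
    passed: bool = False
    uncertain: bool = False
    failed: bool = False


result = EvaluationResult(
    judgments={
        "task_completion": FakeJudgment(passed=True),
        "accuracy": FakeJudgment(uncertain=True),  # a "maybe"
        "tone": FakeJudgment(failed=True),
    }
)

print(result.score)        # (1.0 + 0.5 + 0.0) / 3 = 0.5
print(result.all_passed)   # False: the maybe counts as not passed
print(result.any_passed)   # True: task_completion passed
print(result.none_failed)  # False: tone explicitly failed
```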
Instance variables
prop all_passed : bool

```python
@property
def all_passed(self) -> bool:
    """True if all judgments passed. Maybes count as not passed."""
    return all(j.passed for j in self.judgments.values())
```

True if all judgments passed. Maybes count as not passed.
prop any_passed : bool

```python
@property
def any_passed(self) -> bool:
    """True if at least one judgment passed."""
    return any(j.passed for j in self.judgments.values())
```

True if at least one judgment passed.
var judgments : dict[str, JudgmentResult]
Individual judgment results keyed by judge name.
prop majority_passed : bool

```python
@property
def majority_passed(self) -> bool:
    """True if more than half of the judgments passed."""
    if not self.judgments:
        return True
    passed = sum(1 for j in self.judgments.values() if j.passed)
    return passed > len(self.judgments) / 2
```

True if more than half of the judgments passed.
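For example, with three judges where two pass and one is uncertain, `majority_passed` is True (2 &gt; 1.5); with one pass and two maybes it is False (1 &gt; 1.5 fails), even though `score` would still be 2/3.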
prop none_failed : bool

```python
@property
def none_failed(self) -> bool:
    """True if no judgments explicitly failed. Maybes are allowed."""
    return not any(j.failed for j in self.judgments.values())
```

True if no judgments explicitly failed. Maybes are allowed.
prop score : float

```python
@property
def score(self) -> float:
    """Score from 0.0 to 1.0. Pass=1, maybe=0.5, fail=0."""
    if not self.judgments:
        return 0.0
    total = 0.0
    for j in self.judgments.values():
        if j.passed:
            total += 1.0
        elif j.uncertain:
            total += 0.5
    return total / len(self.judgments)
```

Score from 0.0 to 1.0. Pass=1, maybe=0.5, fail=0.
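As a worked example: with four judges where two pass, one is uncertain, and one fails, the score is (1.0 + 1.0 + 0.5 + 0.0) / 4 = 0.625.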
class Evaluator (*args, **kwargs)

```python
class Evaluator(Protocol):
    """Protocol for any object that can evaluate a conversation."""

    @property
    def name(self) -> str:
        """Name identifying this evaluator."""
        ...

    async def evaluate(
        self,
        *,
        chat_ctx: ChatContext,
        reference: ChatContext | None = None,
        llm: LLM | None = None,
    ) -> JudgmentResult: ...
```

Protocol for any object that can evaluate a conversation.
Ancestors
- typing.Protocol
- typing.Generic
Instance variables
prop name : str

```python
@property
def name(self) -> str:
    """Name identifying this evaluator."""
    ...
```

Name identifying this evaluator.
Methods
async def evaluate(self, *, chat_ctx: ChatContext, reference: ChatContext | None = None, llm: LLM | None = None) -> JudgmentResult

```python
async def evaluate(
    self,
    *,
    chat_ctx: ChatContext,
    reference: ChatContext | None = None,
    llm: LLM | None = None,
) -> JudgmentResult: ...
```
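Because `Evaluator` is a `typing.Protocol`, any object with a matching `name` property and `evaluate` coroutine satisfies it; subclassing is not required. A hypothetical sketch of a custom judge follows: the heuristic, the import path for `JudgmentResult`, and its constructor fields are all assumptions, since the exact `JudgmentResult` definition is not part of this module's documentation.

```python
from livekit.agents.evals import JudgmentResult  # import path assumed
from livekit.agents.llm import LLM, ChatContext


class LengthJudge:
    """Hypothetical judge that passes any non-empty conversation."""

    @property
    def name(self) -> str:
        return "length"

    async def evaluate(
        self,
        *,
        chat_ctx: ChatContext,
        reference: ChatContext | None = None,
        llm: LLM | None = None,
    ) -> JudgmentResult:
        ok = len(chat_ctx.items) > 0  # assumes ChatContext exposes .items
        return JudgmentResult(  # field names are assumed, not documented here
            verdict="pass" if ok else "fail",
            reasoning="conversation is non-empty" if ok else "empty conversation",
        )
```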
class JudgeGroup (*, llm: LLM | LLMModels | str, judges: list[Evaluator] | None = None)

````python
class JudgeGroup:
    """A group of judges that evaluate conversations together.

    Automatically tags the session with judgment results when called
    within a job context.

    Example:
        ```python
        async def on_session_end(ctx: JobContext) -> None:
            judges = JudgeGroup(
                llm="openai/gpt-4o-mini",
                judges=[
                    task_completion_judge(),
                    accuracy_judge(),
                ],
            )
            report = ctx.make_session_report()
            result = await judges.evaluate(report.chat_history)
            # Results are automatically tagged to the session
        ```
    """

    def __init__(
        self,
        *,
        llm: LLM | LLMModels | str,
        judges: list[Evaluator] | None = None,
    ) -> None:
        """Initialize a JudgeGroup.

        Args:
            llm: The LLM to use for evaluation. Can be an LLM instance or a
                model string like "openai/gpt-4o-mini" (uses LiveKit
                inference gateway).
            judges: The judges to run during evaluation.
        """
        if isinstance(llm, str):
            from ..inference import LLM as InferenceLLM

            self._llm: LLM = InferenceLLM(llm)
        else:
            self._llm = llm
        self._judges = judges or []

    @property
    def llm(self) -> LLM:
        """The LLM used for evaluation."""
        return self._llm

    @property
    def judges(self) -> list[Evaluator]:
        """The judges to run during evaluation."""
        return self._judges

    async def evaluate(
        self,
        chat_ctx: ChatContext,
        *,
        reference: ChatContext | None = None,
    ) -> EvaluationResult:
        """Evaluate a conversation with all judges.

        Automatically tags the session with results when called within a
        job context.

        Args:
            chat_ctx: The conversation to evaluate.
            reference: Optional reference conversation for comparison.

        Returns:
            EvaluationResult containing all judgment results.
        """
        from ..job import get_job_context
        from ..log import logger

        # Run all judges concurrently
        async def run_judge(judge: Evaluator) -> tuple[str, JudgmentResult | BaseException]:
            try:
                result = await judge.evaluate(
                    chat_ctx=chat_ctx,
                    reference=reference,
                    llm=self._llm,
                )
                return judge.name, result
            except Exception as e:
                logger.warning(f"Judge '{judge.name}' failed: {e}")
                return judge.name, e

        results = await asyncio.gather(*[run_judge(j) for j in self._judges])

        # Filter out failed judges
        judgments: dict[str, JudgmentResult] = {}
        for name, result in results:
            if isinstance(result, JudgmentResult):
                judgments[name] = result

        evaluation_result = EvaluationResult(judgments=judgments)

        if _evals_verbose:
            print("\n+ JudgeGroup evaluation results:")
            for name, result in results:
                if isinstance(result, JudgmentResult):
                    print(f"  [{name}] verdict={result.verdict}")
                    print(f"    reasoning: {result.reasoning}\n")
                else:
                    print(f"  [{name}] ERROR: {result}\n")

        # Auto-tag if running within a job context
        try:
            ctx = get_job_context()
            ctx.tagger._evaluation(evaluation_result)
        except RuntimeError:
            pass  # Not in a job context, skip tagging

        return evaluation_result
````

A group of judges that evaluate conversations together.
Automatically tags the session with judgment results when called within a job context.
Example
```python
async def on_session_end(ctx: JobContext) -> None:
    judges = JudgeGroup(
        llm="openai/gpt-4o-mini",
        judges=[
            task_completion_judge(),
            accuracy_judge(),
        ],
    )
    report = ctx.make_session_report()
    result = await judges.evaluate(report.chat_history)
    # Results are automatically tagged to the session
```

Initialize a JudgeGroup.
Args
llm: The LLM to use for evaluation. Can be an LLM instance or a model string like "openai/gpt-4o-mini" (uses LiveKit inference gateway).
judges: The judges to run during evaluation.
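Continuing under the same assumptions, a sketch of building a group around a duck-typed judge and evaluating a hand-built conversation outside a job context. The model id and the hypothetical `LengthJudge` from the earlier sketch are illustrative, and `ChatContext.add_message` is assumed from the broader livekit-agents API:

```python
from livekit.agents.llm import ChatContext

chat_ctx = ChatContext()
chat_ctx.add_message(role="user", content="What's the capital of France?")
chat_ctx.add_message(role="assistant", content="Paris.")

group = JudgeGroup(
    llm="openai/gpt-4o-mini",  # illustrative model id; an LLM instance also works
    judges=[LengthJudge()],    # hypothetical judge from the sketch above
)
result = await group.evaluate(chat_ctx)  # run inside an async context
```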
Instance variables
prop judges : list[Evaluator]

```python
@property
def judges(self) -> list[Evaluator]:
    """The judges to run during evaluation."""
    return self._judges
```

The judges to run during evaluation.
prop llm : LLM

```python
@property
def llm(self) -> LLM:
    """The LLM used for evaluation."""
    return self._llm
```

The LLM used for evaluation.
Methods
async def evaluate(self, chat_ctx: ChatContext, *, reference: ChatContext | None = None) -> EvaluationResult

```python
async def evaluate(
    self,
    chat_ctx: ChatContext,
    *,
    reference: ChatContext | None = None,
) -> EvaluationResult:
    """Evaluate a conversation with all judges.

    Automatically tags the session with results when called within a
    job context.

    Args:
        chat_ctx: The conversation to evaluate.
        reference: Optional reference conversation for comparison.

    Returns:
        EvaluationResult containing all judgment results.
    """
    from ..job import get_job_context
    from ..log import logger

    # Run all judges concurrently
    async def run_judge(judge: Evaluator) -> tuple[str, JudgmentResult | BaseException]:
        try:
            result = await judge.evaluate(
                chat_ctx=chat_ctx,
                reference=reference,
                llm=self._llm,
            )
            return judge.name, result
        except Exception as e:
            logger.warning(f"Judge '{judge.name}' failed: {e}")
            return judge.name, e

    results = await asyncio.gather(*[run_judge(j) for j in self._judges])

    # Filter out failed judges
    judgments: dict[str, JudgmentResult] = {}
    for name, result in results:
        if isinstance(result, JudgmentResult):
            judgments[name] = result

    evaluation_result = EvaluationResult(judgments=judgments)

    if _evals_verbose:
        print("\n+ JudgeGroup evaluation results:")
        for name, result in results:
            if isinstance(result, JudgmentResult):
                print(f"  [{name}] verdict={result.verdict}")
                print(f"    reasoning: {result.reasoning}\n")
            else:
                print(f"  [{name}] ERROR: {result}\n")

    # Auto-tag if running within a job context
    try:
        ctx = get_job_context()
        ctx.tagger._evaluation(evaluation_result)
    except RuntimeError:
        pass  # Not in a job context, skip tagging

    return evaluation_result
```

Evaluate a conversation with all judges.
Automatically tags the session with results when called within a job context.
Args
chat_ctx: The conversation to evaluate.
reference: Optional reference conversation for comparison.
Returns
EvaluationResult containing all judgment results.
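Continuing the sketch above, the returned `EvaluationResult` can be gated on whichever aggregate fits the use case; the 0.8 threshold here is an arbitrary example:

```python
result = await group.evaluate(chat_ctx)

if not result.none_failed:
    # At least one judge returned an explicit fail; surface its reasoning.
    for name, judgment in result.judgments.items():
        if judgment.failed:
            print(f"{name} failed: {judgment.reasoning}")
elif result.score < 0.8:  # arbitrary example threshold
    print(f"low confidence: score={result.score:.2f}")
```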