autoevals Integration
autoevals is a library of evaluators from Braintrust. This guide shows how to use autoevals evaluators as output guardrails for semantic validation.
Installation
Section titled “Installation”
pip install pydantic-ai-guardrails autoevals
Quick Start
Section titled “Quick Start”import asynciofrom pydantic_ai import Agentfrom pydantic_ai_guardrails import GuardedAgent, GuardrailResult, OutputGuardrail
def factuality_guardrail(threshold: float = 0.7) -> OutputGuardrail:
    """Create a factuality guardrail using autoevals.

    Args:
        threshold: Minimum acceptable factuality score. Outputs scoring
            below this value trip the guardrail.

    Returns:
        An ``OutputGuardrail`` named ``'autoevals.factuality'`` wrapping the
        async validator defined below.
    """
    from autoevals.llm import Factuality

    evaluator = Factuality()

    async def _validate(output: str, **kwargs) -> GuardrailResult:
        # The evaluator call is synchronous, so run it in a worker thread to
        # keep the event loop free. autoevals scorers accept only ``output``
        # and ``expected`` positionally; the question has to be supplied as
        # the ``input`` keyword.
        result = await asyncio.to_thread(
            evaluator,
            output,
            None,
            input=kwargs.get('input_context'),
        )

        if result.score < threshold:
            return {
                'tripwire_triggered': True,
                'message': f'Factuality score {result.score:.2f} < {threshold}',
                'severity': 'high',
                'suggestion': f'Improve accuracy. {result.metadata.get("rationale", "")}',
            }
        return {'tripwire_triggered': False}

    return OutputGuardrail(_validate, name='autoevals.factuality')
# Wrap the agent so its output is passed through the factuality guardrail.
guarded_agent = GuardedAgent(
    Agent('openai:gpt-4o'),
    output_guardrails=[factuality_guardrail(threshold=0.7)],
    max_retries=2,
)

Available Evaluators
Section titled “Available Evaluators”
autoevals provides many evaluators you can wrap:
LLM-Based Evaluators
Section titled “LLM-Based Evaluators”

| Evaluator | Description |
|---|---|
| Factuality | Checks factual consistency |
| ClosedQA | Evaluates answer correctness |
| Battle | Compares two responses |
| Summary | Evaluates summary quality |
Semantic Evaluators
Section titled “Semantic Evaluators”

| Evaluator | Description |
|---|---|
| Moderation | Content moderation (OpenAI) |
| Humor | Evaluates humor |
| Security | Security analysis |
String Matching
Section titled “String Matching”

| Evaluator | Description |
|---|---|
| Levenshtein | Edit distance |
| ExactMatch | Exact string match |
Evaluator Wrappers
Section titled “Evaluator Wrappers”
Factuality Guardrail
Section titled “Factuality Guardrail”
Verify responses are factually consistent:
from autoevals.llm import Factuality
def factuality_guardrail( threshold: float = 0.7, model: str = 'gpt-4-turbo-preview',) -> OutputGuardrail: evaluator = Factuality(model=model)
async def _validate( output: str, *, expected: str | None = None, input_context: str | None = None, **kwargs, ) -> GuardrailResult: loop = asyncio.get_event_loop() result = await loop.run_in_executor( None, evaluator, output, expected, input_context )
if result.score < threshold: return { 'tripwire_triggered': True, 'message': f'Factuality: {result.score:.2f} < {threshold}', 'severity': 'high', 'suggestion': result.metadata.get('rationale', 'Improve factual accuracy'), 'metadata': { 'score': result.score, 'rationale': result.metadata.get('rationale'), }, } return {'tripwire_triggered': False}
return OutputGuardrail(_validate, name='factuality')Moderation Guardrail
Section titled “Moderation Guardrail”
Use OpenAI’s moderation API:
from autoevals.moderation import Moderation
def moderation_guardrail() -> OutputGuardrail: evaluator = Moderation()
async def _validate(output: str, **kwargs) -> GuardrailResult: loop = asyncio.get_event_loop() result = await loop.run_in_executor(None, evaluator, output)
if result.score < 1.0: # Any flagged content return { 'tripwire_triggered': True, 'message': 'Content flagged by moderation', 'severity': 'critical', 'metadata': result.metadata, } return {'tripwire_triggered': False}
return OutputGuardrail(_validate, name='moderation')ClosedQA Guardrail
Section titled “ClosedQA Guardrail”
Validate answers against known correct answers:
from autoevals.llm import ClosedQA
def answer_correctness_guardrail( expected_answer: str, threshold: float = 0.8,) -> OutputGuardrail: evaluator = ClosedQA()
async def _validate(output: str, **kwargs) -> GuardrailResult: input_context = kwargs.get('input_context', '') loop = asyncio.get_event_loop() result = await loop.run_in_executor( None, evaluator, output, expected_answer, input_context )
if result.score < threshold: return { 'tripwire_triggered': True, 'message': f'Answer correctness: {result.score:.2f}', 'severity': 'medium', 'suggestion': 'Align response with expected answer', } return {'tripwire_triggered': False}
return OutputGuardrail(_validate, name='answer_correctness')Using with Ollama
Section titled “Using with Ollama”
autoevals supports custom models, including local Ollama:
import os

# Point the OpenAI-compatible client at a server on localhost:11434.
# NOTE(review): confirm the installed client reads OPENAI_API_BASE; the
# OpenAI Python client v1+ reads OPENAI_BASE_URL instead.
os.environ['OPENAI_API_BASE'] = 'http://localhost:11434/v1'
# presumably a placeholder key for the local endpoint -- verify
os.environ['OPENAI_API_KEY'] = 'ollama'

from autoevals.llm import Factuality

# Uses local Ollama model
evaluator = Factuality(model='llama3')

Complete Example
Section titled “Complete Example”import asynciofrom pydantic_ai import Agentfrom autoevals.llm import Factualityfrom autoevals.moderation import Moderation
from pydantic_ai_guardrails import ( GuardedAgent, GuardrailResult, OutputGuardrail, OutputGuardrailViolation,)
def factuality_guard(threshold: float = 0.7) -> OutputGuardrail:
    """Build an output guardrail backed by the autoevals ``Factuality`` scorer.

    Args:
        threshold: Minimum acceptable score. Outputs scoring below this
            value trip the guardrail.

    Returns:
        An ``OutputGuardrail`` named ``'factuality'``.
    """
    evaluator = Factuality()

    async def validate(output: str, **kwargs) -> GuardrailResult:
        # The scorer call is synchronous, so run it in a worker thread.
        # autoevals scorers accept only ``output`` and ``expected``
        # positionally; the question must be passed as ``input=``.
        result = await asyncio.to_thread(
            evaluator,
            output,
            None,
            input=kwargs.get('input_context'),
        )

        if result.score < threshold:
            return {
                'tripwire_triggered': True,
                'message': f'Low factuality: {result.score:.2f}',
                'severity': 'high',
                'suggestion': result.metadata.get('rationale', ''),
            }
        return {'tripwire_triggered': False}

    return OutputGuardrail(validate, name='factuality')
def moderation_guard() -> OutputGuardrail:
    """Build an output guardrail backed by the autoevals ``Moderation`` scorer.

    Returns:
        An ``OutputGuardrail`` named ``'moderation'`` that trips whenever the
        moderation score is below 1.0.
    """
    evaluator = Moderation()

    async def validate(output: str, **kwargs) -> GuardrailResult:
        # The scorer call is synchronous; asyncio.to_thread runs it in a
        # worker thread without needing asyncio.get_event_loop(), which the
        # asyncio docs discourage inside coroutines.
        result = await asyncio.to_thread(evaluator, output)

        if result.score < 1.0:
            return {
                'tripwire_triggered': True,
                'message': 'Moderation flagged',
                'severity': 'critical',
            }
        return {'tripwire_triggered': False}

    return OutputGuardrail(validate, name='moderation')
async def main():
    """Run two sample queries through a guarded agent and print the outcome.

    Each query prints either the agent's response or, when an output
    guardrail raises ``OutputGuardrailViolation``, the violation message.
    """
    guarded_agent = GuardedAgent(
        Agent(
            'openai:gpt-4o',
            system_prompt='Answer questions accurately and helpfully.',
        ),
        output_guardrails=[
            factuality_guard(threshold=0.7),
            moderation_guard(),
        ],
        max_retries=2,  # Auto-retry on low factuality
    )

    sample_questions = (
        'What year did World War II end?',
        'Who was the first person on Mars?',  # May fail factuality
    )

    for question in sample_questions:
        print(f'\nQuery: {question}')
        try:
            outcome = await guarded_agent.run(question)
        except OutputGuardrailViolation as violation:
            print(f'Blocked: {violation.result.get("message")}')
        else:
            print(f'Response: {outcome.output}')
# Run the demo only when this file is executed as a script.
if __name__ == '__main__':
    asyncio.run(main())

RAG Evaluation
Section titled “RAG Evaluation”
Use autoevals for RAG (Retrieval-Augmented Generation) quality:
from autoevals.ragas import ( AnswerRelevancy, ContextPrecision, ContextRecall, Faithfulness,)
def rag_faithfulness_guardrail(threshold: float = 0.7) -> OutputGuardrail: """Ensure response is faithful to retrieved context.""" evaluator = Faithfulness()
async def _validate(output: str, **kwargs) -> GuardrailResult: context = kwargs.get('context', []) question = kwargs.get('input_context', '')
loop = asyncio.get_event_loop() result = await loop.run_in_executor( None, evaluator, output, # answer None, # expected (not used) { 'input': question, 'context': context, }, )
if result.score < threshold: return { 'tripwire_triggered': True, 'message': f'Faithfulness: {result.score:.2f} < {threshold}', 'severity': 'high', 'suggestion': 'Response should be grounded in the provided context', } return {'tripwire_triggered': False}
return OutputGuardrail(_validate, name='rag_faithfulness')Comparison: autoevals vs Built-in LLM Judge
Section titled “Comparison: autoevals vs Built-in LLM Judge”

| Feature | autoevals | Built-in LLM Judge |
|---|---|---|
| Evaluator variety | Many specialized | General-purpose |
| RAG support | Yes (RAGAS) | No |
| Factuality | Specialized | Via rubric |
| Local models | Yes (Ollama) | Yes |
| Setup | Separate install | Built-in |