Skip to content

autoevals Integration

autoevals is a library of evaluators from Braintrust. This guide shows how to use autoevals evaluators as output guardrails for semantic validation.

Install both packages from a terminal:
pip install pydantic-ai-guardrails autoevals
import asyncio
from pydantic_ai import Agent
from pydantic_ai_guardrails import GuardedAgent, GuardrailResult, OutputGuardrail
def factuality_guardrail(threshold: float = 0.7) -> OutputGuardrail:
    """Create a factuality guardrail using autoevals.

    Args:
        threshold: Minimum acceptable factuality score. An output whose
            score is strictly below this value trips the guardrail.

    Returns:
        An ``OutputGuardrail`` named ``'autoevals.factuality'`` wrapping
        the async validator defined below.
    """
    from autoevals.llm import Factuality

    evaluator = Factuality()

    async def _validate(output: str, **kwargs) -> GuardrailResult:
        # autoevals scorers take only ``output`` and ``expected``
        # positionally; the question has to be passed as ``input=``.
        # ``eval_async`` is awaited directly, so no executor thread or
        # event-loop lookup is needed.
        # ``input_context`` is presumably the user's prompt supplied by the
        # guardrail runner -- confirm against pydantic-ai-guardrails.
        result = await evaluator.eval_async(
            output,
            None,
            input=kwargs.get('input_context'),
        )
        if result.score < threshold:
            return {
                'tripwire_triggered': True,
                'message': f'Factuality score {result.score:.2f} < {threshold}',
                'severity': 'high',
                'suggestion': f'Improve accuracy. {result.metadata.get("rationale", "")}',
            }
        return {'tripwire_triggered': False}

    return OutputGuardrail(_validate, name='autoevals.factuality')
# Wrap a GPT-4o agent so every response is checked by the factuality
# guardrail (threshold 0.7); max_retries is set to 2.
guarded_agent = GuardedAgent(
    Agent('openai:gpt-4o'),
    max_retries=2,
    output_guardrails=[factuality_guardrail(threshold=0.7)],
)

autoevals provides many evaluators you can wrap:

| Evaluator | Description |
| --- | --- |
| Factuality | Checks factual consistency |
| ClosedQA | Evaluates answer correctness |
| Battle | Compares two responses |
| Summary | Evaluates summary quality |

| Evaluator | Description |
| --- | --- |
| Moderation | Content moderation (OpenAI) |
| Humor | Evaluates humor |
| Security | Security analysis |

| Evaluator | Description |
| --- | --- |
| Levenshtein | Edit distance |
| ExactMatch | Exact string match |

Verify responses are factually consistent:

from autoevals.llm import Factuality
def factuality_guardrail(
    threshold: float = 0.7,
    model: str = 'gpt-4-turbo-preview',
) -> OutputGuardrail:
    """Build a guardrail that rejects outputs with a low factuality score.

    Args:
        threshold: Minimum acceptable score. An output scoring strictly
            below this value trips the guardrail.
        model: Model name forwarded to the autoevals ``Factuality`` scorer.

    Returns:
        An ``OutputGuardrail`` named ``'factuality'``.
    """
    evaluator = Factuality(model=model)

    async def _validate(
        output: str,
        *,
        expected: str | None = None,
        input_context: str | None = None,
        **kwargs,
    ) -> GuardrailResult:
        # autoevals scorers take only ``output`` and ``expected``
        # positionally; the question has to be passed as ``input=``.
        # ``eval_async`` is awaited directly, so no executor thread or
        # event-loop lookup is needed.
        result = await evaluator.eval_async(
            output,
            expected,
            input=input_context,
        )
        if result.score < threshold:
            rationale = result.metadata.get('rationale')
            return {
                'tripwire_triggered': True,
                'message': f'Factuality: {result.score:.2f} < {threshold}',
                'severity': 'high',
                'suggestion': result.metadata.get('rationale', 'Improve factual accuracy'),
                # Expose the raw score and rationale for callers that log
                # or inspect guardrail results.
                'metadata': {
                    'score': result.score,
                    'rationale': rationale,
                },
            }
        return {'tripwire_triggered': False}

    return OutputGuardrail(_validate, name='factuality')

Use OpenAI’s moderation API:

from autoevals.moderation import Moderation
def moderation_guardrail() -> OutputGuardrail:
    """Build a guardrail that blocks outputs flagged by autoevals ``Moderation``.

    Returns:
        An ``OutputGuardrail`` named ``'moderation'``. It trips with
        severity ``'critical'`` whenever the moderation score is below 1.0,
        and forwards the scorer's metadata in the result.
    """
    evaluator = Moderation()

    async def _validate(output: str, **kwargs) -> GuardrailResult:
        # Await the scorer's coroutine API directly instead of calling
        # asyncio.get_event_loop() (discouraged inside coroutines) and
        # pushing the synchronous call onto the default thread pool.
        result = await evaluator.eval_async(output)
        if result.score < 1.0:  # Any flagged content
            return {
                'tripwire_triggered': True,
                'message': 'Content flagged by moderation',
                'severity': 'critical',
                'metadata': result.metadata,
            }
        return {'tripwire_triggered': False}

    return OutputGuardrail(_validate, name='moderation')

Validate answers against known correct answers:

from autoevals.llm import ClosedQA
def answer_correctness_guardrail(
    expected_answer: str,
    threshold: float = 0.8,
) -> OutputGuardrail:
    """Build a guardrail that scores outputs with autoevals ``ClosedQA``.

    Args:
        expected_answer: Reference answer handed to the scorer as
            ``expected``.
        threshold: Minimum acceptable score. An output scoring strictly
            below this value trips the guardrail.

    Returns:
        An ``OutputGuardrail`` named ``'answer_correctness'``.
    """
    evaluator = ClosedQA()

    async def _validate(output: str, **kwargs) -> GuardrailResult:
        input_context = kwargs.get('input_context', '')
        # autoevals scorers take only ``output`` and ``expected``
        # positionally; the question has to be passed as ``input=``.
        # NOTE(review): ClosedQA's prompt appears to grade against a
        # ``criteria`` argument rather than ``expected`` -- confirm that the
        # expected answer is actually consulted by the scorer.
        result = await evaluator.eval_async(
            output,
            expected_answer,
            input=input_context,
        )
        if result.score < threshold:
            return {
                'tripwire_triggered': True,
                'message': f'Answer correctness: {result.score:.2f}',
                'severity': 'medium',
                'suggestion': 'Align response with expected answer',
            }
        return {'tripwire_triggered': False}

    return OutputGuardrail(_validate, name='answer_correctness')

autoevals supports custom models, including local Ollama:

import os

# Point the OpenAI-compatible client at the local Ollama endpoint.
# OPENAI_BASE_URL is the variable read by openai>=1.0; OPENAI_API_BASE is
# the pre-1.0 name and is kept so older client versions still pick it up.
os.environ['OPENAI_API_BASE'] = 'http://localhost:11434/v1'
os.environ['OPENAI_BASE_URL'] = os.environ['OPENAI_API_BASE']
os.environ['OPENAI_API_KEY'] = 'ollama'

# Imported after the environment is configured, as in the original snippet.
from autoevals.llm import Factuality

# Uses local Ollama model
evaluator = Factuality(model='llama3')
import asyncio
from pydantic_ai import Agent
from autoevals.llm import Factuality
from autoevals.moderation import Moderation
from pydantic_ai_guardrails import (
GuardedAgent,
GuardrailResult,
OutputGuardrail,
OutputGuardrailViolation,
)
def factuality_guard(threshold: float = 0.7) -> OutputGuardrail:
    """Build a guardrail that rejects outputs with a low factuality score.

    Args:
        threshold: Minimum acceptable score. An output scoring strictly
            below this value trips the guardrail.

    Returns:
        An ``OutputGuardrail`` named ``'factuality'``.
    """
    evaluator = Factuality()

    async def validate(output: str, **kwargs) -> GuardrailResult:
        # autoevals scorers take only ``output`` and ``expected``
        # positionally; the question has to be passed as ``input=``.
        # ``eval_async`` is awaited directly, so no executor thread or
        # event-loop lookup is needed.
        result = await evaluator.eval_async(
            output,
            None,
            input=kwargs.get('input_context'),
        )
        if result.score < threshold:
            return {
                'tripwire_triggered': True,
                'message': f'Low factuality: {result.score:.2f}',
                'severity': 'high',
                'suggestion': result.metadata.get('rationale', ''),
            }
        return {'tripwire_triggered': False}

    return OutputGuardrail(validate, name='factuality')
def moderation_guard() -> OutputGuardrail:
    """Build a guardrail that blocks outputs flagged by autoevals ``Moderation``.

    Returns:
        An ``OutputGuardrail`` named ``'moderation'`` that trips with
        severity ``'critical'`` whenever the moderation score is below 1.0.
    """
    evaluator = Moderation()

    async def validate(output: str, **kwargs) -> GuardrailResult:
        # Await the scorer's coroutine API directly instead of calling
        # asyncio.get_event_loop() (discouraged inside coroutines) and
        # pushing the synchronous call onto the default thread pool.
        result = await evaluator.eval_async(output)
        if result.score < 1.0:
            return {
                'tripwire_triggered': True,
                'message': 'Moderation flagged',
                'severity': 'critical',
            }
        return {'tripwire_triggered': False}

    return OutputGuardrail(validate, name='moderation')
async def main():
    """Send two sample queries through a guarded agent and print each outcome.

    A query whose output trips a guardrail is reported as ``Blocked`` with
    the violation message instead of a response.
    """
    base_agent = Agent(
        'openai:gpt-4o',
        system_prompt='Answer questions accurately and helpfully.',
    )
    guardrails = [
        factuality_guard(threshold=0.7),
        moderation_guard(),
    ]
    # max_retries=2: Auto-retry on low factuality
    guarded_agent = GuardedAgent(
        base_agent,
        output_guardrails=guardrails,
        max_retries=2,
    )

    sample_queries = (
        'What year did World War II end?',
        'Who was the first person on Mars?',  # May fail factuality
    )
    for query in sample_queries:
        print(f'\nQuery: {query}')
        try:
            outcome = await guarded_agent.run(query)
            print(f'Response: {outcome.output}')
        except OutputGuardrailViolation as violation:
            print(f'Blocked: {violation.result.get("message")}')
# Run the demo only when this file is executed as a script.
if __name__ == "__main__":
    asyncio.run(main())

Use autoevals for RAG (Retrieval-Augmented Generation) quality:

from autoevals.ragas import (
AnswerRelevancy,
ContextPrecision,
ContextRecall,
Faithfulness,
)
def rag_faithfulness_guardrail(threshold: float = 0.7) -> OutputGuardrail:
    """Ensure response is faithful to retrieved context.

    Args:
        threshold: Minimum acceptable faithfulness score. An output scoring
            strictly below this value trips the guardrail.

    Returns:
        An ``OutputGuardrail`` named ``'rag_faithfulness'``. The validator
        reads ``context`` (default ``[]``) and ``input_context`` (default
        ``''``) from its keyword arguments.
    """
    evaluator = Faithfulness()

    async def _validate(output: str, **kwargs) -> GuardrailResult:
        context = kwargs.get('context', [])
        question = kwargs.get('input_context', '')
        # The scorer takes ``input`` and ``context`` as separate keyword
        # arguments; bundling them in a dict passed as a third positional
        # argument does not reach those parameters.
        result = await evaluator.eval_async(
            output,  # answer
            None,  # expected (not used)
            input=question,
            context=context,
        )
        if result.score < threshold:
            return {
                'tripwire_triggered': True,
                'message': f'Faithfulness: {result.score:.2f} < {threshold}',
                'severity': 'high',
                'suggestion': 'Response should be grounded in the provided context',
            }
        return {'tripwire_triggered': False}

    return OutputGuardrail(_validate, name='rag_faithfulness')

Comparison: autoevals vs Built-in LLM Judge

Section titled “Comparison: autoevals vs Built-in LLM Judge”
| Feature | autoevals | Built-in LLM Judge |
| --- | --- | --- |
| Evaluator variety | Many specialized | General-purpose |
| RAG support | Yes (RAGAS) | No |
| Factuality | Specialized | Via rubric |
| Local models | Yes (Ollama) | Yes |
| Setup | Separate install | Built-in |