from patronus import Client, Row, Evaluator, Dataset, EvaluationResult, read_jsonl
from openai import OpenAI
import json
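
# Initialize the OpenAI client used by the LLM judge and the Patronus client used for datasets, evaluators, and experiments.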
oai = OpenAI()
cli = Client()


async def load_random_subset() -> Dataset:
    """Load a random subset of samples from a remote dataset.

    Returns:
        Dataset: A dataset containing a random subset of samples from the remote dataset.
    """
    loader = cli.remote_dataset("d-62ikqnjm461p9cod")
    dataset = await loader.load()
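    # Randomly sample 10 rows from the dataset's underlying DataFrame and wrap them in a new Dataset.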
    subset = dataset.df.sample(n=10)
    return Dataset.from_dataframe(subset, dataset_id="random-subset")


class LLMJudge(Evaluator):
    """User-defined LLM judge that evaluates model outputs against gold answers.

    Uses an LLM to score responses on a binary scale (0 or 1) based on correctness.
    Responses are considered passing if their score meets or exceeds the pass_threshold.
    """

    def __init__(self, pass_threshold: float):
        self.pass_threshold = pass_threshold
        super().__init__()

    def evaluate(self, row: Row) -> EvaluationResult:
        model = "gpt-4o-mini"
        evaluation_prompt = f"""
        Given the QUESTION and GOLD ANSWER, is the RESPONSE correct when compared to the GOLD ANSWER?
        Your score should be 0 (False) or 1 (True). You must respond with the following JSON:
        {{
            "score": <score>
        }}
        QUESTION: {row.evaluated_model_input}
        RESPONSE: {row.evaluated_model_output}
        GOLD ANSWER: {row.evaluated_model_gold_answer}
        """
        evaluation_result = (
            oai.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": "Your task is to score AI responses against correct responses (gold answers)."},
                    {"role": "user", "content": evaluation_prompt},
                ],
                temperature=0,
                response_format={"type": "json_object"},
            )
            .choices[0]
            .message.content
        )
        # Parse the JSON response to extract the score
        try:
            score = json.loads(evaluation_result)["score"]
        except (json.JSONDecodeError, KeyError):
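            # Fall back to a failing score if the judge returns malformed or unexpected JSON.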
            score = 0
        return EvaluationResult(
            score_raw=score,
            pass_=score >= self.pass_threshold,
        )


# Loading a remote Patronus evaluator
fuzzy_match = cli.remote_evaluator("judge-small", "patronus:fuzzy-match")

# The framework will handle loading automatically when passed to an experiment
results = cli.experiment(
"Tutorial",
dataset=load_random_subset,
evaluators=[fuzzy_match, LLMJudge(1.0)],
tags={"dataset_type": "simple_qa", "model": "llama"},
experiment_name="Simple QA Llama Experiment",
)
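
# Collect per-row evaluation results into a DataFrame and export them as JSON records.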
df = results.to_dataframe()
df.to_json("results.json", orient="records")