LLM Judges

LLM Judges are evaluators powered by LLMs. You can create an LLM Judge by subclassing the Evaluator class and supplying your own prompts and scoring rubrics.
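
At a minimum, a judge subclasses Evaluator and implements evaluate, which receives a Row and returns an EvaluationResult. The skeleton below shows just that contract; the scoring logic is a placeholder and MyJudge is an illustrative name:

Python
 
from patronus import Row, Evaluator, EvaluationResult

class MyJudge(Evaluator):
    """Skeleton judge: replace the placeholder logic with your own prompt and scoring rubric."""
    def evaluate(self, row: Row) -> EvaluationResult:
        # Placeholder: call your LLM of choice here and turn its verdict into a numeric score.
        score = 1.0
        return EvaluationResult(score_raw=score, pass_=score >= 0.5)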

Here's an example that uses GPT-4o-mini as an LLM judge to score whether the model response matches the gold answer. We send the following prompt to GPT-4o-mini:

Text
 
Given the QUESTION and GOLD ANSWER, is the RESPONSE correct when compared to the GOLD ANSWER?
Your score should be 0 (False) or 1 (True). You must respond with the following JSON:
{
  "score": <score>
}
 
QUESTION: {row.evaluated_model_input}
RESPONSE: {row.evaluated_model_output}
GOLD ANSWER: {row.evaluated_model_gold_answer}
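
The {row....} placeholders are filled in from the Row being evaluated. In Python this is conveniently done with an f-string, escaping the literal braces of the JSON example as {{ and }}. Here is a small illustration using a stand-in object with the same field names (SimpleNamespace is used purely for demonstration, not part of the SDK):

Python
 
from types import SimpleNamespace

# Stand-in for a Row, with the same field names referenced by the prompt template.
row = SimpleNamespace(
    evaluated_model_input="What is the capital of France?",
    evaluated_model_output="Paris",
    evaluated_model_gold_answer="Paris",
)

# Literal braces in the JSON example must be escaped as {{ and }} inside the f-string.
prompt = f"""
Given the QUESTION and GOLD ANSWER, is the RESPONSE correct when compared to the GOLD ANSWER?
Your score should be 0 (False) or 1 (True). You must respond with the following JSON:
{{
  "score": <score>
}}

QUESTION: {row.evaluated_model_input}
RESPONSE: {row.evaluated_model_output}
GOLD ANSWER: {row.evaluated_model_gold_answer}
"""
print(prompt)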

In this case, the Row object contains all the data needed for evaluation (stored in logs). We define the LLMJudge by inheriting from the Evaluator base class:

Python
 
from patronus import Client, Row, Evaluator, EvaluationResult
from openai import OpenAI
import json
 
 
oai = OpenAI()
cli = Client()
 
class LLMJudge(Evaluator):
    """User defined LLM judge that evaluates model outputs against gold answers.
    Uses a LLM to score responses on a binary scale (0 or 1) based on correctness.
    Responses are considered passing if their score meets or exceeds the pass_threshold.
    """
    def __init__(self, pass_threshold: float):
        self.pass_threshold = pass_threshold
        super().__init__()
 
    def evaluate(self, row: Row) -> EvaluationResult:
        model = "gpt-4o-mini"
        evaluation_prompt = """
        Given the QUESTION and GOLD ANSWER, is the RESPONSE correct when compared to the GOLD ANSWER?
        Your score should be 0 (False) or 1 (True). You must respond with the following JSON:
        {
            score: <score>
        }
 
        QUESTION: {row.evaluated_model_input}
        RESPONSE: {row.evaluated_model_output}
        GOLD ANSWER: {row.evaluated_model_gold_answer}
        """
 
        evaluation_result = (
            oai.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": "Your task is to score AI responses against correct responses (gold answers)."},
                    {"role": "user", "content": evaluation_prompt},
                ],
                temperature=0,
                response_format={"type": "json_object"},
            )
            .choices[0]
            .message.content
        )
        # Parse the JSON response and coerce the score to a number; default to 0 on failure
        try:
            score = float(json.loads(evaluation_result)["score"])
        except (json.JSONDecodeError, KeyError, TypeError, ValueError):
            score = 0
 
        return EvaluationResult(
            score_raw=score,
            pass_=score >= self.pass_threshold,
        )

Full Code Example (Experiment)

Below is the full code for using the LLM judge in an experiment with the SimpleQA dataset.

Python
from patronus import Client, Row, Evaluator, Dataset, EvaluationResult
from openai import OpenAI
import json
 
 
oai = OpenAI()
cli = Client()
 
async def load_random_subset() -> Dataset:
    """Load a random subset of samples from a remote dataset.
 
    Returns:
        Dataset: A dataset containing a random subset of samples from the remote dataset.
    """
    loader = cli.remote_dataset("d-62ikqnjm461p9cod")
    dataset = await loader.load()
    subset = dataset.df.sample(n=10)
    return Dataset.from_dataframe(subset, dataset_id="random-subset")
 
 
class LLMJudge(Evaluator):
    """User defined LLM judge that evaluates model outputs against gold answers.
    Uses a LLM to score responses on a binary scale (0 or 1) based on correctness.
    Responses are considered passing if their score meets or exceeds the pass_threshold.
    """
    def __init__(self, pass_threshold: float):
        self.pass_threshold = pass_threshold
        super().__init__()
 
    def evaluate(self, row: Row) -> EvaluationResult:
        model = "gpt-4o-mini"
        evaluation_prompt = """
        Given the QUESTION and GOLD ANSWER, is the RESPONSE correct when compared to the GOLD ANSWER?
        Your score should be 0 (False) or 1 (True). You must respond with the following JSON:
        {
            score: <score>
        }
 
        QUESTION: {row.evaluated_model_input}
        RESPONSE: {row.evaluated_model_output}
        GOLD ANSWER: {row.evaluated_model_gold_answer}
        """
 
        evaluation_result = (
            oai.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": "Your task is to score AI responses against correct responses (gold answers)."},
                    {"role": "user", "content": evaluation_prompt},
                ],
                temperature=0,
                response_format={"type": "json_object"},
            )
            .choices[0]
            .message.content
        )
        # Parse the JSON response and coerce the score to a number; default to 0 on failure
        try:
            score = float(json.loads(evaluation_result)["score"])
        except (json.JSONDecodeError, KeyError, TypeError, ValueError):
            score = 0
 
        return EvaluationResult(
            score_raw=score,
            pass_=score >= self.pass_threshold,
        )
 
# Loading a remote Patronus evaluator
fuzzy_match = cli.remote_evaluator("judge-small", "patronus:fuzzy-match")
 
# The framework will handle loading automatically when passed to an experiment
results = cli.experiment(
    "Tutorial",
    dataset=load_random_subset,
    evaluators=[fuzzy_match, LLMJudge(1.0)],
    tags={"dataset_type": "simple_qa", "model": "llama"},
    experiment_name="Simple QA Llama Experiment",
)
df = results.to_dataframe()
df.to_json("results.json", orient="records")

See Using Evaluators in Logging for how to register LLM judges for logging and real-time monitoring.
