The main entry point for the framework is the run_experiment() function:
```python
from patronus.experiments import run_experiment

experiment = run_experiment(
    dataset=my_dataset,                # Required: What to evaluate
    task=my_task_function,             # Optional: How to process inputs
    evaluators=[my_evaluator],         # Required: How to assess outputs
    tags={"dataset-version": "v1.0"},  # Optional: Tags for the experiment
    max_concurrency=10,                # Optional: Control parallel execution
    project_name="My Project",         # Optional: Override the global project name
    experiment_name="Test Run"         # Optional: Name this experiment run
)
```
Here is a complete, minimal example that uses a small inline dataset, a simulated task, and a Patronus-managed evaluator:

```python
from patronus.evals import RemoteEvaluator
from patronus.experiments import run_experiment

dataset = [
    {
        "task_input": "What is the capital of France?",
        "gold_answer": "Paris"
    },
    {
        "task_input": "Who wrote Romeo and Juliet?",
        "gold_answer": "William Shakespeare"
    }
]

# Define a task (in a real scenario, this would call an LLM)
def answer_question(row, **kwargs):
    if "France" in row.task_input:
        return "The capital of France is Paris."
    return "I don't know the answer to that question."

run_experiment(
    dataset=dataset,
    task=answer_question,
    evaluators=[
        # Use a Patronus-managed evaluator
        RemoteEvaluator("judge", "patronus:fuzzy-match"),
    ],
    tags={"model": "simulated", "version": "v1"}
)
```
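For intuition about what a fuzzy-match style check does, the sketch below approximates one locally with nothing but the standard library. This is not the `patronus:fuzzy-match` implementation (that evaluator runs on the Patronus side); the function name and threshold here are made up purely for illustration.

```python
# Illustrative only: a rough local approximation of fuzzy matching,
# NOT the patronus:fuzzy-match evaluator. The threshold is arbitrary.
from difflib import SequenceMatcher

def rough_fuzzy_match(output: str, gold_answer: str, threshold: float = 0.6) -> bool:
    """Pass if the gold answer appears in the output, or the strings are broadly similar."""
    output_l, gold_l = output.lower(), gold_answer.lower()
    if gold_l in output_l:
        return True
    return SequenceMatcher(None, output_l, gold_l).ratio() >= threshold

# The first row's simulated answer would pass; the second would fail.
print(rough_fuzzy_match("The capital of France is Paris.", "Paris"))                            # True
print(rough_fuzzy_match("I don't know the answer to that question.", "William Shakespeare"))    # False
```

In the experiment above, the second row is expected to fail: the simulated task only knows the answer to the France question, which is exactly the kind of gap the evaluator surfaces in the results.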