# Copyright (c) 2025 Microsoft Corporation.
import os
from pathlib import Path
from typing import cast
import pandas as pd
from pydantic import SecretStr
from rich import print as rich_print
from benchmark_qed.autoe.pairwise_scores import analyze_criteria, get_pairwise_scores
from benchmark_qed.autoe.reference_scores import (
get_reference_scores,
summarize_reference_scores,
)
from benchmark_qed.cli.utils import print_df
from benchmark_qed.config.llm_config import (
LLMConfig,
LLMProvider,
)
from benchmark_qed.config.model.score import (
pairwise_scores_criteria,
reference_scores_criteria,
)
from benchmark_qed.llm.factory import ModelFactory
AutoE¶
# nest_asyncio lets the async LLM calls run inside the notebook's already-running event loop
import nest_asyncio
nest_asyncio.apply()
# load environment variables (e.g. OPENAI_API_KEY) from a local .env file
%load_ext dotenv
%dotenv
Relative Comparisons of RAG Methods¶
The AutoE component automates relative comparisons of RAG methods using the LLM-as-a-judge approach. It presents an LLM with pairs of answers, along with the query and target metric, in a counterbalanced order. The model then judges whether the first answer wins, loses, or ties with the second. Aggregating these judgments across multiple queries and trials yields win rates for each method.
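AutoE handles the counterbalancing internally, but the requirement for an even number of trials is easy to picture: each presentation order of the answer pair should appear equally often, so any positional bias of the judge averages out. A purely illustrative sketch of one way to balance the orders:
# Illustrative only: with counterbalancing, each presentation order of the answer
# pair appears equally often across trials, which is why the trial count must be even.
n_trials = 4
presentation_orders = [
    ("answer_1", "answer_2") if t % 2 == 0 else ("answer_2", "answer_1")
    for t in range(n_trials)
]
rich_print(presentation_orders)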
In the example below, we compare Vector RAG (baseline) with two competing methods: GraphRAG's Global Search and LazyGraphRAG. We use 100 synthetic questions (50 activity-local and 50 activity-global) generated from 1,397 AP News health-related articles using AutoQ. Each query is evaluated in four counterbalanced trials across four default metrics (comprehensiveness, diversity, empowerment, and relevance), using GPT-4.1 as the judge.
We hypothesize that GraphRAG’s Global Search, which is optimized for global questions, will outperform Vector RAG on global queries but underperform on local ones. LazyGraphRAG, a hybrid method, is expected to perform well on both.
Choosing the right LLM judge is critical: less capable models may introduce biases and yield unreliable results. A useful first step in validating a judge model is to run an A/A test — comparing a RAG method against itself. This should result in a ~0.5 win rate with no statistically significant differences.
# Config LLM model to be used as judge
llm_config = LLMConfig(
model="gpt-4.1",
api_key=SecretStr(os.environ["OPENAI_API_KEY"]),
llm_provider=LLMProvider.OpenAIChat,
concurrent_requests=32,
call_args={"temperature": 0.0, "seed": 42},
)
llm_client = ModelFactory.create_chat_model(llm_config)
# Config conditions for comparison
base = "vector_rag"
others = ["lazygraphrag", "graphrag_global"]
question_sets = ["activity_global", "activity_local"]
trials = 4 # number of trials to run for each combination of [query, base, other]. Trials must be an even number to support counterbalancing.
alpha = 0.05 # significance level used for statistical tests
input_dir = "./example_answers"
output_dir = Path("./output/win_rates")
if not output_dir.exists():
output_dir.mkdir(parents=True)
# load default criteria. You can also define your own criteria as a list of Criteria objects
criteria = pairwise_scores_criteria()
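Before running the real comparisons, the A/A test mentioned above can reuse the same call with one method on both sides. The snippet below is a sketch under the same configuration; the "_a"/"_b" suffixes are labels only, so the two sides stay distinguishable in the output, and the expectation is win rates near 0.5 with no significant differences.
# Optional A/A sanity check (sketch): score the baseline against itself on one question set.
aa_result = get_pairwise_scores(
    llm_client=llm_client,
    llm_config=llm_config,
    base_name=f"{base}_a",
    other_name=f"{base}_b",
    base_answers=pd.read_json(f"{input_dir}/{base}/activity_global.json"),
    other_answers=pd.read_json(f"{input_dir}/{base}/activity_global.json"),
    criteria=criteria,
    trials=trials,
    include_score_id_in_prompt=True,
    question_id_key="question_id",
)
aa_result["question_set"] = "activity_global"
print_df(analyze_criteria(aa_result, alpha=alpha), "A/A Sanity Check")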
# run pairwise comparisons for each question set and each pair of [base, other].
all_results = []
for question_set in question_sets:
for other in others:
rich_print(f"Processing {base} vs {other} for question set: {question_set}")
result = get_pairwise_scores(
llm_client=llm_client,
llm_config=llm_config,
base_name=base,
other_name=other,
base_answers=pd.read_json(f"{input_dir}/{base}/{question_set}.json"),
other_answers=pd.read_json(f"{input_dir}/{other}/{question_set}.json"),
criteria=criteria,
trials=trials,
include_score_id_in_prompt=True,
question_id_key="question_id",
)
result["question_set"] = question_set
all_results.append(result)
# save pairwise results for each question set and pair of [base, other]
result.to_csv(
output_dir / f"{question_set}_{base}--{other}.csv",
index=False,
)
# save all pairwise results in a single file
all_results_df = pd.concat(all_results, ignore_index=True)
all_results_df.to_csv(output_dir / "win_rates.csv", index=False)
# perform significance testing on the results
significance_test_results = analyze_criteria(
all_results_df,
alpha=alpha,
)
significance_test_results.to_csv(output_dir / "winrates_sig_tests.csv", index=False)
print_df(
cast(
pd.DataFrame,
significance_test_results[
[
"question_set",
"criteria",
"base_name",
"other_name",
"base_mean",
"other_mean",
"formatted_corrected_p_value",
]
],
),
"Win Rates Summary",
)
rich_print("Model usage statistics:")
rich_print(llm_client.get_usage())
Reference-based Scoring¶
When reference answers (such as ground truth or "gold standard" responses) are available, AutoE can evaluate RAG-generated answers against these references using metrics like correctness, completeness, or other user-defined criteria on a customizable scoring scale.
In the example below, we use the same 100 synthetic questions (50 activity-local and 50 activity-global) generated by AutoQ. Since AutoQ does not provide ground-truth answers, we use LazyGraphRAG as the reference method because it achieved the best win rates in the pairwise relative comparisons above. We then score the answers from Vector RAG against those from LazyGraphRAG using the default metrics (correctness and completeness) on a scale from 1 to 10.
# Config LLM model to be used as judge
llm_config = LLMConfig(
model="gpt-4.1",
api_key=SecretStr(os.environ["OPENAI_API_KEY"]),
llm_provider=LLMProvider.OpenAIChat,
concurrent_requests=32,
call_args={"temperature": 0.0, "seed": 42},
)
llm_client = ModelFactory.create_chat_model(llm_config)
# Config conditions for comparison
reference = "lazygraphrag"
generated_rags = [
"vector_rag"
] # you can add more generated RAGs to compare against the reference
question_sets = ["activity_global", "activity_local"]
trials = 4 # number of trials must be an even number to support counterbalancing
input_dir = "./example_answers"
output_dir = Path("./output/reference_scores")
if not output_dir.exists():
output_dir.mkdir(parents=True)
# load default criteria (correctness and completeness). You can also define your own criteria as a list of Criteria objects
criteria = reference_scores_criteria()
# run comparisons for each question set and each pair of [generated, reference].
all_results = []
all_summaries = []
for question_set in question_sets:
for generated in generated_rags:
rich_print(
f"Comparing {generated} vs. {reference} for question set: {question_set}"
)
result = get_reference_scores(
llm_client=llm_client,
llm_config=llm_config,
reference_answers=pd.read_json(
f"{input_dir}/{reference}/{question_set}.json"
),
generated_answers=pd.read_json(
f"{input_dir}/{generated}/{question_set}.json"
),
criteria=criteria,
trials=trials,
score_min=1,
score_max=10,
include_score_id_in_prompt=True,
question_id_key="question_id",
)
all_results.append(result)
result.to_csv(
output_dir / f"{question_set}_{reference}--{generated}.csv",
index=False,
)
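# summarize the per-criterion scores for this comparison and record the question set and method names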
summary_df = summarize_reference_scores(result)
summary_df["question_set"] = question_set
summary_df["reference"] = reference
summary_df["generated"] = generated
all_summaries.append(summary_df)
# save all results into a single file
all_results_df = pd.concat(all_results, ignore_index=True)
all_results_df.to_csv(output_dir / "reference_scores.csv", index=False)
all_summary_df = pd.concat(all_summaries, ignore_index=True)
print_df(
all_summary_df[
["question_set", "criteria", "reference", "generated", "mean", "std"]
].reset_index(),
"Reference Scores Summary",
)
all_summary_df.to_csv(output_dir / "reference_scores_summary.csv", index=False)
Assertion-based Scoring¶
Assertion-based scoring evaluates RAG-generated answers by checking whether they contain specific factual assertions or claims that should be present according to a reference or gold standard. This approach is especially useful for tasks where the presence or absence of key facts is more important than holistic correctness or completeness.
In this example, we use the same synthetic questions and answers as before. We assume that a set of reference assertions has been extracted for each question (e.g., using an information extraction pipeline or manual annotation). The LLM judge is tasked with verifying whether each assertion is supported by the generated answer.
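The assertions file is expected to hold, for each question, a list of assertion strings; the loading code below flattens that list into one row per assertion. The record here is hypothetical (the column names mirror the keys used later in this notebook and should be checked against your own file), but the shape looks roughly like this:
# Hypothetical record showing the assumed shape of the assertions file: one entry
# per question with a list of assertion strings to verify against the answer.
example_assertions = pd.DataFrame([
    {
        "question_id": "example-question-id",
        "question_text": "Example question text?",
        "assertions": [
            "First fact the answer should state.",
            "Second fact the answer should state.",
        ],
    }
])
example_assertions.explode("assertions").rename(columns={"assertions": "assertion"})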
# Config LLM model to be used as judge
llm_config = LLMConfig(
model="gpt-4.1",
api_key=SecretStr(os.environ["OPENAI_API_KEY"]),
llm_provider=LLMProvider.OpenAIChat,
concurrent_requests=100,
call_args={"temperature": 0.0, "seed": 42},
)
llm_client = ModelFactory.create_chat_model(llm_config)
# Config conditions for assertion scoring
assertions_file = "activity_global_assertions.json"
generated_rag = "vector_rag"
pass_threshold = 0.5
trials = 4 # number of trials
input_dir = "./example_answers"
output_dir = Path("./output/assertion_scores")
if not output_dir.exists():
output_dir.mkdir(parents=True)
import numpy as np
from benchmark_qed.autoe.assertion_scores import get_assertion_scores
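# load the generated answers and the reference assertions, flattening each question's assertion list into one row per assertion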
answers = pd.read_json(f"{input_dir}/{generated_rag}/activity_global.json")
assertions = (
pd.read_json(f"{input_dir}/{assertions_file}")
.explode("assertions")
.rename(columns={"assertions": "assertion"})
.reset_index(drop=True)
)
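# ask the LLM judge, across multiple trials, whether each assertion is supported by the generated answer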
assertion_score = get_assertion_scores(
llm_client=llm_client,
llm_config=llm_config,
answers=answers,
assertions=assertions,
trials=trials,
question_id_key="question_id",
question_text_key="question_text",
answer_text_key="answer",
)
assertion_score.to_csv(output_dir / "assertion_scores.csv", index=False)
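# aggregate per-trial judgments for each assertion: an assertion passes when its mean score across trials exceeds pass_threshold; the raw trial scores are kept for summary statistics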
summary_by_assertion = (
assertion_score.groupby(["question", "assertion"])
.agg(score=("score", lambda x: int(x.mean() > pass_threshold)), scores=("score", list))
.reset_index()
)
summary_by_question = (
summary_by_assertion.groupby(["question"])
.agg(
success=("score", lambda x: (x == 1).sum()),
fail=("score", lambda x: (x == 0).sum()),
)
.reset_index()
)
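# per-assertion mean and standard deviation of the judge's scores across trials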
summary_by_assertion["score_mean"] = summary_by_assertion["scores"].apply(
lambda x: np.mean(x) if len(x) > 0 else 0.0
)
summary_by_assertion["score_std"] = summary_by_assertion["scores"].apply(
lambda x: np.std(x) if len(x) > 0 else 0.0
)
summary_by_assertion = summary_by_assertion.drop(columns=["scores"])
print_df(
summary_by_question,
"Assertion Scores Summary by Question",
)
failed_assertions: pd.DataFrame = cast(
pd.DataFrame, summary_by_assertion[summary_by_assertion["score"] == 0]
)
failed_assertions = failed_assertions.drop(columns=["score"])
if len(failed_assertions) > 0:
print_df(
failed_assertions,
f"[bold red]{failed_assertions.shape[0]} Failed Assertions[/bold red]",
)
rich_print(
f"[bold red]{failed_assertions.shape[0]} assertions failed. See {output_dir / 'assertion_scores.csv'} for details.[/bold red]"
)
else:
rich_print("[bold green]All assertions passed.[/bold green]")