# Copyright (c) 2025 Microsoft Corporation.
import os
from pathlib import Path
from typing import cast
import pandas as pd
from pydantic import SecretStr
from rich import print as rich_print
from benchmark_qed.autoe.assertion import (
HierarchicalMode,
load_and_normalize_hierarchical_assertions,
run_assertion_evaluation,
run_hierarchical_assertion_evaluation,
)
from benchmark_qed.autoe.pairwise import analyze_criteria, get_pairwise_scores
from benchmark_qed.autoe.reference import (
get_reference_scores,
summarize_reference_scores,
)
from benchmark_qed.cli.utils import print_df
from benchmark_qed.config.llm_config import (
LLMConfig,
LLMProvider,
)
from benchmark_qed.config.model.score import (
pairwise_scores_criteria,
reference_scores_criteria,
)
from benchmark_qed.llm.factory import ModelFactory
AutoE¶
# Patch the already-running Jupyter event loop so the async AutoE calls below
# can be awaited from inside the notebook.
import nest_asyncio
nest_asyncio.apply()
# Load environment variables (e.g. OPENAI_API_KEY) from a local .env file.
%load_ext dotenv
%dotenv
cannot find .env file
Pairwise Comparisons of RAG Methods¶
The AutoE component automates relative comparisons of RAG methods using the LLM-as-a-judge approach. It presents an LLM with pairs of answers, along with the query and target metric, in a counterbalanced order. The model then judges whether the first answer wins, loses, or ties with the second. Aggregating these judgments across multiple queries and trials yields win rates for each method.
In the example below, we compare Vector RAG with short context (retrieves 50 text chunks) against Vector RAG with long context (retrieves 200 text chunks). We use synthetic questions generated from AP News health-related articles using AutoQ, covering data-global, data-local, and data-linked question types. Each query is evaluated in 4 counterbalanced trials across four default metrics (comprehensiveness, diversity, empowerment, and relevance), using GPT-5.2 as the judge.
Choosing the right LLM judge is critical: less capable models may introduce biases and yield unreliable results. A useful first step in validating a judge model is to run an A/A test — comparing a RAG method against itself. This should result in a ~0.5 win rate with no statistically significant differences.
# Configure the LLM that acts as the pairwise judge. The API key is read from
# the environment (populated by the dotenv step at the top of the notebook).
judge_call_args = {"temperature": 0.0, "seed": 42}  # deterministic judging
llm_config = LLMConfig(
    model="gpt-5.2",
    llm_provider=LLMProvider.OpenAIChat,
    api_key=SecretStr(os.environ["OPENAI_API_KEY"]),
    concurrent_requests=32,  # max in-flight judge requests
    call_args=judge_call_args,
)
llm_client = ModelFactory.create_chat_model(llm_config)
# Conditions under comparison: each method in `others` is judged against `base`.
base = "vector_rag_short_context"
others = ["vector_rag_long_context"]
question_sets = ["data_global", "data_local", "data_linked"]

# Trials per [query, base, other] combination. Must be even so each ordering of
# the answer pair is shown equally often (counterbalancing).
trials = 4
alpha = 0.05  # significance level used for the statistical tests

input_dir = "./example_answers"
output_dir = Path("./output/win_rates")
output_dir.mkdir(parents=True, exist_ok=True)

# Default pairwise criteria. A custom list of Criteria objects also works here.
criteria = pairwise_scores_criteria()
# Run the pairwise comparisons for every question set and [base, other] pair.
def _load_answers(method: str, qset: str) -> pd.DataFrame:
    """Read the generated answers for one RAG method and question set."""
    return pd.read_json(f"{input_dir}/{method}/{qset}_answers.json")

all_results = []
for question_set in question_sets:
    for other in others:
        rich_print(f"Processing {base} vs {other} for question set: {question_set}")
        scores = get_pairwise_scores(
            llm_client=llm_client,
            llm_config=llm_config,
            base_name=base,
            other_name=other,
            base_answers=_load_answers(base, question_set),
            other_answers=_load_answers(other, question_set),
            criteria=criteria,
            trials=trials,
            include_score_id_in_prompt=True,
            question_id_key="question_id",
            question_text_key="question",  # column name in the answer files
        )
        scores["question_set"] = question_set
        all_results.append(scores)
        # Persist the per-[question set, pair] results.
        scores.to_csv(
            output_dir / f"{question_set}_{base}--{other}.csv",
            index=False,
        )

# Combine every comparison into a single table and save it.
all_results_df = pd.concat(all_results, ignore_index=True)
all_results_df.to_csv(output_dir / "win_rates.csv", index=False)

# Statistical significance testing on the aggregated judgments.
significance_test_results = analyze_criteria(all_results_df, alpha=alpha)
significance_test_results.to_csv(output_dir / "winrates_sig_tests.csv", index=False)

summary_columns = [
    "question_set",
    "criteria",
    "base_name",
    "other_name",
    "base_mean",
    "other_mean",
    "formatted_corrected_p_value",
]
print_df(
    cast(pd.DataFrame, significance_test_results[summary_columns]),
    "Win Rates Summary",
)
rich_print("Model usage statistics:")
rich_print(llm_client.get_usage())
Reference-based Scoring¶
When reference answers (such as ground truth or "gold standard" responses) are available, AutoE can evaluate RAG-generated answers against these references using metrics like correctness, completeness, or other user-defined criteria on a customizable scoring scale.
In the example below, we use the long context version as the reference method. Note that this is not ground truth—we use it here purely to demonstrate the reference-based scoring workflow. While more context can sometimes lead to more complete answers, this relationship is not universal: it depends on the model, the specific context window, and the nature of the questions. We then score the answers from short context against those from the long context version using the default metrics (correctness and completeness) on a scale from 1 to 10.
# Config LLM model to be used as judge
llm_config = LLMConfig(
    model="gpt-5.2",
    # Key comes from the environment; the dotenv setup loads it from .env.
    api_key=SecretStr(os.environ["OPENAI_API_KEY"]),
    llm_provider=LLMProvider.OpenAIChat,
    concurrent_requests=32,  # max in-flight judge requests
    call_args={"temperature": 0.0, "seed": 42},  # deterministic judging
)
llm_client = ModelFactory.create_chat_model(llm_config)
# Conditions: score each generated method against a reference method.
# Long context serves as the reference here (often more complete answers);
# this is a workflow demo, not ground truth.
reference = "vector_rag_long_context"
generated_rags = ["vector_rag_short_context"]  # short context to evaluate
question_sets = ["data_global", "data_local", "data_linked"]
trials = 4  # must be an even number to support counterbalancing

input_dir = "./example_answers"
output_dir = Path("./output/reference_scores")
output_dir.mkdir(parents=True, exist_ok=True)

# Default criteria (correctness and completeness). A custom list of Criteria
# objects also works here.
criteria = reference_scores_criteria()
# Score each generated method against the reference for every question set.
def _read_answers(method: str, qset: str) -> pd.DataFrame:
    """Read the answers file for one RAG method and question set."""
    return pd.read_json(f"{input_dir}/{method}/{qset}_answers.json")

all_results = []
all_summaries = []
for question_set in question_sets:
    for generated in generated_rags:
        rich_print(
            f"Comparing {generated} vs. {reference} for question set: {question_set}"
        )
        scores = get_reference_scores(
            llm_client=llm_client,
            llm_config=llm_config,
            reference_answers=_read_answers(reference, question_set),
            generated_answers=_read_answers(generated, question_set),
            criteria=criteria,
            trials=trials,
            score_min=1,
            score_max=10,
            include_score_id_in_prompt=True,
            question_id_key="question_id",
            question_text_key="question",  # column name in the answer files
        )
        all_results.append(scores)
        scores.to_csv(
            output_dir / f"{question_set}_{reference}--{generated}.csv",
            index=False,
        )
        # Summarize this combination and tag it with its provenance.
        summary_df = summarize_reference_scores(scores)
        summary_df["question_set"] = question_set
        summary_df["reference"] = reference
        summary_df["generated"] = generated
        all_summaries.append(summary_df)

# Save the raw scores and the combined summary.
all_results_df = pd.concat(all_results, ignore_index=True)
all_results_df.to_csv(output_dir / "reference_scores.csv", index=False)

all_summary_df = pd.concat(all_summaries, ignore_index=True)
print_df(
    all_summary_df[
        ["question_set", "criteria", "reference", "generated", "mean", "std"]
    ].reset_index(drop=True),
    "Reference Scores Summary",
)
all_summary_df.to_csv(output_dir / "reference_scores_summary.csv", index=False)
Assertion-based Scoring¶
Assertion-based scoring evaluates RAG-generated answers by checking whether they contain specific factual assertions or claims that should be present according to a reference or gold standard. This approach is especially useful for tasks where the presence or absence of key facts is more important than holistic correctness or completeness.
Standard Assertions (for data-local and data-linked questions)¶
For data-local and data-linked questions, we use standard (flat) assertions. These are straightforward factual claims that can be independently verified against the generated answer. The LLM judge checks whether each assertion is supported by the answer.
# Config LLM model to be used as judge
llm_config = LLMConfig(
    model="gpt-5.2",
    # Key comes from the environment; the dotenv setup loads it from .env.
    api_key=SecretStr(os.environ["OPENAI_API_KEY"]),
    llm_provider=LLMProvider.OpenAIChat,
    # Higher concurrency than the pairwise runs: assertion checks are small,
    # independent calls.
    concurrent_requests=100,
    call_args={"temperature": 0.0, "seed": 42},  # deterministic judging
)
llm_client = ModelFactory.create_chat_model(llm_config)
# Standard (flat) assertion scoring applies to data-local and data-linked
# question types.
generated_rags = ["vector_rag_short_context", "vector_rag_long_context"]
question_sets = [
    "data_local",
    "data_linked",
]
pass_threshold = 0.5
trials = 2

input_dir = Path("./example_answers")
output_dir = Path("./output/assertion_scores")
output_dir.mkdir(parents=True, exist_ok=True)

# run_assertion_evaluation scores every RAG method in one call and runs the
# significance tests as part of the pipeline.
results_df = run_assertion_evaluation(
    llm_client=llm_client,
    llm_config=llm_config,
    question_sets=question_sets,
    generated_rags=generated_rags,
    input_dir=str(input_dir),
    output_dir=output_dir,
    trials=trials,
    top_k_assertions=None,  # use every assertion, no filtering
    pass_threshold=pass_threshold,
    # Assertion files live directly under input_dir (not in RAG subdirs).
    assertions_filename_template="{question_set}_assertions.json",  # noqa: RUF027
    # Answer files live in per-RAG subdirectories, named by question set.
    answers_path_template="{input_dir}/{generated_rag}/{question_set}_answers.json",
    run_significance_test=True,  # Friedman/Wilcoxon tests
    significance_alpha=0.05,
    significance_correction="holm",
    question_text_key="question",  # column name in the answer files
    answer_text_key="answer",
)
print_df(results_df, "Assertion Scoring Results Summary")
Processing question set: data_local
Using all assertions (no filtering)
Processing vector_rag_short_context for data_local
Output()
vector_rag_short_context (data_local): 2 assertions failed
vector_rag_short_context (data_local) - Overall accuracy: 0.970 (65/67), Avg question pass rate: 0.973
Processing vector_rag_long_context for data_local
Output()
vector_rag_long_context (data_local): 2 assertions failed
vector_rag_long_context (data_local) - Overall accuracy: 0.970 (65/67), Avg question pass rate: 0.973
Processing question set: data_link
Using all assertions (no filtering)
Processing vector_rag_short_context for data_link
Output()
vector_rag_short_context (data_link): 22 assertions failed
vector_rag_short_context (data_link) - Overall accuracy: 0.836 (112/134), Avg question pass rate: 0.854
Processing vector_rag_long_context for data_link
Output()
vector_rag_long_context (data_link): 13 assertions failed
vector_rag_long_context (data_link) - Overall accuracy: 0.903 (121/134), Avg question pass rate: 0.922
Overall Assertion Scores Summary by Question Set and RAG Method ┏━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━┓ ┃ question_… ┃ rag_method ┃ total_ass… ┃ successfu… ┃ failed_a… ┃ overall_a… ┃ avg_ques… ┃ total_que… ┃ top_k_us… ┃ ┡━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━┩ │ data_link │ vector_ra… │ 134 │ 121 │ 13 │ 0.9029850… │ 0.922333… │ 50 │ all │ │ data_link │ vector_ra… │ 134 │ 112 │ 22 │ 0.8358208… │ 0.854333… │ 50 │ all │ │ data_local │ vector_ra… │ 67 │ 65 │ 2 │ 0.9701492… │ 0.973333… │ 50 │ all │ │ data_local │ vector_ra… │ 67 │ 65 │ 2 │ 0.9701492… │ 0.973333… │ 50 │ all │ └────────────┴────────────┴────────────┴────────────┴───────────┴────────────┴───────────┴────────────┴───────────┘
Assertion Accuracy Comparison (Pivot View) ┏━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┓ ┃ rag_method ┃ data_link ┃ data_local ┃ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━┩ │ vector_rag_long_context │ 0.9029850746268657 │ 0.9701492537313433 │ │ vector_rag_short_context │ 0.835820895522388 │ 0.9701492537313433 │ └──────────────────────────┴────────────────────┴────────────────────┘
Statistical significance test for data_local
c:\Users\trinhha\Documents\sources\benchmark-qed\.venv\Lib\site-packages\scipy\stats\_axis_nan_policy.py:579: UserWarning: scipy.stats.shapiro: Input data has range zero. The results may not be accurate. res = hypotest_fun_out(*samples, **kwds)
Paired t-test (normal data): statistic=nan, p=nan (not significant)
No significant pairwise differences found.
Statistical significance test for data_link
Wilcoxon signed-rank test (non-normal data): statistic=6.0000, p=0.0506 (not significant)
No significant pairwise differences found.
Assertion Scoring Results Summary ┏━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━┓ ┃ question_… ┃ rag_method ┃ total_ass… ┃ successfu… ┃ failed_a… ┃ overall_a… ┃ avg_ques… ┃ total_que… ┃ top_k_us… ┃ ┡━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━┩ │ data_link │ vector_ra… │ 134 │ 121 │ 13 │ 0.9029850… │ 0.922333… │ 50 │ all │ │ data_link │ vector_ra… │ 134 │ 112 │ 22 │ 0.8358208… │ 0.854333… │ 50 │ all │ │ data_local │ vector_ra… │ 67 │ 65 │ 2 │ 0.9701492… │ 0.973333… │ 50 │ all │ │ data_local │ vector_ra… │ 67 │ 65 │ 2 │ 0.9701492… │ 0.973333… │ 50 │ all │ └────────────┴────────────┴────────────┴────────────┴───────────┴────────────┴───────────┴────────────┴───────────┘
# Report token usage accumulated by the judge client across all calls.
rich_print("\nModel usage statistics:")
rich_print(llm_client.get_usage())
Model usage statistics:
{ 'model': 'gpt-5.2', 'prompt_tokens': 596720, 'completion_tokens': 77619, 'total_tokens': 674339, 'prompt_cached_tokens': 0, 'completion_reasoning_tokens': 0, 'accepted_prediction_tokens': 0, 'rejected_prediction_tokens': 0, 'total_calls': 804 }
Hierarchical Assertions (for data-global questions)¶
For data-global questions, we use hierarchical assertions. These have a global assertion with supporting (local) assertions, providing deeper insight into answer quality:
- Global assertion pass/fail: Whether the main assertion is satisfied
- Support coverage: What fraction of supporting assertions are satisfied
- Discovery detection: Whether the answer contains relevant information beyond what's covered by the supporting assertions
This is particularly useful for global questions where answers may partially satisfy complex requirements.
# Configure the LLM judge for hierarchical assertion scoring.
llm_config = LLMConfig(
    model="gpt-5.2",
    llm_provider=LLMProvider.OpenAIChat,
    api_key=SecretStr(os.environ["OPENAI_API_KEY"]),
    concurrent_requests=100,  # assertion checks are small, independent calls
    call_args={"temperature": 0.0, "seed": 42},  # deterministic judging
)
llm_client = ModelFactory.create_chat_model(llm_config)

# Hierarchical assertion scoring applies to data-global questions only. The
# assertions file contains entries with a "supporting_assertions" field.
hierarchical_assertions_file = "data_global_assertions.json"
generated_rags = ["vector_rag_short_context", "vector_rag_long_context"]
pass_threshold = 0.5
trials = 2  # number of trials for each assertion

# Evaluation mode:
# - JOINT: one LLM call evaluates the global and supporting assertions
#   together. Cheaper, but may be less accurate than STAGED.
# - STAGED: evaluate global assertions first, then supporting assertions only
#   for the globals that passed; keeps the global pass rate aligned with
#   standard scoring.
hierarchical_mode = HierarchicalMode.STAGED

input_dir = Path("./example_answers")
output_dir = Path("./output/hierarchical_assertion_scores")
output_dir.mkdir(parents=True, exist_ok=True)

# Load and flatten the hierarchical assertions for data-global questions.
assertions = load_and_normalize_hierarchical_assertions(
    input_dir / hierarchical_assertions_file,
)
rich_print(
    f"Loaded {len(assertions)} hierarchical assertions for data-global questions"
)
Loaded 856 hierarchical assertions for data-global questions
# Run hierarchical assertion evaluation for all RAG methods.
# The pipeline handles scoring, aggregation, cross-method comparison, and
# significance tests, and returns the comparison table.
comparison_df = run_hierarchical_assertion_evaluation(
    llm_client=llm_client,
    llm_config=llm_config,
    generated_rags=generated_rags,
    assertions=assertions,
    input_dir=str(input_dir),
    output_dir=output_dir,
    trials=trials,
    pass_threshold=pass_threshold,
    mode=hierarchical_mode,  # JOINT or STAGED evaluation
    # Only data-global answers are evaluated hierarchically.
    answers_path_template="{input_dir}/{generated_rag}/data_global_answers.json",
    run_significance_test=True,
    significance_alpha=0.05,
    significance_correction="holm",
    # Optional: run assertion-level clustered permutation tests
    # as a secondary analysis that accounts for within-question correlation
    run_clustered_permutation=True,
    n_permutations=10_000,
    permutation_seed=42,
    question_id_key="question_id",
    question_text_key="question",  # Column name in the answer files
    answer_text_key="answer",
    # Field in the assertions file that holds the local (supporting) assertions.
    supporting_assertions_key="supporting_assertions",
)
Processing vector_rag_short_context
Step 1: Standard global assertion scoring...
Output()
Global assertions: 401/856 passed (46.8%)
Global pass rate (per-question avg): 47.8%
Step 2: Supporting + discovery for 401 passed assertions...
Output()
# The pipeline already printed and saved a significance summary table; reload
# the CSV it wrote for any further analysis.
significance_summary_path = output_dir / "significance_summary.csv"
sig_summary = pd.read_csv(significance_summary_path)
print_df(sig_summary, "Significance Test Summary")
notebook controller is DISPOSED. View Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details.
# Alternatively, you can run significance tests separately and build the summary
# yourself using summarize_significance_results(). This is useful when you have
# pre-computed aggregated scores and want to re-run tests with different parameters.
# See the compare_hierarchical_assertion_scores_significance and
# summarize_significance_results functions in benchmark_qed.autoe.assertion.
notebook controller is DISPOSED. View Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details.