# Copyright (c) 2025 Microsoft Corporation.
import os
from pathlib import Path
from typing import cast
import pandas as pd
from pydantic import SecretStr
from rich import print as rich_print
from benchmark_qed.autoe.assertion import (
HierarchicalMode,
load_and_normalize_hierarchical_assertions,
run_assertion_evaluation,
run_hierarchical_assertion_evaluation,
)
from benchmark_qed.autoe.pairwise import analyze_criteria, get_pairwise_scores
from benchmark_qed.autoe.reference import (
get_reference_scores,
summarize_reference_scores,
)
from benchmark_qed.cli.utils import print_df
from benchmark_qed.config.llm_config import (
LLMConfig,
LLMProvider,
)
from benchmark_qed.config.model.score import (
pairwise_scores_criteria,
reference_scores_criteria,
)
from benchmark_qed.llm.factory import ModelFactory
AutoE¶
# Patch the already-running Jupyter event loop so the async AutoE calls below
# can be awaited from inside the notebook.
import nest_asyncio
nest_asyncio.apply()
# Load environment variables (e.g. OPENAI_API_KEY) from a local .env file.
%load_ext dotenv
%dotenv
cannot find .env file
Pairwise Comparisons of RAG Methods¶
The AutoE component automates relative comparisons of RAG methods using the LLM-as-a-judge approach. It presents an LLM with pairs of answers, along with the query and target metric, in a counterbalanced order. The model then judges whether the first answer wins, loses, or ties with the second. Aggregating these judgments across multiple queries and trials yields win rates for each method.
In the example below, we compare Vector RAG with short context (retrieves 50 text chunks) against Vector RAG with long context (retrieves 200 text chunks). We use synthetic questions generated from AP News health-related articles using AutoQ, covering data-global, data-local, and data-linked question types. Each query is evaluated in 4 counterbalanced trials across four default metrics (comprehensiveness, diversity, empowerment, and relevance), using GPT-5.2 as the judge.
Choosing the right LLM judge is critical: less capable models may introduce biases and yield unreliable results. A useful first step in validating a judge model is to run an A/A test — comparing a RAG method against itself. This should result in a ~0.5 win rate with no statistically significant differences.
# Configure the LLM that acts as the pairwise judge. The API key is read from
# the environment (populated by the dotenv step at the top of the notebook).
judge_call_args = {"temperature": 0.0, "seed": 42}  # deterministic judging
llm_config = LLMConfig(
    model="gpt-5.2",
    llm_provider=LLMProvider.OpenAIChat,
    api_key=SecretStr(os.environ["OPENAI_API_KEY"]),
    concurrent_requests=32,  # max in-flight judge requests
    call_args=judge_call_args,
)
llm_client = ModelFactory.create_chat_model(llm_config)
# Conditions under comparison: each method in `others` is judged against `base`.
base = "vector_rag_short_context"
others = ["vector_rag_long_context"]
question_sets = ["data_global", "data_local", "data_linked"]

# Trials per [query, base, other] combination. Must be even so each ordering of
# the answer pair is shown equally often (counterbalancing).
trials = 4
alpha = 0.05  # significance level used for the statistical tests

input_dir = "./example_answers"
output_dir = Path("./output/win_rates")
output_dir.mkdir(parents=True, exist_ok=True)

# Default pairwise criteria. A custom list of Criteria objects also works here.
criteria = pairwise_scores_criteria()
# Run the pairwise comparisons for every question set and [base, other] pair.
def _load_answers(method: str, qset: str) -> pd.DataFrame:
    """Read the generated answers for one RAG method and question set."""
    return pd.read_json(f"{input_dir}/{method}/{qset}_answers.json")

all_results = []
for question_set in question_sets:
    for other in others:
        rich_print(f"Processing {base} vs {other} for question set: {question_set}")
        scores = get_pairwise_scores(
            llm_client=llm_client,
            llm_config=llm_config,
            base_name=base,
            other_name=other,
            base_answers=_load_answers(base, question_set),
            other_answers=_load_answers(other, question_set),
            criteria=criteria,
            trials=trials,
            include_score_id_in_prompt=True,
            question_id_key="question_id",
            question_text_key="question",  # column name in the answer files
        )
        scores["question_set"] = question_set
        all_results.append(scores)
        # Persist the per-[question set, pair] results.
        scores.to_csv(
            output_dir / f"{question_set}_{base}--{other}.csv",
            index=False,
        )

# Combine every comparison into a single table and save it.
all_results_df = pd.concat(all_results, ignore_index=True)
all_results_df.to_csv(output_dir / "win_rates.csv", index=False)

# Statistical significance testing on the aggregated judgments.
significance_test_results = analyze_criteria(all_results_df, alpha=alpha)
significance_test_results.to_csv(output_dir / "winrates_sig_tests.csv", index=False)

summary_columns = [
    "question_set",
    "criteria",
    "base_name",
    "other_name",
    "base_mean",
    "other_mean",
    "formatted_corrected_p_value",
]
print_df(
    cast(pd.DataFrame, significance_test_results[summary_columns]),
    "Win Rates Summary",
)
rich_print("Model usage statistics:")
rich_print(llm_client.get_usage())
Reference-based Scoring¶
When reference answers (such as ground truth or "gold standard" responses) are available, AutoE can evaluate RAG-generated answers against these references using metrics like correctness, completeness, or other user-defined criteria on a customizable scoring scale.
In the example below, we use the long context version as the reference method. Note that this is not ground truth—we use it here purely to demonstrate the reference-based scoring workflow. While more context can sometimes lead to more complete answers, this relationship is not universal: it depends on the model, the specific context window, and the nature of the questions. We then score the answers from short context against those from the long context version using the default metrics (correctness and completeness) on a scale from 1 to 10.
# Config LLM model to be used as judge
llm_config = LLMConfig(
    model="gpt-5.2",
    # Key comes from the environment; the dotenv setup loads it from .env.
    api_key=SecretStr(os.environ["OPENAI_API_KEY"]),
    llm_provider=LLMProvider.OpenAIChat,
    concurrent_requests=32,  # max in-flight judge requests
    call_args={"temperature": 0.0, "seed": 42},  # deterministic judging
)
llm_client = ModelFactory.create_chat_model(llm_config)
# Conditions: score each generated method against a reference method.
# Long context serves as the reference here (often more complete answers);
# this is a workflow demo, not ground truth.
reference = "vector_rag_long_context"
generated_rags = ["vector_rag_short_context"]  # short context to evaluate
question_sets = ["data_global", "data_local", "data_linked"]
trials = 4  # must be an even number to support counterbalancing

input_dir = "./example_answers"
output_dir = Path("./output/reference_scores")
output_dir.mkdir(parents=True, exist_ok=True)

# Default criteria (correctness and completeness). A custom list of Criteria
# objects also works here.
criteria = reference_scores_criteria()
# Score each generated method against the reference for every question set.
def _read_answers(method: str, qset: str) -> pd.DataFrame:
    """Read the answers file for one RAG method and question set."""
    return pd.read_json(f"{input_dir}/{method}/{qset}_answers.json")

all_results = []
all_summaries = []
for question_set in question_sets:
    for generated in generated_rags:
        rich_print(
            f"Comparing {generated} vs. {reference} for question set: {question_set}"
        )
        scores = get_reference_scores(
            llm_client=llm_client,
            llm_config=llm_config,
            reference_answers=_read_answers(reference, question_set),
            generated_answers=_read_answers(generated, question_set),
            criteria=criteria,
            trials=trials,
            score_min=1,
            score_max=10,
            include_score_id_in_prompt=True,
            question_id_key="question_id",
            question_text_key="question",  # column name in the answer files
        )
        all_results.append(scores)
        scores.to_csv(
            output_dir / f"{question_set}_{reference}--{generated}.csv",
            index=False,
        )
        # Summarize this combination and tag it with its provenance.
        summary_df = summarize_reference_scores(scores)
        summary_df["question_set"] = question_set
        summary_df["reference"] = reference
        summary_df["generated"] = generated
        all_summaries.append(summary_df)

# Save the raw scores and the combined summary.
all_results_df = pd.concat(all_results, ignore_index=True)
all_results_df.to_csv(output_dir / "reference_scores.csv", index=False)

all_summary_df = pd.concat(all_summaries, ignore_index=True)
print_df(
    all_summary_df[
        ["question_set", "criteria", "reference", "generated", "mean", "std"]
    ].reset_index(drop=True),
    "Reference Scores Summary",
)
all_summary_df.to_csv(output_dir / "reference_scores_summary.csv", index=False)
Assertion-based Scoring¶
Assertion-based scoring evaluates RAG-generated answers by checking whether they contain specific factual assertions or claims that should be present according to a reference or gold standard. This approach is especially useful for tasks where the presence or absence of key facts is more important than holistic correctness or completeness.
Standard Assertions (for data-local and data-linked questions)¶
For data-local and data-linked questions, we use standard (flat) assertions. These are straightforward factual claims that can be independently verified against the generated answer. The LLM judge checks whether each assertion is supported by the answer.
# Config LLM model to be used as judge
llm_config = LLMConfig(
    model="gpt-5.2",
    # Key comes from the environment; the dotenv setup loads it from .env.
    api_key=SecretStr(os.environ["OPENAI_API_KEY"]),
    llm_provider=LLMProvider.OpenAIChat,
    # Higher concurrency than the pairwise runs: assertion checks are small,
    # independent calls.
    concurrent_requests=100,
    call_args={"temperature": 0.0, "seed": 42},  # deterministic judging
)
llm_client = ModelFactory.create_chat_model(llm_config)
# Standard (flat) assertion scoring applies to data-local and data-linked
# question types.
generated_rags = ["vector_rag_short_context", "vector_rag_long_context"]
question_sets = [
    "data_local",
    "data_linked",
]
pass_threshold = 0.5
trials = 2

input_dir = Path("./example_answers")
output_dir = Path("./output/assertion_scores")
output_dir.mkdir(parents=True, exist_ok=True)

# run_assertion_evaluation scores every RAG method in one call and runs the
# significance tests as part of the pipeline.
results_df = run_assertion_evaluation(
    llm_client=llm_client,
    llm_config=llm_config,
    question_sets=question_sets,
    generated_rags=generated_rags,
    input_dir=str(input_dir),
    output_dir=output_dir,
    trials=trials,
    top_k_assertions=None,  # use every assertion, no filtering
    pass_threshold=pass_threshold,
    # Assertion files live directly under input_dir (not in RAG subdirs).
    assertions_filename_template="{question_set}_assertions.json",  # noqa: RUF027
    # Answer files live in per-RAG subdirectories, named by question set.
    answers_path_template="{input_dir}/{generated_rag}/{question_set}_answers.json",
    run_significance_test=True,  # Friedman/Wilcoxon tests
    significance_alpha=0.05,
    significance_correction="holm",
    question_text_key="question",  # column name in the answer files
    answer_text_key="answer",
)
print_df(results_df, "Assertion Scoring Results Summary")
Processing question set: data_local
Using all assertions (no filtering)
Processing vector_rag_short_context for data_local
Output()
vector_rag_short_context (data_local): 2 assertions failed
vector_rag_short_context (data_local) - Overall accuracy: 0.970 (65/67), Avg question pass rate: 0.973
Processing vector_rag_long_context for data_local
Output()
vector_rag_long_context (data_local): 2 assertions failed
vector_rag_long_context (data_local) - Overall accuracy: 0.970 (65/67), Avg question pass rate: 0.973
Processing question set: data_link
Using all assertions (no filtering)
Processing vector_rag_short_context for data_link
Output()
vector_rag_short_context (data_link): 22 assertions failed
vector_rag_short_context (data_link) - Overall accuracy: 0.836 (112/134), Avg question pass rate: 0.854
Processing vector_rag_long_context for data_link
Output()
vector_rag_long_context (data_link): 13 assertions failed
vector_rag_long_context (data_link) - Overall accuracy: 0.903 (121/134), Avg question pass rate: 0.922
Overall Assertion Scores Summary by Question Set and RAG Method ┏━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━┓ ┃ question_… ┃ rag_method ┃ total_ass… ┃ successfu… ┃ failed_a… ┃ overall_a… ┃ avg_ques… ┃ total_que… ┃ top_k_us… ┃ ┡━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━┩ │ data_link │ vector_ra… │ 134 │ 121 │ 13 │ 0.9029850… │ 0.922333… │ 50 │ all │ │ data_link │ vector_ra… │ 134 │ 112 │ 22 │ 0.8358208… │ 0.854333… │ 50 │ all │ │ data_local │ vector_ra… │ 67 │ 65 │ 2 │ 0.9701492… │ 0.973333… │ 50 │ all │ │ data_local │ vector_ra… │ 67 │ 65 │ 2 │ 0.9701492… │ 0.973333… │ 50 │ all │ └────────────┴────────────┴────────────┴────────────┴───────────┴────────────┴───────────┴────────────┴───────────┘
Assertion Accuracy Comparison (Pivot View) ┏━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┓ ┃ rag_method ┃ data_link ┃ data_local ┃ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━┩ │ vector_rag_long_context │ 0.9029850746268657 │ 0.9701492537313433 │ │ vector_rag_short_context │ 0.835820895522388 │ 0.9701492537313433 │ └──────────────────────────┴────────────────────┴────────────────────┘
Statistical significance test for data_local
c:\Users\trinhha\Documents\sources\benchmark-qed\.venv\Lib\site-packages\scipy\stats\_axis_nan_policy.py:579: UserWarning: scipy.stats.shapiro: Input data has range zero. The results may not be accurate. res = hypotest_fun_out(*samples, **kwds)
Paired t-test (normal data): statistic=nan, p=nan (not significant)
No significant pairwise differences found.
Statistical significance test for data_link
Wilcoxon signed-rank test (non-normal data): statistic=6.0000, p=0.0506 (not significant)
No significant pairwise differences found.
Assertion Scoring Results Summary ┏━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━┓ ┃ question_… ┃ rag_method ┃ total_ass… ┃ successfu… ┃ failed_a… ┃ overall_a… ┃ avg_ques… ┃ total_que… ┃ top_k_us… ┃ ┡━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━┩ │ data_link │ vector_ra… │ 134 │ 121 │ 13 │ 0.9029850… │ 0.922333… │ 50 │ all │ │ data_link │ vector_ra… │ 134 │ 112 │ 22 │ 0.8358208… │ 0.854333… │ 50 │ all │ │ data_local │ vector_ra… │ 67 │ 65 │ 2 │ 0.9701492… │ 0.973333… │ 50 │ all │ │ data_local │ vector_ra… │ 67 │ 65 │ 2 │ 0.9701492… │ 0.973333… │ 50 │ all │ └────────────┴────────────┴────────────┴────────────┴───────────┴────────────┴───────────┴────────────┴───────────┘
# Report token usage accumulated by the judge client across all calls.
rich_print("\nModel usage statistics:")
rich_print(llm_client.get_usage())
Model usage statistics:
{ 'model': 'gpt-5.2', 'prompt_tokens': 596720, 'completion_tokens': 77619, 'total_tokens': 674339, 'prompt_cached_tokens': 0, 'completion_reasoning_tokens': 0, 'accepted_prediction_tokens': 0, 'rejected_prediction_tokens': 0, 'total_calls': 804 }
Hierarchical Assertions (for data-global questions)¶
For data-global questions, we use hierarchical assertions. These have a global assertion with supporting (local) assertions, providing deeper insight into answer quality:
- Global assertion pass/fail: Whether the main assertion is satisfied
- Support coverage: What fraction of supporting assertions are satisfied
- Discovery detection: Whether the answer contains relevant information beyond what's covered by the supporting assertions
This is particularly useful for global questions where answers may partially satisfy complex requirements.
# Configure the LLM judge for hierarchical assertion scoring.
llm_config = LLMConfig(
    model="gpt-5.2",
    llm_provider=LLMProvider.OpenAIChat,
    api_key=SecretStr(os.environ["OPENAI_API_KEY"]),
    concurrent_requests=100,  # assertion checks are small, independent calls
    call_args={"temperature": 0.0, "seed": 42},  # deterministic judging
)
llm_client = ModelFactory.create_chat_model(llm_config)

# Hierarchical assertion scoring applies to data-global questions only. The
# assertions file contains entries with a "supporting_assertions" field.
hierarchical_assertions_file = "data_global_assertions.json"
generated_rags = ["vector_rag_short_context", "vector_rag_long_context"]
pass_threshold = 0.5
trials = 2  # number of trials for each assertion

# Evaluation mode:
# - JOINT: one LLM call evaluates the global and supporting assertions
#   together. Cheaper, but may be less accurate than STAGED.
# - STAGED: evaluate global assertions first, then supporting assertions only
#   for the globals that passed; keeps the global pass rate aligned with
#   standard scoring.
hierarchical_mode = HierarchicalMode.STAGED

input_dir = Path("./example_answers")
output_dir = Path("./output/hierarchical_assertion_scores")
output_dir.mkdir(parents=True, exist_ok=True)

# Load and flatten the hierarchical assertions for data-global questions.
assertions = load_and_normalize_hierarchical_assertions(
    input_dir / hierarchical_assertions_file,
)
rich_print(
    f"Loaded {len(assertions)} hierarchical assertions for data-global questions"
)
Loaded 856 hierarchical assertions for data-global questions
# Run hierarchical assertion evaluation for all RAG methods.
# The pipeline handles scoring, aggregation, cross-method comparison, and
# significance tests, and returns the comparison table.
comparison_df = run_hierarchical_assertion_evaluation(
    llm_client=llm_client,
    llm_config=llm_config,
    generated_rags=generated_rags,
    assertions=assertions,
    input_dir=str(input_dir),
    output_dir=output_dir,
    trials=trials,
    pass_threshold=pass_threshold,
    mode=hierarchical_mode,  # JOINT or STAGED evaluation
    # Only data-global answers are evaluated hierarchically.
    answers_path_template="{input_dir}/{generated_rag}/data_global_answers.json",
    run_significance_test=True,
    significance_alpha=0.05,
    significance_correction="holm",
    # Optional: run assertion-level clustered permutation tests
    # as a secondary analysis that accounts for within-question correlation
    run_clustered_permutation=True,
    n_permutations=10_000,
    permutation_seed=42,
    question_id_key="question_id",
    question_text_key="question",  # Column name in the answer files
    answer_text_key="answer",
    # Field in the assertions file that holds the local (supporting) assertions.
    supporting_assertions_key="supporting_assertions",
)
Processing vector_rag_short_context
Step 1: Standard global assertion scoring...
Output()
Global assertions: 401/856 passed (46.8%)
Global pass rate (per-question avg): 47.8%
Step 2: Supporting + discovery for 401 passed assertions...
Output()
# The pipeline already printed and saved a significance summary table; reload
# the CSV it wrote for any further analysis.
significance_summary_path = output_dir / "significance_summary.csv"
sig_summary = pd.read_csv(significance_summary_path)
print_df(sig_summary, "Significance Test Summary")
notebook controller is DISPOSED. View Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details.
# Alternatively, you can run significance tests separately and build the summary
# yourself using summarize_significance_results(). This is useful when you have
# pre-computed aggregated scores and want to re-run tests with different parameters.
# See the compare_hierarchical_assertion_scores_significance and
# summarize_significance_results functions in benchmark_qed.autoe.assertion.
notebook controller is DISPOSED. View Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details.