# Copyright (c) 2025 Microsoft Corporation.
import sys

# make the local benchmark_qed package importable when running this
# notebook from its folder in the repository
sys.path.insert(1, "../../../")
import logging
import os

from pydantic import SecretStr

from benchmark_qed.autod.data_processor.embedding import TextEmbedder
from benchmark_qed.autod.data_processor.text_splitting import TokenTextSplitter
from benchmark_qed.autod.io.document import (
    create_documents,
    save_documents,
)
from benchmark_qed.autod.io.text_unit import create_text_units, save_text_units
from benchmark_qed.config.llm_config import LLMConfig, LLMProvider
from benchmark_qed.llm.factory import ModelFactory

logging.basicConfig(level=logging.INFO)
logging.getLogger("httpx").setLevel(logging.ERROR)
# load environment variables (e.g. OPENAI_API_KEY) from a local .env file
%load_ext dotenv
%dotenv
AutoD
AutoD provides utilities for sampling datasets to match a target specification, defined in terms of the breadth (the number of topic clusters to sample from) and depth (the number of samples per cluster) of data units (e.g., documents). For example, sampling 10 units from each of 50 clusters yields a sample of up to 500 text units. It also provides the ability to generate dataset summaries using a map-reduce approach.
Configs
INPUT_DATA_PATH = "../../datasets/AP_news/raw_data"
OUTPUT_DATA_PATH = "./output/AP_news/processed_data"
TEXT_COLUMN = "body_nitf"
METADATA_COLUMNS = ["headline", "firstcreated"]
JSON_ENCODING = "utf-8-sig"

# tokenizer used for chunking documents into text units
ENCODING_MODEL = "o200k_base"
CHUNK_SIZE = 600
CHUNK_OVERLAP = 100

# LLM/embedding settings
API_KEY = SecretStr(os.getenv("OPENAI_API_KEY", ""))
EMBEDDING_MODEL = "text-embedding-3-large"
LLM_MODEL = "gpt-4.1"
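Before making any API calls, it can help to verify that the key was actually loaded. A minimal sanity check, assuming the key lives in OPENAI_API_KEY as above:

# optional sanity check: fail fast if the API key was not loaded
assert API_KEY.get_secret_value(), (
    "OPENAI_API_KEY is not set; add it to your environment or .env file"
)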
Load documents
- Supports CSV, JSON, and TXT
documents = create_documents(
    input_path=INPUT_DATA_PATH,
    input_type="json",
    text_tag=TEXT_COLUMN,
    metadata_tags=METADATA_COLUMNS,
    encoding=JSON_ENCODING,
)
document_df = save_documents(documents, OUTPUT_DATA_PATH)
print(f"Document count: {len(document_df)}")
document_df.head()
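The call above loads JSON; for the other supported formats only input_type and the tag arguments should need to change. A hedged sketch for a CSV corpus, assuming input_type accepts "csv" analogously to "json" (the path and column names are placeholders, not part of the AP_news dataset):

# hypothetical CSV variant; path and column names are placeholders
csv_documents = create_documents(
    input_path="path/to/csv_data",
    input_type="csv",
    text_tag="text",                  # column holding the document body
    metadata_tags=["title", "date"],  # columns to carry along as metadata
)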
Create text units
Chunk documents into text units using the specified chunk size and overlap, then embed all text units.
text_splitter = TokenTextSplitter(
    encoding_name=ENCODING_MODEL,
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
text_embedder = TextEmbedder(
    ModelFactory.create_embedding_model(
        LLMConfig(
            model=EMBEDDING_MODEL,
            api_key=API_KEY,
            llm_provider=LLMProvider.OpenAIEmbedding,
        )
    )
)
text_units = await create_text_units(
    documents=documents,
    metadata_tags=METADATA_COLUMNS,
    text_splitter=text_splitter,
    text_embedder=text_embedder,
    embed_text=True,
)
text_unit_df = save_text_units(text_units, OUTPUT_DATA_PATH)
print(f"Text unit count: {len(text_unit_df)}")
text_unit_df.head()
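To confirm the splitter respected the token budget, you can re-encode the chunks and inspect their lengths. A minimal sketch, assuming each text unit exposes its chunk as .text (as in the quality check further below); counts can differ slightly from the raw budget if metadata is included in the unit text:

# optional check: re-tokenize chunks and compare against CHUNK_SIZE
import tiktoken

encoder = tiktoken.get_encoding(ENCODING_MODEL)
token_counts = [len(encoder.encode(unit.text)) for unit in text_units]
print(f"Max tokens per text unit: {max(token_counts)} (budget: {CHUNK_SIZE})")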
Sample text units using a K-means-based sampler
The sampling process consists of three steps (see the illustrative sketch below):
- Cluster the input text units into K clusters using K-means.
- Select a representative unit for each cluster.
- For each representative, select its N nearest neighbors.
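For intuition, here is a minimal sketch of the same idea using scikit-learn and raw embedding vectors. It is illustrative only, not the KmeansTextSampler implementation, which may differ in details such as representative selection and tie-breaking:

# illustrative sketch of K-means sampling; NOT the library implementation
import numpy as np
from sklearn.cluster import KMeans

def kmeans_sample_indices(
    embeddings: np.ndarray, k: int, n_per_cluster: int
) -> list[int]:
    km = KMeans(n_clusters=k, random_state=42).fit(embeddings)
    sampled: list[int] = []
    for c in range(k):
        members = np.where(km.labels_ == c)[0]
        # distance of each cluster member to its centroid
        dists = np.linalg.norm(embeddings[members] - km.cluster_centers_[c], axis=1)
        # the closest member acts as the representative; its nearest
        # neighbors (same distance ordering) fill out the cluster's sample
        sampled.extend(members[np.argsort(dists)][:n_per_cluster].tolist())
    return sampled

In practice, you only need the sampler and its enum: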
from benchmark_qed.autod.sampler.enums import ClusterRepresentativeSelectionType
from benchmark_qed.autod.sampler.sampling.kmeans_sampler import KmeansTextSampler
# target breadth (clusters) and depth (samples per cluster)
NUM_CLUSTERS = 50
NUM_SAMPLES_PER_CLUSTER = 10
sampler = KmeansTextSampler()
sampled_text_units = sampler.sample(
    text_units=text_units,
    sample_size=None,
    num_clusters=NUM_CLUSTERS,
    num_samples_per_cluster=NUM_SAMPLES_PER_CLUSTER,
    cluster_representative_selection_type=ClusterRepresentativeSelectionType.CENTROID,
)
print(f"Sampled text unit count: {len(sampled_text_units)}")
# quality check: verify the number of clusters
clusters = [
    sampled_text_units[i : i + NUM_SAMPLES_PER_CLUSTER]
    for i in range(0, len(sampled_text_units), NUM_SAMPLES_PER_CLUSTER)
]
print(f"Cluster count: {len(clusters)}")

# print the first cluster
print("First cluster:")
for i, text_unit in enumerate(clusters[0]):
    print(f"Text {i}: {text_unit.text}")
    print("----------------------")
Summarize sampled text units using map-reduce
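At a high level, the map step summarizes batches of sampled text units (bounded by max_data_tokens) independently, and the reduce step combines those partial summaries into a single response of the requested response_type.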
import tiktoken
from benchmark_qed.autod.summarization.global_summarizer import GlobalSummarizer
# adjust these parameters based on your model; some reasoning models, for
# example, do not support temperature settings
LLM_PARAMS = {"temperature": 0.0, "seed": 42}

llm = ModelFactory.create_chat_model(
    model_config=LLMConfig(
        model=LLM_MODEL,
        api_key=API_KEY,
        llm_provider=LLMProvider.OpenAIChat,
        call_args=LLM_PARAMS,
    )
)
token_encoder = tiktoken.get_encoding(ENCODING_MODEL)

summarizer = GlobalSummarizer(
    llm=llm,
    token_encoder=token_encoder,
    response_type="single paragraph",
    max_data_tokens=8000,
    map_llm_params=LLM_PARAMS,
    reduce_llm_params=LLM_PARAMS,
    concurrent_coroutines=32,
)
summary_result = await summarizer.asummarize(
    text_units=sampled_text_units,
)
print(f"Summary: {summary_result.summary}")