# Copyright (c) 2025 Microsoft Corporation.
import sys

# make the local benchmark_qed package importable when running this
# notebook from its folder in the repository
sys.path.insert(1, "../../../")
import logging
import os

from pydantic import SecretStr

from benchmark_qed.autod.data_processor.embedding import TextEmbedder
from benchmark_qed.autod.data_processor.text_splitting import TokenTextSplitter
from benchmark_qed.autod.io.document import (
    create_documents,
    save_documents,
)
from benchmark_qed.autod.io.text_unit import create_text_units, save_text_units
from benchmark_qed.config.llm_config import LLMConfig, LLMProvider
from benchmark_qed.llm.factory import ModelFactory

logging.basicConfig(level=logging.INFO)
logging.getLogger("httpx").setLevel(logging.ERROR)
# load environment variables (e.g. OPENAI_API_KEY) from a local .env file
%load_ext dotenv
%dotenv
AutoD
AutoD provides utilities for sampling datasets to match a target specification, defined in terms of the breadth (the number of topic clusters to sample from) and depth (the number of samples per cluster) of data units (e.g., documents). For example, sampling 10 units from each of 50 clusters yields a sample of up to 500 text units. It also provides the ability to generate dataset summaries using a map-reduce approach.
Configs
INPUT_DATA_PATH = "../../datasets/AP_news/raw_data"
OUTPUT_DATA_PATH = "./output/AP_news/processed_data"
TEXT_COLUMN = "body_nitf"
METADATA_COLUMNS = ["headline", "firstcreated"]
JSON_ENCODING = "utf-8-sig"

# tokenizer used for chunking documents into text units
ENCODING_MODEL = "o200k_base"
CHUNK_SIZE = 600
CHUNK_OVERLAP = 100

# LLM/embedding settings
API_KEY = SecretStr(os.getenv("OPENAI_API_KEY", ""))
EMBEDDING_MODEL = "text-embedding-3-large"
LLM_MODEL = "gpt-4.1"
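Before making any API calls, it can help to verify that the key was actually loaded. A minimal sanity check, assuming the key lives in OPENAI_API_KEY as above:

# optional sanity check: fail fast if the API key was not loaded
assert API_KEY.get_secret_value(), (
    "OPENAI_API_KEY is not set; add it to your environment or .env file"
)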
Load documents
- Supports CSV, JSON, and TXT
documents = create_documents(
    input_path=INPUT_DATA_PATH,
    input_type="json",
    text_tag=TEXT_COLUMN,
    metadata_tags=METADATA_COLUMNS,
    encoding=JSON_ENCODING,
)
document_df = save_documents(documents, OUTPUT_DATA_PATH)
print(f"Document count: {len(document_df)}")
document_df.head()
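The call above loads JSON; for the other supported formats only input_type and the tag arguments should need to change. A hedged sketch for a CSV corpus, assuming input_type accepts "csv" analogously to "json" (the path and column names are placeholders, not part of the AP_news dataset):

# hypothetical CSV variant; path and column names are placeholders
csv_documents = create_documents(
    input_path="path/to/csv_data",
    input_type="csv",
    text_tag="text",                  # column holding the document body
    metadata_tags=["title", "date"],  # columns to carry along as metadata
)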
Create text units
Chunk documents into text units using the specified chunk size and overlap, then embed all text units.
text_splitter = TokenTextSplitter(
    encoding_name=ENCODING_MODEL,
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
text_embedder = TextEmbedder(
    ModelFactory.create_embedding_model(
        LLMConfig(
            model=EMBEDDING_MODEL,
            api_key=API_KEY,
            llm_provider=LLMProvider.OpenAIEmbedding,
        )
    )
)
text_units = await create_text_units(
    documents=documents,
    metadata_tags=METADATA_COLUMNS,
    text_splitter=text_splitter,
    text_embedder=text_embedder,
    embed_text=True,
)
text_unit_df = save_text_units(text_units, OUTPUT_DATA_PATH)
print(f"Text unit count: {len(text_unit_df)}")
text_unit_df.head()
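To confirm the splitter respected the token budget, you can re-encode the chunks and inspect their lengths. A minimal sketch, assuming each text unit exposes its chunk as .text (as in the quality check further below); counts can differ slightly from the raw budget if metadata is included in the unit text:

# optional check: re-tokenize chunks and compare against CHUNK_SIZE
import tiktoken

encoder = tiktoken.get_encoding(ENCODING_MODEL)
token_counts = [len(encoder.encode(unit.text)) for unit in text_units]
print(f"Max tokens per text unit: {max(token_counts)} (budget: {CHUNK_SIZE})")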
Sample text units using a K-means-based sampler
The sampling process consists of three steps (see the illustrative sketch below):
- Cluster the input text units into K clusters using K-means.
- Select a representative unit for each cluster.
- For each representative, select its N nearest neighbors.
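For intuition, here is a minimal sketch of the same idea using scikit-learn and raw embedding vectors. It is illustrative only, not the KmeansTextSampler implementation, which may differ in details such as representative selection and tie-breaking:

# illustrative sketch of K-means sampling; NOT the library implementation
import numpy as np
from sklearn.cluster import KMeans

def kmeans_sample_indices(
    embeddings: np.ndarray, k: int, n_per_cluster: int
) -> list[int]:
    km = KMeans(n_clusters=k, random_state=42).fit(embeddings)
    sampled: list[int] = []
    for c in range(k):
        members = np.where(km.labels_ == c)[0]
        # distance of each cluster member to its centroid
        dists = np.linalg.norm(embeddings[members] - km.cluster_centers_[c], axis=1)
        # the closest member acts as the representative; its nearest
        # neighbors (same distance ordering) fill out the cluster's sample
        sampled.extend(members[np.argsort(dists)][:n_per_cluster].tolist())
    return sampled

In practice, you only need the sampler and its enum: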
from benchmark_qed.autod.sampler.enums import ClusterRepresentativeSelectionType
from benchmark_qed.autod.sampler.sampling.kmeans_sampler import KmeansTextSampler
# target breadth (clusters) and depth (samples per cluster)
NUM_CLUSTERS = 50
NUM_SAMPLES_PER_CLUSTER = 10
sampler = KmeansTextSampler()
sampled_text_units = sampler.sample(
    text_units=text_units,
    sample_size=None,
    num_clusters=NUM_CLUSTERS,
    num_samples_per_cluster=NUM_SAMPLES_PER_CLUSTER,
    cluster_representative_selection_type=ClusterRepresentativeSelectionType.CENTROID,
)
print(f"Sampled text unit count: {len(sampled_text_units)}")
# quality check: verify the number of clusters
clusters = [
    sampled_text_units[i : i + NUM_SAMPLES_PER_CLUSTER]
    for i in range(0, len(sampled_text_units), NUM_SAMPLES_PER_CLUSTER)
]
print(f"Cluster count: {len(clusters)}")

# print the first cluster
print("First cluster:")
for i, text_unit in enumerate(clusters[0]):
    print(f"Text {i}: {text_unit.text}")
    print("----------------------")
Summarize sampled text units using map-reduce
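At a high level, the map step summarizes batches of sampled text units (bounded by max_data_tokens) independently, and the reduce step combines those partial summaries into a single response of the requested response_type.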
import tiktoken
from benchmark_qed.autod.summarization.global_summarizer import GlobalSummarizer
# adjust these parameters based on your model; some reasoning models, for
# example, do not support temperature settings
LLM_PARAMS = {"temperature": 0.0, "seed": 42}

llm = ModelFactory.create_chat_model(
    model_config=LLMConfig(
        model=LLM_MODEL,
        api_key=API_KEY,
        llm_provider=LLMProvider.OpenAIChat,
        call_args=LLM_PARAMS,
    )
)
token_encoder = tiktoken.get_encoding(ENCODING_MODEL)

summarizer = GlobalSummarizer(
    llm=llm,
    token_encoder=token_encoder,
    response_type="single paragraph",
    max_data_tokens=8000,
    map_llm_params=LLM_PARAMS,
    reduce_llm_params=LLM_PARAMS,
    concurrent_coroutines=32,
)
summary_result = await summarizer.asummarize(
    text_units=sampled_text_units,
)
print(f"Summary: {summary_result.summary}")