DRIFT Search
In [1]:
Copied!
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License.
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License.
In [2]:
Copied!
import os
from pathlib import Path
import pandas as pd
import tiktoken
from graphrag.config.enums import ModelType
from graphrag.config.models.drift_search_config import DRIFTSearchConfig
from graphrag.config.models.language_model_config import LanguageModelConfig
from graphrag.language_model.manager import ModelManager
from graphrag.query.indexer_adapters import (
read_indexer_entities,
read_indexer_relationships,
read_indexer_report_embeddings,
read_indexer_reports,
read_indexer_text_units,
)
from graphrag.query.structured_search.drift_search.drift_context import (
DRIFTSearchContextBuilder,
)
from graphrag.query.structured_search.drift_search.search import DRIFTSearch
from graphrag.vector_stores.lancedb import LanceDBVectorStore
# --- Configuration ----------------------------------------------------------
# Directory holding the parquet tables read below; the LanceDB database is
# expected in a "lancedb" sub-directory of it.
INPUT_DIR = "./inputs/operation dulce"
LANCEDB_URI = f"{INPUT_DIR}/lancedb"

# Base names of the parquet tables under INPUT_DIR (".parquet" is appended
# when each table is read).
COMMUNITY_REPORT_TABLE = "community_reports"
COMMUNITY_TABLE = "communities"
ENTITY_TABLE = "entities"
RELATIONSHIP_TABLE = "relationships"
COVARIATE_TABLE = "covariates"
TEXT_UNIT_TABLE = "text_units"
COMMUNITY_LEVEL = 2

# --- Entities and communities -----------------------------------------------
# Read the entity and community tables, then convert them to entity objects
# at the configured community level.
entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet")
community_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_TABLE}.parquet")
print(f"Entity df columns: {entity_df.columns}")
entities = read_indexer_entities(entity_df, community_df, COMMUNITY_LEVEL)

# --- Vector stores ------------------------------------------------------------
# Both collections are opened from the on-disk LanceDB database at LANCEDB_URI.
# NOTE(review): the original comment mentioned that a remote database can be
# used by supplying url and port values — confirm against LanceDBVectorStore.
description_embedding_store = LanceDBVectorStore(
    collection_name="default-entity-description",
)
description_embedding_store.connect(db_uri=LANCEDB_URI)

full_content_embedding_store = LanceDBVectorStore(
    collection_name="default-community-full_content",
)
full_content_embedding_store.connect(db_uri=LANCEDB_URI)
print(f"Entity count: {len(entity_df)}")

# --- Relationships ------------------------------------------------------------
relationship_df = pd.read_parquet(f"{INPUT_DIR}/{RELATIONSHIP_TABLE}.parquet")
relationships = read_indexer_relationships(relationship_df)
print(f"Relationship count: {len(relationship_df)}")

# --- Text units ---------------------------------------------------------------
text_unit_df = pd.read_parquet(f"{INPUT_DIR}/{TEXT_UNIT_TABLE}.parquet")
text_units = read_indexer_text_units(text_unit_df)
print(f"Text unit records: {len(text_unit_df)}")

# Only the last expression of a cell is rendered, so this is the single
# preview shown; bare `.head()` calls earlier in the cell would be discarded.
text_unit_df.head()
Entity df columns: Index(['id', 'human_readable_id', 'title', 'type', 'description', 'text_unit_ids', 'frequency', 'degree', 'x', 'y'], dtype='object') Entity count: 18 Relationship count: 54 Text unit records: 5
Out[2]:
id | human_readable_id | text | n_tokens | document_ids | entity_ids | relationship_ids | covariate_ids | |
---|---|---|---|---|---|---|---|---|
0 | 8e938693af886bfd081acbbe8384c3671446bff84a134a... | 1 | # Operation: Dulce\n\n## Chapter 1\n\nThe thru... | 1200 | [6e81f882f89dd5596e1925dd3ae8a4f0a0edcb55b35a8... | [425a7862-0aef-4f69-a4c8-8bd42151c9d4, bcdbf1f... | [2bfad9f4-5abd-48d0-8db3-a9cad9120413, 6cbb838... | [745d28dd-be20-411b-85ff-1c69ca70e7b3, 9cba185... |
1 | fd1f46d32e1df6cd429542aeda3d64ddf3745ccb80f443... | 2 | , the hollow echo of the bay a stark reminder ... | 1200 | [6e81f882f89dd5596e1925dd3ae8a4f0a0edcb55b35a8... | [425a7862-0aef-4f69-a4c8-8bd42151c9d4, bcdbf1f... | [2bfad9f4-5abd-48d0-8db3-a9cad9120413, 6cbb838... | [4f9b461f-5e8f-465d-9586-e2fc81787062, 0f74618... |
2 | 7296d9a1f046854d59079dc183de8a054c27c4843d2979... | 3 | differently than praise from others. This was... | 1200 | [6e81f882f89dd5596e1925dd3ae8a4f0a0edcb55b35a8... | [425a7862-0aef-4f69-a4c8-8bd42151c9d4, bcdbf1f... | [2bfad9f4-5abd-48d0-8db3-a9cad9120413, 6cbb838... | [3ef1be9c-4080-4fac-99bd-c4a636248904, 8730b20... |
3 | ac72722a02ac71242a2a91fca323198d04197daf60515d... | 4 | contrast to the rigid silence enveloping the ... | 1200 | [6e81f882f89dd5596e1925dd3ae8a4f0a0edcb55b35a8... | [425a7862-0aef-4f69-a4c8-8bd42151c9d4, bcdbf1f... | [2bfad9f4-5abd-48d0-8db3-a9cad9120413, 6cbb838... | [2c292047-b79a-4958-ab57-7bf7d7a22c92, 3cbd18a... |
4 | 4c277337d461a16aaf8f9760ddb8b44ef220e948a2341d... | 5 | a mask of duty.\n\nIn the midst of the descen... | 35 | [6e81f882f89dd5596e1925dd3ae8a4f0a0edcb55b35a8... | [d084d615-3584-4ec8-9931-90aa6075c764, 4b84859... | [6efdc42e-69a2-47c0-97ec-4b296cd16d5e] | [db8da02f-f889-4bb5-8e81-ab2a72e380bb] |
In [3]:
Copied!
# Credentials and model names are taken from the environment so that nothing
# secret is stored in the notebook. A missing variable raises KeyError here.
api_key = os.environ["GRAPHRAG_API_KEY"]
llm_model = os.environ["GRAPHRAG_LLM_MODEL"]
embedding_model = os.environ["GRAPHRAG_EMBEDDING_MODEL"]

# Chat model used by the search engine and its context builder.
chat_config = LanguageModelConfig(
    api_key=api_key,
    type=ModelType.OpenAIChat,
    model=llm_model,
    max_retries=20,
)
# NOTE(review): the registration name says "local_search" although this
# notebook runs DRIFT search; it is kept unchanged because the name is the
# key passed to get_or_create_chat_model — confirm before renaming.
chat_model = ModelManager().get_or_create_chat_model(
    name="local_search",
    model_type=ModelType.OpenAIChat,
    config=chat_config,
)

# Tokenizer matching the chat model.
token_encoder = tiktoken.encoding_for_model(llm_model)

# Embedding model used to embed query text.
embedding_config = LanguageModelConfig(
    api_key=api_key,
    type=ModelType.OpenAIEmbedding,
    model=embedding_model,
    max_retries=20,
)
text_embedder = ModelManager().get_or_create_embedding_model(
    name="local_search_embedding",
    model_type=ModelType.OpenAIEmbedding,
    config=embedding_config,
)
In [4]:
Copied!
def read_community_reports(
    input_dir: str,
    community_report_table: str = COMMUNITY_REPORT_TABLE,
) -> pd.DataFrame:
    """Load the community report table from ``input_dir``.

    This function only reads; it does not compute embeddings or write
    anything back to disk.

    Args:
        input_dir: Directory that contains the parquet tables.
        community_report_table: Base name of the table to read; ``.parquet``
            is appended to form the file name.

    Returns:
        The DataFrame read from
        ``<input_dir>/<community_report_table>.parquet``.
    """
    input_path = Path(input_dir) / f"{community_report_table}.parquet"
    return pd.read_parquet(input_path)


report_df = read_community_reports(INPUT_DIR)

# Convert the raw report rows into report objects at the configured
# community level, naming the column that holds full-content embeddings.
reports = read_indexer_reports(
    report_df,
    community_df,
    COMMUNITY_LEVEL,
    content_embedding_col="full_content_embeddings",
)

# NOTE(review): the return value is not used, so this call is presumably made
# for its effect on `reports` (attaching embeddings looked up in the
# full-content vector store) — confirm against graphrag.query.indexer_adapters.
read_indexer_report_embeddings(reports, full_content_embedding_store)
In [5]:
Copied!
# DRIFT search settings.
# NOTE(review): the exact meaning of each field is defined by
# DRIFTSearchConfig, which is not visible here. The recorded run shows one
# 1-step progress bar followed by three 3-step bars, which is consistent with
# primer_folds=1, n_depth=3 and drift_k_followups=3 — confirm in the docs.
drift_params = DRIFTSearchConfig(
    temperature=0,
    max_tokens=12_000,
    primer_folds=1,
    drift_k_followups=3,
    n_depth=3,
    n=1,
)

# The context builder receives every data source loaded earlier (entities,
# relationships, community reports, text units and the entity-description
# vector store) together with the chat model, embedder and tokenizer.
context_builder = DRIFTSearchContextBuilder(
    model=chat_model,
    text_embedder=text_embedder,
    entities=entities,
    relationships=relationships,
    reports=reports,
    entity_text_embeddings=description_embedding_store,
    text_units=text_units,
    token_encoder=token_encoder,
    config=drift_params,
)

# Search engine that answers queries using the context builder above.
search = DRIFTSearch(
    model=chat_model, context_builder=context_builder, token_encoder=token_encoder
)
In [6]:
Copied!
resp = await search.search("Who is agent Mercer?")
resp = await search.search("Who is agent Mercer?")
0%| | 0/1 [00:00<?, ?it/s]
100%|██████████| 1/1 [00:10<00:00, 10.21s/it]
0%| | 0/3 [00:00<?, ?it/s]
33%|███▎ | 1/3 [00:10<00:20, 10.40s/it]
67%|██████▋ | 2/3 [00:11<00:04, 4.74s/it]
100%|██████████| 3/3 [00:12<00:00, 3.05s/it]
0%| | 0/3 [00:00<?, ?it/s]
33%|███▎ | 1/3 [00:07<00:14, 7.39s/it]
67%|██████▋ | 2/3 [00:10<00:04, 4.61s/it]
100%|██████████| 3/3 [00:12<00:00, 3.50s/it]
0%| | 0/3 [00:00<?, ?it/s]
33%|███▎ | 1/3 [00:08<00:16, 8.32s/it]
67%|██████▋ | 2/3 [00:08<00:03, 3.70s/it]
100%|██████████| 3/3 [00:09<00:00, 2.12s/it]
In [7]:
Copied!
# Display the final answer text as the cell's output.
resp.response
Out[7]:
"Agent Alex Mercer is a pivotal figure within the Paranormal Military Squad, playing a central role in Operation: Dulce. This operation involves exploring the Dulce base, rumored to house advanced alien technology. Mercer's role is multifaceted, encompassing leadership, mentorship, and active participation in the mission's objectives [Data: Reports (1)].\n\n### Leadership and Mentorship\nMercer is noted for his leadership style, which emphasizes intuition, trust, and mentorship. He provides guidance and support to team members, particularly Sam Rivera, a cybersecurity expert. Mercer's mentorship is crucial in helping Rivera excel in decrypting data and analyzing communications related to the Dulce base [Data: Reports (1); Sources (3)].\n\n### Role in Operation: Dulce\nMercer is directly involved in the exploration of the Dulce base, collaborating with other key agents like Taylor Cruz and Dr. Jordan Hayes. His leadership style contrasts with Cruz's authoritative approach, highlighting differing strategies within the team. Mercer's collaboration with Dr. Hayes emphasizes the integration of scientific expertise into the mission's strategy [Data: Reports (1); Sources (0, 1)].\n\n### Challenges\nMercer faces several challenges, including navigating team dynamics and the psychological weight of the mission. He must balance his intuitive approach with the protocol-driven style of Cruz, manage the technical and strategic aspects of the mission, and maintain team morale [Data: Reports (1); Sources (0, 3)].\n\nOverall, Agent Alex Mercer is a central figure in the success of Operation: Dulce, exemplifying the importance of cohesive teamwork and strategic planning in high-stakes operations."
In [8]:
Copied!
# Dump the context gathered for the query. In the recorded output this is a
# dict keyed by question text, each entry holding 'reports', 'entities' and
# 'sources' tables.
print(resp.context_data)
{'What is the background of Agent Alex Mercer before joining the Paranormal Military Squad?': {'reports': id title \ 0 1 Paranormal Military Squad and Operation: Dulce content 0 # Paranormal Military Squad and Operation: Dul... , 'entities': Empty DataFrame Columns: [in_context] Index: [], 'sources': id text 0 3 contrast to the rigid silence enveloping the ... 1 0 # Operation: Dulce\n\n## Chapter 1\n\nThe thru... 2 1 , the hollow echo of the bay a stark reminder ... 3 2 differently than praise from others. This was... 4 4 a mask of duty.\n\nIn the midst of the descen...}, "How does Agent Mercer's leadership style differ from that of Agent Taylor Cruz?": {'reports': id title \ 0 1 Paranormal Military Squad and Operation: Dulce content 0 # Paranormal Military Squad and Operation: Dul... , 'entities': Empty DataFrame Columns: [in_context] Index: [], 'sources': id text 0 0 # Operation: Dulce\n\n## Chapter 1\n\nThe thru... 1 2 differently than praise from others. This was... 2 3 contrast to the rigid silence enveloping the ... 3 1 , the hollow echo of the bay a stark reminder ... 4 4 a mask of duty.\n\nIn the midst of the descen...}, 'What specific challenges did Agent Mercer face during Operation: Dulce?': {'reports': id title \ 0 1 Paranormal Military Squad and Operation: Dulce content 0 # Paranormal Military Squad and Operation: Dul... , 'entities': Empty DataFrame Columns: [in_context] Index: [], 'sources': id text 0 2 differently than praise from others. This was... 1 3 contrast to the rigid silence enveloping the ... 2 0 # Operation: Dulce\n\n## Chapter 1\n\nThe thru... 3 1 , the hollow echo of the bay a stark reminder ... 4 4 a mask of duty.\n\nIn the midst of the descen...}, 'How does Agent Mercer mentor Sam Rivera in the context of the mission?': {'reports': id title \ 0 1 Paranormal Military Squad and Operation: Dulce content 0 # Paranormal Military Squad and Operation: Dul... 
, 'entities': Empty DataFrame Columns: [in_context] Index: [], 'sources': id text 0 3 contrast to the rigid silence enveloping the ... 1 0 # Operation: Dulce\n\n## Chapter 1\n\nThe thru... 2 1 , the hollow echo of the bay a stark reminder ... 3 2 differently than praise from others. This was... 4 4 a mask of duty.\n\nIn the midst of the descen...}, 'What are the implications of the alien technology discovered at the Dulce base?': {'reports': id title \ 0 1 Paranormal Military Squad and Operation: Dulce content 0 # Paranormal Military Squad and Operation: Dul... , 'entities': Empty DataFrame Columns: [in_context] Index: [], 'sources': id text 0 2 differently than praise from others. This was... 1 3 contrast to the rigid silence enveloping the ... 2 0 # Operation: Dulce\n\n## Chapter 1\n\nThe thru... 3 4 a mask of duty.\n\nIn the midst of the descen... 4 1 , the hollow echo of the bay a stark reminder ...}, 'What challenges does Mercer face in his role as a mentor?': {'reports': id title \ 0 1 Paranormal Military Squad and Operation: Dulce content 0 # Paranormal Military Squad and Operation: Dul... , 'entities': Empty DataFrame Columns: [in_context] Index: [], 'sources': id text 0 3 contrast to the rigid silence enveloping the ... 1 0 # Operation: Dulce\n\n## Chapter 1\n\nThe thru... 2 1 , the hollow echo of the bay a stark reminder ... 3 2 differently than praise from others. This was... 4 4 a mask of duty.\n\nIn the midst of the descen...}, 'What are the challenges Alex Mercer faces in Operation: Dulce?': {'reports': id title \ 0 1 Paranormal Military Squad and Operation: Dulce content 0 # Paranormal Military Squad and Operation: Dul... , 'entities': Empty DataFrame Columns: [in_context] Index: [], 'sources': id text 0 3 contrast to the rigid silence enveloping the ... 1 0 # Operation: Dulce\n\n## Chapter 1\n\nThe thru... 2 1 , the hollow echo of the bay a stark reminder ... 3 2 differently than praise from others. This was... 
4 4 a mask of duty.\n\nIn the midst of the descen...}, "How does Mercer's leadership style differ from Taylor Cruz's?": {'reports': id title \ 0 1 Paranormal Military Squad and Operation: Dulce content 0 # Paranormal Military Squad and Operation: Dul... , 'entities': Empty DataFrame Columns: [in_context] Index: [], 'sources': id text 0 0 # Operation: Dulce\n\n## Chapter 1\n\nThe thru... 1 2 differently than praise from others. This was... 2 3 contrast to the rigid silence enveloping the ... 3 1 , the hollow echo of the bay a stark reminder ... 4 4 a mask of duty.\n\nIn the midst of the descen...}, "How do other team members perceive Mercer's and Cruz's leadership styles?": {'reports': id title \ 0 1 Paranormal Military Squad and Operation: Dulce content 0 # Paranormal Military Squad and Operation: Dul... , 'entities': Empty DataFrame Columns: [in_context] Index: [], 'sources': id text 0 0 # Operation: Dulce\n\n## Chapter 1\n\nThe thru... 1 2 differently than praise from others. This was... 2 3 contrast to the rigid silence enveloping the ... 3 1 , the hollow echo of the bay a stark reminder ... 4 4 a mask of duty.\n\nIn the midst of the descen...}}