DRIFT Search
In [1]:
Copied!
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License.
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License.
In [2]:
Copied!
import os
from pathlib import Path
import pandas as pd
import tiktoken
from graphrag.config.enums import ModelType
from graphrag.config.models.drift_search_config import DRIFTSearchConfig
from graphrag.config.models.language_model_config import LanguageModelConfig
from graphrag.language_model.manager import ModelManager
from graphrag.query.indexer_adapters import (
read_indexer_entities,
read_indexer_relationships,
read_indexer_report_embeddings,
read_indexer_reports,
read_indexer_text_units,
)
from graphrag.query.structured_search.drift_search.drift_context import (
DRIFTSearchContextBuilder,
)
from graphrag.query.structured_search.drift_search.search import DRIFTSearch
from graphrag.vector_stores.lancedb import LanceDBVectorStore
# --- Where the indexer outputs live and what the parquet tables are called ---
INPUT_DIR = "./inputs/operation dulce"
LANCEDB_URI = f"{INPUT_DIR}/lancedb"

COMMUNITY_REPORT_TABLE = "community_reports"
COMMUNITY_TABLE = "communities"
ENTITY_TABLE = "entities"
RELATIONSHIP_TABLE = "relationships"
COVARIATE_TABLE = "covariates"
TEXT_UNIT_TABLE = "text_units"

# Community level handed to read_indexer_entities() below.
COMMUNITY_LEVEL = 2

# --- Entities (the communities table is needed to build them) ---
entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet")
community_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_TABLE}.parquet")
print(f"Entity df columns: {entity_df.columns}")
entities = read_indexer_entities(entity_df, community_df, COMMUNITY_LEVEL)

# --- Vector stores ---
# Both stores connect to the LanceDB database located at LANCEDB_URI; they
# differ only in the collection they are bound to.
description_embedding_store = LanceDBVectorStore(
    collection_name="default-entity-description"
)
description_embedding_store.connect(db_uri=LANCEDB_URI)

full_content_embedding_store = LanceDBVectorStore(
    collection_name="default-community-full_content"
)
full_content_embedding_store.connect(db_uri=LANCEDB_URI)

print(f"Entity count: {len(entity_df)}")
# NOTE: only the final expression of a cell is rendered, so this preview (and
# the relationship preview below) produces no visible output.
entity_df.head()

# --- Relationships ---
relationship_df = pd.read_parquet(f"{INPUT_DIR}/{RELATIONSHIP_TABLE}.parquet")
relationships = read_indexer_relationships(relationship_df)
print(f"Relationship count: {len(relationship_df)}")
relationship_df.head()

# --- Text units ---
text_unit_df = pd.read_parquet(f"{INPUT_DIR}/{TEXT_UNIT_TABLE}.parquet")
text_units = read_indexer_text_units(text_unit_df)
print(f"Text unit records: {len(text_unit_df)}")
# Last expression of the cell: this is the table shown as the cell output.
text_unit_df.head()
import os
from pathlib import Path
import pandas as pd
import tiktoken
from graphrag.config.enums import ModelType
from graphrag.config.models.drift_search_config import DRIFTSearchConfig
from graphrag.config.models.language_model_config import LanguageModelConfig
from graphrag.language_model.manager import ModelManager
from graphrag.query.indexer_adapters import (
read_indexer_entities,
read_indexer_relationships,
read_indexer_report_embeddings,
read_indexer_reports,
read_indexer_text_units,
)
from graphrag.query.structured_search.drift_search.drift_context import (
DRIFTSearchContextBuilder,
)
from graphrag.query.structured_search.drift_search.search import DRIFTSearch
from graphrag.vector_stores.lancedb import LanceDBVectorStore
# --- Where the indexer outputs live and what the parquet tables are called ---
INPUT_DIR = "./inputs/operation dulce"
LANCEDB_URI = f"{INPUT_DIR}/lancedb"

COMMUNITY_REPORT_TABLE = "community_reports"
COMMUNITY_TABLE = "communities"
ENTITY_TABLE = "entities"
RELATIONSHIP_TABLE = "relationships"
COVARIATE_TABLE = "covariates"
TEXT_UNIT_TABLE = "text_units"

# Community level handed to read_indexer_entities() below.
COMMUNITY_LEVEL = 2

# --- Entities (the communities table is needed to build them) ---
entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet")
community_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_TABLE}.parquet")
print(f"Entity df columns: {entity_df.columns}")
entities = read_indexer_entities(entity_df, community_df, COMMUNITY_LEVEL)

# --- Vector stores ---
# Both stores connect to the LanceDB database located at LANCEDB_URI; they
# differ only in the collection they are bound to.
description_embedding_store = LanceDBVectorStore(
    collection_name="default-entity-description"
)
description_embedding_store.connect(db_uri=LANCEDB_URI)

full_content_embedding_store = LanceDBVectorStore(
    collection_name="default-community-full_content"
)
full_content_embedding_store.connect(db_uri=LANCEDB_URI)

print(f"Entity count: {len(entity_df)}")
# NOTE: only the final expression of a cell is rendered, so this preview (and
# the relationship preview below) produces no visible output.
entity_df.head()

# --- Relationships ---
relationship_df = pd.read_parquet(f"{INPUT_DIR}/{RELATIONSHIP_TABLE}.parquet")
relationships = read_indexer_relationships(relationship_df)
print(f"Relationship count: {len(relationship_df)}")
relationship_df.head()

# --- Text units ---
text_unit_df = pd.read_parquet(f"{INPUT_DIR}/{TEXT_UNIT_TABLE}.parquet")
text_units = read_indexer_text_units(text_unit_df)
print(f"Text unit records: {len(text_unit_df)}")
# Last expression of the cell: this is the table shown as the cell output.
text_unit_df.head()
Entity df columns: Index(['id', 'human_readable_id', 'title', 'type', 'description', 'text_unit_ids', 'frequency', 'degree', 'x', 'y'], dtype='object')
Entity count: 18
Relationship count: 54
Text unit records: 5
Out[2]:
| | id | human_readable_id | text | n_tokens | document_ids | entity_ids | relationship_ids | covariate_ids |
---|---|---|---|---|---|---|---|---|
0 | 8e938693af886bfd081acbbe8384c3671446bff84a134a... | 1 | # Operation: Dulce\n\n## Chapter 1\n\nThe thru... | 1200 | [6e81f882f89dd5596e1925dd3ae8a4f0a0edcb55b35a8... | [425a7862-0aef-4f69-a4c8-8bd42151c9d4, bcdbf1f... | [2bfad9f4-5abd-48d0-8db3-a9cad9120413, 6cbb838... | [745d28dd-be20-411b-85ff-1c69ca70e7b3, 9cba185... |
1 | fd1f46d32e1df6cd429542aeda3d64ddf3745ccb80f443... | 2 | , the hollow echo of the bay a stark reminder ... | 1200 | [6e81f882f89dd5596e1925dd3ae8a4f0a0edcb55b35a8... | [425a7862-0aef-4f69-a4c8-8bd42151c9d4, bcdbf1f... | [2bfad9f4-5abd-48d0-8db3-a9cad9120413, 6cbb838... | [4f9b461f-5e8f-465d-9586-e2fc81787062, 0f74618... |
2 | 7296d9a1f046854d59079dc183de8a054c27c4843d2979... | 3 | differently than praise from others. This was... | 1200 | [6e81f882f89dd5596e1925dd3ae8a4f0a0edcb55b35a8... | [425a7862-0aef-4f69-a4c8-8bd42151c9d4, bcdbf1f... | [2bfad9f4-5abd-48d0-8db3-a9cad9120413, 6cbb838... | [3ef1be9c-4080-4fac-99bd-c4a636248904, 8730b20... |
3 | ac72722a02ac71242a2a91fca323198d04197daf60515d... | 4 | contrast to the rigid silence enveloping the ... | 1200 | [6e81f882f89dd5596e1925dd3ae8a4f0a0edcb55b35a8... | [425a7862-0aef-4f69-a4c8-8bd42151c9d4, bcdbf1f... | [2bfad9f4-5abd-48d0-8db3-a9cad9120413, 6cbb838... | [2c292047-b79a-4958-ab57-7bf7d7a22c92, 3cbd18a... |
4 | 4c277337d461a16aaf8f9760ddb8b44ef220e948a2341d... | 5 | a mask of duty.\n\nIn the midst of the descen... | 35 | [6e81f882f89dd5596e1925dd3ae8a4f0a0edcb55b35a8... | [d084d615-3584-4ec8-9931-90aa6075c764, 4b84859... | [6efdc42e-69a2-47c0-97ec-4b296cd16d5e] | [db8da02f-f889-4bb5-8e81-ab2a72e380bb] |
In [3]:
Copied!
# Credentials and model names are read from the environment so that nothing
# secret is hardcoded in the notebook; a missing variable raises KeyError here.
api_key = os.environ["GRAPHRAG_API_KEY"]
llm_model = os.environ["GRAPHRAG_LLM_MODEL"]
embedding_model = os.environ["GRAPHRAG_EMBEDDING_MODEL"]

# Chat model used by the search engine.
chat_config = LanguageModelConfig(
    api_key=api_key,
    type=ModelType.OpenAIChat,
    model=llm_model,
    max_retries=20,
)
chat_model = ModelManager().get_or_create_chat_model(
    name="local_search",
    model_type=ModelType.OpenAIChat,
    config=chat_config,
)

# tiktoken raises KeyError when it cannot map a model name to an encoding
# (e.g. a custom or deployment-specific name), so fall back to a general
# purpose encoding instead of aborting the notebook.
try:
    token_encoder = tiktoken.encoding_for_model(llm_model)
except KeyError:
    token_encoder = tiktoken.get_encoding("cl100k_base")

# Embedding model used to embed queries.
embedding_config = LanguageModelConfig(
    api_key=api_key,
    type=ModelType.OpenAIEmbedding,
    model=embedding_model,
    max_retries=20,
)
text_embedder = ModelManager().get_or_create_embedding_model(
    name="local_search_embedding",
    model_type=ModelType.OpenAIEmbedding,
    config=embedding_config,
)
# Credentials and model names are read from the environment so that nothing
# secret is hardcoded in the notebook; a missing variable raises KeyError here.
api_key = os.environ["GRAPHRAG_API_KEY"]
llm_model = os.environ["GRAPHRAG_LLM_MODEL"]
embedding_model = os.environ["GRAPHRAG_EMBEDDING_MODEL"]

# Chat model used by the search engine.
chat_config = LanguageModelConfig(
    api_key=api_key,
    type=ModelType.OpenAIChat,
    model=llm_model,
    max_retries=20,
)
chat_model = ModelManager().get_or_create_chat_model(
    name="local_search",
    model_type=ModelType.OpenAIChat,
    config=chat_config,
)

# tiktoken raises KeyError when it cannot map a model name to an encoding
# (e.g. a custom or deployment-specific name), so fall back to a general
# purpose encoding instead of aborting the notebook.
try:
    token_encoder = tiktoken.encoding_for_model(llm_model)
except KeyError:
    token_encoder = tiktoken.get_encoding("cl100k_base")

# Embedding model used to embed queries.
embedding_config = LanguageModelConfig(
    api_key=api_key,
    type=ModelType.OpenAIEmbedding,
    model=embedding_model,
    max_retries=20,
)
text_embedder = ModelManager().get_or_create_embedding_model(
    name="local_search_embedding",
    model_type=ModelType.OpenAIEmbedding,
    config=embedding_config,
)
In [4]:
Copied!
def read_community_reports(
    input_dir: str,
    community_report_table: str = COMMUNITY_REPORT_TABLE,
) -> pd.DataFrame:
    """Load the community reports table from a parquet file.

    This function only reads data; it does not compute embeddings or write
    anything back to disk.

    Args:
        input_dir: Directory that contains the parquet tables.
        community_report_table: File stem (without the ``.parquet`` suffix)
            of the community reports table.

    Returns:
        The contents of ``<input_dir>/<community_report_table>.parquet``.
    """
    input_path = Path(input_dir) / f"{community_report_table}.parquet"
    return pd.read_parquet(input_path)
# Read the community reports table from the input directory.
report_df = read_community_reports(INPUT_DIR)

# Convert the raw table into report objects for the configured community
# level; the embedding column name is passed through to the adapter.
reports = read_indexer_reports(
    report_df,
    community_df,
    COMMUNITY_LEVEL,
    content_embedding_col="full_content_embeddings",
)

# Hands the reports and the full-content vector store to the adapter --
# presumably this attaches the stored embeddings to each report (confirm
# against graphrag.query.indexer_adapters).
read_indexer_report_embeddings(reports, full_content_embedding_store)
def read_community_reports(
    input_dir: str,
    community_report_table: str = COMMUNITY_REPORT_TABLE,
) -> pd.DataFrame:
    """Load the community reports table from a parquet file.

    This function only reads data; it does not compute embeddings or write
    anything back to disk.

    Args:
        input_dir: Directory that contains the parquet tables.
        community_report_table: File stem (without the ``.parquet`` suffix)
            of the community reports table.

    Returns:
        The contents of ``<input_dir>/<community_report_table>.parquet``.
    """
    input_path = Path(input_dir) / f"{community_report_table}.parquet"
    return pd.read_parquet(input_path)
# Read the community reports table from the input directory.
report_df = read_community_reports(INPUT_DIR)

# Convert the raw table into report objects for the configured community
# level; the embedding column name is passed through to the adapter.
reports = read_indexer_reports(
    report_df,
    community_df,
    COMMUNITY_LEVEL,
    content_embedding_col="full_content_embeddings",
)

# Hands the reports and the full-content vector store to the adapter --
# presumably this attaches the stored embeddings to each report (confirm
# against graphrag.query.indexer_adapters).
read_indexer_report_embeddings(reports, full_content_embedding_store)
In [5]:
Copied!
# Tunables for the DRIFT run. The exact meaning of each field is defined by
# DRIFTSearchConfig; presumably primer_folds / drift_k_followups / n_depth
# control how many primer and follow-up queries are issued -- confirm there.
drift_params = DRIFTSearchConfig(
    temperature=0,
    max_tokens=12_000,
    primer_folds=1,
    drift_k_followups=3,
    n_depth=3,
    n=1,
)

# The context builder gets every artifact loaded earlier in the notebook:
# the models, the graph tables, the reports and the entity embedding store.
context_builder = DRIFTSearchContextBuilder(
    model=chat_model,
    text_embedder=text_embedder,
    entities=entities,
    relationships=relationships,
    reports=reports,
    entity_text_embeddings=description_embedding_store,
    text_units=text_units,
    token_encoder=token_encoder,
    config=drift_params,
)

# The search engine shares the chat model and tokenizer with the builder.
search = DRIFTSearch(
    model=chat_model,
    context_builder=context_builder,
    token_encoder=token_encoder,
)
# Tunables for the DRIFT run. The exact meaning of each field is defined by
# DRIFTSearchConfig; presumably primer_folds / drift_k_followups / n_depth
# control how many primer and follow-up queries are issued -- confirm there.
drift_params = DRIFTSearchConfig(
    temperature=0,
    max_tokens=12_000,
    primer_folds=1,
    drift_k_followups=3,
    n_depth=3,
    n=1,
)

# The context builder gets every artifact loaded earlier in the notebook:
# the models, the graph tables, the reports and the entity embedding store.
context_builder = DRIFTSearchContextBuilder(
    model=chat_model,
    text_embedder=text_embedder,
    entities=entities,
    relationships=relationships,
    reports=reports,
    entity_text_embeddings=description_embedding_store,
    text_units=text_units,
    token_encoder=token_encoder,
    config=drift_params,
)

# The search engine shares the chat model and tokenizer with the builder.
search = DRIFTSearch(
    model=chat_model,
    context_builder=context_builder,
    token_encoder=token_encoder,
)
In [6]:
Copied!
# Run the DRIFT search for a single question. The coroutine is awaited
# directly at the top level of the cell, which relies on the notebook
# kernel's support for top-level await.
resp = await search.search("Who is agent Mercer?")
# Run the DRIFT search for a single question. The coroutine is awaited
# directly at the top level of the cell, which relies on the notebook
# kernel's support for top-level await.
resp = await search.search("Who is agent Mercer?")
0%| | 0/1 [00:00<?, ?it/s]
100%|██████████| 1/1 [00:05<00:00, 5.73s/it]
0%| | 0/3 [00:00<?, ?it/s]
33%|███▎ | 1/3 [00:11<00:22, 11.27s/it]
67%|██████▋ | 2/3 [00:12<00:05, 5.55s/it]
0%| | 0/3 [00:00<?, ?it/s]
33%|███▎ | 1/3 [00:08<00:17, 8.90s/it]
67%|██████▋ | 2/3 [00:09<00:03, 3.74s/it]
100%|██████████| 3/3 [00:09<00:00, 2.19s/it]
0%| | 0/3 [00:00<?, ?it/s]
33%|███▎ | 1/3 [00:07<00:15, 7.76s/it]
67%|██████▋ | 2/3 [00:08<00:03, 3.67s/it]
100%|██████████| 3/3 [00:09<00:00, 2.61s/it]
In [7]:
Copied!
# The answer produced by the search; as the cell's last expression it is
# rendered as the cell output.
resp.response
# The answer produced by the search; as the cell's last expression it is
# rendered as the cell output.
resp.response
Out[7]:
"Agent Alex Mercer is a pivotal member of the Paranormal Military Squad, playing a significant role in Operation: Dulce. This operation involves the exploration and investigation of the Dulce base, a site rumored to house advanced alien technology. Mercer's involvement highlights their importance within the team and the broader mission [Data: Reports (1)].\n\n### Leadership and Mentorship\nMercer is noted for their leadership and mentorship qualities, providing guidance and support to other team members, particularly Sam Rivera, a cybersecurity expert. Their role is not only to explore the Dulce base but also to ensure that the team operates cohesively and effectively, which is crucial given the complex and potentially dangerous nature of the mission [Data: Reports (1)].\n\n### Relationship with Team Members\nMercer collaborates closely with other key figures, such as Agent Taylor Cruz, who is recognized for their authoritative leadership. The relationship between Mercer and Cruz is characterized by collaboration and mutual respect, balancing Mercer's intuitive leadership style with Cruz's structured approach [Data: Reports (1)].\n\n### Contributions to the Mission\nMercer's contributions are further emphasized by their interactions with other team members, such as Dr. Jordan Hayes and Sam Rivera. Hayes, a scientist with expertise in alien technology, and Rivera rely on Mercer's leadership to navigate the challenges posed by the Dulce base. Mercer's ability to mentor and guide these specialists is indicative of their integral role in the mission's success [Data: Reports (1)].\n\nOverall, Agent Mercer's involvement in Operation: Dulce is marked by their leadership, mentorship, and collaborative efforts, making them a key figure in the mission and the community's dynamics."
In [8]:
Copied!
# Dump the raw context data attached to the response. In the recorded run
# this is a dict keyed by query text, each value holding 'reports',
# 'entities' and 'sources' tables.
print(resp.context_data)
# Dump the raw context data attached to the response. In the recorded run
# this is a dict keyed by query text, each value holding 'reports',
# 'entities' and 'sources' tables.
print(resp.context_data)
{'What is the significance of the Dulce base?': {'reports': id title \ 0 1 Paranormal Military Squad and Operation: Dulce content 0 # Paranormal Military Squad and Operation: Dul... , 'entities': Empty DataFrame Columns: [in_context] Index: [], 'sources': id text 0 2 differently than praise from others. This was... 1 3 contrast to the rigid silence enveloping the ... 2 0 # Operation: Dulce\n\n## Chapter 1\n\nThe thru... 3 4 a mask of duty.\n\nIn the midst of the descen... 4 1 , the hollow echo of the bay a stark reminder ...}, 'What role does Agent Cruz play in Operation: Dulce?': {'reports': id title \ 0 1 Paranormal Military Squad and Operation: Dulce content 0 # Paranormal Military Squad and Operation: Dul... , 'entities': Empty DataFrame Columns: [in_context] Index: [], 'sources': id text 0 2 differently than praise from others. This was... 1 0 # Operation: Dulce\n\n## Chapter 1\n\nThe thru... 2 3 contrast to the rigid silence enveloping the ... 3 1 , the hollow echo of the bay a stark reminder ... 4 4 a mask of duty.\n\nIn the midst of the descen...}, "How does Agent Mercer's leadership style differ from Agent Cruz's?": {'reports': id title \ 0 1 Paranormal Military Squad and Operation: Dulce content 0 # Paranormal Military Squad and Operation: Dul... , 'entities': Empty DataFrame Columns: [in_context] Index: [], 'sources': id text 0 0 # Operation: Dulce\n\n## Chapter 1\n\nThe thru... 1 2 differently than praise from others. This was... 2 3 contrast to the rigid silence enveloping the ... 3 1 , the hollow echo of the bay a stark reminder ... 4 4 a mask of duty.\n\nIn the midst of the descen...}, 'What are the potential implications of the alien technology at the Dulce base?': {'reports': id title \ 0 1 Paranormal Military Squad and Operation: Dulce content 0 # Paranormal Military Squad and Operation: Dul... , 'entities': Empty DataFrame Columns: [in_context] Index: [], 'sources': id text 0 2 differently than praise from others. This was... 
1 3 contrast to the rigid silence enveloping the ... 2 0 # Operation: Dulce\n\n## Chapter 1\n\nThe thru... 3 4 a mask of duty.\n\nIn the midst of the descen... 4 1 , the hollow echo of the bay a stark reminder ...}, "How does Agent Cruz's leadership style affect the dynamics of the Paranormal Military Squad?": {'reports': id title \ 0 1 Paranormal Military Squad and Operation: Dulce content 0 # Paranormal Military Squad and Operation: Dul... , 'entities': Empty DataFrame Columns: [in_context] Index: [], 'sources': id text 0 0 # Operation: Dulce\n\n## Chapter 1\n\nThe thru... 1 2 differently than praise from others. This was... 2 3 contrast to the rigid silence enveloping the ... 3 1 , the hollow echo of the bay a stark reminder ... 4 4 a mask of duty.\n\nIn the midst of the descen...}, 'What are the potential scientific breakthroughs associated with the alien technology at the Dulce base?': {'reports': id title \ 0 1 Paranormal Military Squad and Operation: Dulce content 0 # Paranormal Military Squad and Operation: Dul... , 'entities': Empty DataFrame Columns: [in_context] Index: [], 'sources': id text 0 2 differently than praise from others. This was... 1 3 contrast to the rigid silence enveloping the ... 2 0 # Operation: Dulce\n\n## Chapter 1\n\nThe thru... 3 4 a mask of duty.\n\nIn the midst of the descen... 4 1 , the hollow echo of the bay a stark reminder ...}, 'What are the main challenges the team faces while exploring the Dulce base?': {'reports': id title \ 0 1 Paranormal Military Squad and Operation: Dulce content 0 # Paranormal Military Squad and Operation: Dul... , 'entities': Empty DataFrame Columns: [in_context] Index: [], 'sources': id text 0 2 differently than praise from others. This was... 1 1 , the hollow echo of the bay a stark reminder ... 2 3 contrast to the rigid silence enveloping the ... 3 0 # Operation: Dulce\n\n## Chapter 1\n\nThe thru... 
4 4 a mask of duty.\n\nIn the midst of the descen...}, 'How does the presence of alien technology impact the dynamics within the Paranormal Military Squad?': {'reports': id title \ 0 1 Paranormal Military Squad and Operation: Dulce content 0 # Paranormal Military Squad and Operation: Dul... , 'entities': Empty DataFrame Columns: [in_context] Index: [], 'sources': id text 0 0 # Operation: Dulce\n\n## Chapter 1\n\nThe thru... 1 3 contrast to the rigid silence enveloping the ... 2 2 differently than praise from others. This was... 3 1 , the hollow echo of the bay a stark reminder ... 4 4 a mask of duty.\n\nIn the midst of the descen...}, 'What role does adaptability play in the success of Operation: Dulce?': {'reports': id title \ 0 1 Paranormal Military Squad and Operation: Dulce content 0 # Paranormal Military Squad and Operation: Dul... , 'entities': Empty DataFrame Columns: [in_context] Index: [], 'sources': id text 0 2 differently than praise from others. This was... 1 3 contrast to the rigid silence enveloping the ... 2 0 # Operation: Dulce\n\n## Chapter 1\n\nThe thru... 3 4 a mask of duty.\n\nIn the midst of the descen... 4 1 , the hollow echo of the bay a stark reminder ...}}