DRIFT Search
In [1]:
Copied!
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License.
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License.
In [2]:
Copied!
import os
from pathlib import Path
import pandas as pd
import tiktoken
from graphrag.config.enums import ModelType
from graphrag.config.models.drift_search_config import DRIFTSearchConfig
from graphrag.config.models.language_model_config import LanguageModelConfig
from graphrag.language_model.manager import ModelManager
from graphrag.query.indexer_adapters import (
read_indexer_entities,
read_indexer_relationships,
read_indexer_report_embeddings,
read_indexer_reports,
read_indexer_text_units,
)
from graphrag.query.structured_search.drift_search.drift_context import (
DRIFTSearchContextBuilder,
)
from graphrag.query.structured_search.drift_search.search import DRIFTSearch
from graphrag.vector_stores.lancedb import LanceDBVectorStore
# Location of the indexer output used by this notebook, and of the LanceDB
# database stored underneath it.
INPUT_DIR = "./inputs/operation dulce"
LANCEDB_URI = f"{INPUT_DIR}/lancedb"

# Base names (without the ".parquet" extension) of the tables under INPUT_DIR.
COMMUNITY_REPORT_TABLE = "community_reports"
COMMUNITY_TABLE = "communities"
ENTITY_TABLE = "entities"
RELATIONSHIP_TABLE = "relationships"
COVARIATE_TABLE = "covariates"
TEXT_UNIT_TABLE = "text_units"
COMMUNITY_LEVEL = 2

# Read the entities and communities tables; both are handed to
# read_indexer_entities together with COMMUNITY_LEVEL.
entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet")
community_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_TABLE}.parquet")
print(f"Entity df columns: {entity_df.columns}")
entities = read_indexer_entities(entity_df, community_df, COMMUNITY_LEVEL)

# Open the two embedding collections in the LanceDB database at LANCEDB_URI
# (a filesystem path under INPUT_DIR, not an in-memory store).
# NOTE(review): the original comment says a remote db can be reached by
# specifying url and port values - confirm against the vector store docs.
description_embedding_store = LanceDBVectorStore(
    collection_name="default-entity-description",
)
description_embedding_store.connect(db_uri=LANCEDB_URI)
full_content_embedding_store = LanceDBVectorStore(
    collection_name="default-community-full_content",
)
full_content_embedding_store.connect(db_uri=LANCEDB_URI)
print(f"Entity count: {len(entity_df)}")

relationship_df = pd.read_parquet(f"{INPUT_DIR}/{RELATIONSHIP_TABLE}.parquet")
relationships = read_indexer_relationships(relationship_df)
print(f"Relationship count: {len(relationship_df)}")

text_unit_df = pd.read_parquet(f"{INPUT_DIR}/{TEXT_UNIT_TABLE}.parquet")
text_units = read_indexer_text_units(text_unit_df)
print(f"Text unit records: {len(text_unit_df)}")

# Only the final expression of a cell is rendered, so this is the single
# preview the cell shows. The bare `.head()` calls that used to sit mid-cell
# produced no output and were dropped.
text_unit_df.head()
import os
from pathlib import Path
import pandas as pd
import tiktoken
from graphrag.config.enums import ModelType
from graphrag.config.models.drift_search_config import DRIFTSearchConfig
from graphrag.config.models.language_model_config import LanguageModelConfig
from graphrag.language_model.manager import ModelManager
from graphrag.query.indexer_adapters import (
read_indexer_entities,
read_indexer_relationships,
read_indexer_report_embeddings,
read_indexer_reports,
read_indexer_text_units,
)
from graphrag.query.structured_search.drift_search.drift_context import (
DRIFTSearchContextBuilder,
)
from graphrag.query.structured_search.drift_search.search import DRIFTSearch
from graphrag.vector_stores.lancedb import LanceDBVectorStore
# Location of the indexer output used by this notebook, and of the LanceDB
# database stored underneath it.
INPUT_DIR = "./inputs/operation dulce"
LANCEDB_URI = f"{INPUT_DIR}/lancedb"

# Base names (without the ".parquet" extension) of the tables under INPUT_DIR.
COMMUNITY_REPORT_TABLE = "community_reports"
COMMUNITY_TABLE = "communities"
ENTITY_TABLE = "entities"
RELATIONSHIP_TABLE = "relationships"
COVARIATE_TABLE = "covariates"
TEXT_UNIT_TABLE = "text_units"
COMMUNITY_LEVEL = 2

# Read the entities and communities tables; both are handed to
# read_indexer_entities together with COMMUNITY_LEVEL.
entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet")
community_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_TABLE}.parquet")
print(f"Entity df columns: {entity_df.columns}")
entities = read_indexer_entities(entity_df, community_df, COMMUNITY_LEVEL)

# Open the two embedding collections in the LanceDB database at LANCEDB_URI
# (a filesystem path under INPUT_DIR, not an in-memory store).
# NOTE(review): the original comment says a remote db can be reached by
# specifying url and port values - confirm against the vector store docs.
description_embedding_store = LanceDBVectorStore(
    collection_name="default-entity-description",
)
description_embedding_store.connect(db_uri=LANCEDB_URI)
full_content_embedding_store = LanceDBVectorStore(
    collection_name="default-community-full_content",
)
full_content_embedding_store.connect(db_uri=LANCEDB_URI)
print(f"Entity count: {len(entity_df)}")

relationship_df = pd.read_parquet(f"{INPUT_DIR}/{RELATIONSHIP_TABLE}.parquet")
relationships = read_indexer_relationships(relationship_df)
print(f"Relationship count: {len(relationship_df)}")

text_unit_df = pd.read_parquet(f"{INPUT_DIR}/{TEXT_UNIT_TABLE}.parquet")
text_units = read_indexer_text_units(text_unit_df)
print(f"Text unit records: {len(text_unit_df)}")

# Only the final expression of a cell is rendered, so this is the single
# preview the cell shows. The bare `.head()` calls that used to sit mid-cell
# produced no output and were dropped.
text_unit_df.head()
Entity df columns: Index(['id', 'human_readable_id', 'title', 'type', 'description', 'text_unit_ids', 'frequency', 'degree', 'x', 'y'], dtype='object') Entity count: 18 Relationship count: 54 Text unit records: 5
Out[2]:
id | human_readable_id | text | n_tokens | document_ids | entity_ids | relationship_ids | covariate_ids | |
---|---|---|---|---|---|---|---|---|
0 | 8e938693af886bfd081acbbe8384c3671446bff84a134a... | 1 | # Operation: Dulce\n\n## Chapter 1\n\nThe thru... | 1200 | [6e81f882f89dd5596e1925dd3ae8a4f0a0edcb55b35a8... | [425a7862-0aef-4f69-a4c8-8bd42151c9d4, bcdbf1f... | [2bfad9f4-5abd-48d0-8db3-a9cad9120413, 6cbb838... | [745d28dd-be20-411b-85ff-1c69ca70e7b3, 9cba185... |
1 | fd1f46d32e1df6cd429542aeda3d64ddf3745ccb80f443... | 2 | , the hollow echo of the bay a stark reminder ... | 1200 | [6e81f882f89dd5596e1925dd3ae8a4f0a0edcb55b35a8... | [425a7862-0aef-4f69-a4c8-8bd42151c9d4, bcdbf1f... | [2bfad9f4-5abd-48d0-8db3-a9cad9120413, 6cbb838... | [4f9b461f-5e8f-465d-9586-e2fc81787062, 0f74618... |
2 | 7296d9a1f046854d59079dc183de8a054c27c4843d2979... | 3 | differently than praise from others. This was... | 1200 | [6e81f882f89dd5596e1925dd3ae8a4f0a0edcb55b35a8... | [425a7862-0aef-4f69-a4c8-8bd42151c9d4, bcdbf1f... | [2bfad9f4-5abd-48d0-8db3-a9cad9120413, 6cbb838... | [3ef1be9c-4080-4fac-99bd-c4a636248904, 8730b20... |
3 | ac72722a02ac71242a2a91fca323198d04197daf60515d... | 4 | contrast to the rigid silence enveloping the ... | 1200 | [6e81f882f89dd5596e1925dd3ae8a4f0a0edcb55b35a8... | [425a7862-0aef-4f69-a4c8-8bd42151c9d4, bcdbf1f... | [2bfad9f4-5abd-48d0-8db3-a9cad9120413, 6cbb838... | [2c292047-b79a-4958-ab57-7bf7d7a22c92, 3cbd18a... |
4 | 4c277337d461a16aaf8f9760ddb8b44ef220e948a2341d... | 5 | a mask of duty.\n\nIn the midst of the descen... | 35 | [6e81f882f89dd5596e1925dd3ae8a4f0a0edcb55b35a8... | [d084d615-3584-4ec8-9931-90aa6075c764, 4b84859... | [6efdc42e-69a2-47c0-97ec-4b296cd16d5e] | [db8da02f-f889-4bb5-8e81-ab2a72e380bb] |
In [3]:
Copied!
# Credentials and model names are taken from the environment; a missing
# variable raises KeyError right here instead of surfacing later.
api_key = os.environ["GRAPHRAG_API_KEY"]
llm_model = os.environ["GRAPHRAG_LLM_MODEL"]
embedding_model = os.environ["GRAPHRAG_EMBEDDING_MODEL"]

# Chat model used by the search.
# NOTE(review): it is registered under the name "local_search" although this
# notebook runs DRIFT search - confirm the name is intentional.
chat_config = LanguageModelConfig(
    api_key=api_key,
    type=ModelType.OpenAIChat,
    model=llm_model,
    max_retries=20,
)
chat_model = ModelManager().get_or_create_chat_model(
    name="local_search",
    model_type=ModelType.OpenAIChat,
    config=chat_config,
)

# tiktoken.encoding_for_model raises KeyError for model names it has no
# mapping for (for example custom deployment names). Fall back to a generic
# encoding so the notebook keeps working; token counts are then approximate.
try:
    token_encoder = tiktoken.encoding_for_model(llm_model)
except KeyError:
    token_encoder = tiktoken.get_encoding("cl100k_base")

# Embedding model, configured the same way as the chat model.
embedding_config = LanguageModelConfig(
    api_key=api_key,
    type=ModelType.OpenAIEmbedding,
    model=embedding_model,
    max_retries=20,
)
text_embedder = ModelManager().get_or_create_embedding_model(
    name="local_search_embedding",
    model_type=ModelType.OpenAIEmbedding,
    config=embedding_config,
)
# Credentials and model names are taken from the environment; a missing
# variable raises KeyError right here instead of surfacing later.
api_key = os.environ["GRAPHRAG_API_KEY"]
llm_model = os.environ["GRAPHRAG_LLM_MODEL"]
embedding_model = os.environ["GRAPHRAG_EMBEDDING_MODEL"]

# Chat model used by the search.
# NOTE(review): it is registered under the name "local_search" although this
# notebook runs DRIFT search - confirm the name is intentional.
chat_config = LanguageModelConfig(
    api_key=api_key,
    type=ModelType.OpenAIChat,
    model=llm_model,
    max_retries=20,
)
chat_model = ModelManager().get_or_create_chat_model(
    name="local_search",
    model_type=ModelType.OpenAIChat,
    config=chat_config,
)

# tiktoken.encoding_for_model raises KeyError for model names it has no
# mapping for (for example custom deployment names). Fall back to a generic
# encoding so the notebook keeps working; token counts are then approximate.
try:
    token_encoder = tiktoken.encoding_for_model(llm_model)
except KeyError:
    token_encoder = tiktoken.get_encoding("cl100k_base")

# Embedding model, configured the same way as the chat model.
embedding_config = LanguageModelConfig(
    api_key=api_key,
    type=ModelType.OpenAIEmbedding,
    model=embedding_model,
    max_retries=20,
)
text_embedder = ModelManager().get_or_create_embedding_model(
    name="local_search_embedding",
    model_type=ModelType.OpenAIEmbedding,
    config=embedding_config,
)
In [4]:
Copied!
def read_community_reports(
    input_dir: str,
    community_report_table: str = COMMUNITY_REPORT_TABLE,
) -> pd.DataFrame:
    """Load the community reports table from a parquet file.

    Args:
        input_dir: Directory that contains the parquet file.
        community_report_table: File name without the ``.parquet`` extension.

    Returns:
        The DataFrame read from ``<input_dir>/<community_report_table>.parquet``.
        Nothing is embedded and nothing is written back to disk.
    """
    input_path = Path(input_dir) / f"{community_report_table}.parquet"
    return pd.read_parquet(input_path)
# Load the community reports table that lives under INPUT_DIR.
report_df = read_community_reports(input_dir=INPUT_DIR)

# Turn the raw table into report objects for COMMUNITY_LEVEL, naming the
# column that is expected to hold the full-content embeddings.
reports = read_indexer_reports(
    report_df, community_df, COMMUNITY_LEVEL,
    content_embedding_col="full_content_embeddings",
)

# Pair the reports with the full-content embedding collection opened earlier.
# NOTE(review): presumably this fills in embeddings on `reports` in place -
# confirm against the indexer adapter documentation.
read_indexer_report_embeddings(reports, full_content_embedding_store)
def read_community_reports(
    input_dir: str,
    community_report_table: str = COMMUNITY_REPORT_TABLE,
) -> pd.DataFrame:
    """Load the community reports table from a parquet file.

    Args:
        input_dir: Directory that contains the parquet file.
        community_report_table: File name without the ``.parquet`` extension.

    Returns:
        The DataFrame read from ``<input_dir>/<community_report_table>.parquet``.
        Nothing is embedded and nothing is written back to disk.
    """
    input_path = Path(input_dir) / f"{community_report_table}.parquet"
    return pd.read_parquet(input_path)
# Load the community reports table that lives under INPUT_DIR.
report_df = read_community_reports(input_dir=INPUT_DIR)

# Turn the raw table into report objects for COMMUNITY_LEVEL, naming the
# column that is expected to hold the full-content embeddings.
reports = read_indexer_reports(
    report_df, community_df, COMMUNITY_LEVEL,
    content_embedding_col="full_content_embeddings",
)

# Pair the reports with the full-content embedding collection opened earlier.
# NOTE(review): presumably this fills in embeddings on `reports` in place -
# confirm against the indexer adapter documentation.
read_indexer_report_embeddings(reports, full_content_embedding_store)
In [5]:
Copied!
# Settings for the DRIFT run.
# NOTE(review): the recorded run shows one single-step progress bar followed
# by three bars of three steps each, which presumably corresponds to
# primer_folds=1, n_depth=3 and drift_k_followups=3 - confirm against the
# DRIFTSearchConfig documentation.
drift_params = DRIFTSearchConfig(
    temperature=0, max_tokens=12_000,
    primer_folds=1, drift_k_followups=3, n_depth=3,
    n=1,
)

# The context builder receives the models, the indexed data loaded earlier,
# the entity-description embedding store and the settings above.
context_builder = DRIFTSearchContextBuilder(
    config=drift_params,
    model=chat_model,
    text_embedder=text_embedder,
    token_encoder=token_encoder,
    entities=entities,
    relationships=relationships,
    reports=reports,
    text_units=text_units,
    entity_text_embeddings=description_embedding_store,
)

# Search engine used by the query cells that follow.
search = DRIFTSearch(
    model=chat_model,
    context_builder=context_builder,
    token_encoder=token_encoder,
)
# Settings for the DRIFT run.
# NOTE(review): the recorded run shows one single-step progress bar followed
# by three bars of three steps each, which presumably corresponds to
# primer_folds=1, n_depth=3 and drift_k_followups=3 - confirm against the
# DRIFTSearchConfig documentation.
drift_params = DRIFTSearchConfig(
    temperature=0, max_tokens=12_000,
    primer_folds=1, drift_k_followups=3, n_depth=3,
    n=1,
)

# The context builder receives the models, the indexed data loaded earlier,
# the entity-description embedding store and the settings above.
context_builder = DRIFTSearchContextBuilder(
    config=drift_params,
    model=chat_model,
    text_embedder=text_embedder,
    token_encoder=token_encoder,
    entities=entities,
    relationships=relationships,
    reports=reports,
    text_units=text_units,
    entity_text_embeddings=description_embedding_store,
)

# Search engine used by the query cells that follow.
search = DRIFTSearch(
    model=chat_model,
    context_builder=context_builder,
    token_encoder=token_encoder,
)
In [6]:
Copied!
# Run the DRIFT search for one question and keep the result in `resp`.
# The cell uses top-level `await`, so it assumes a kernel that allows awaiting
# directly in a cell (as Jupyter/IPython kernels do).
resp = await search.search("Who is agent Mercer?")
# Run the DRIFT search for one question and keep the result in `resp`.
# The cell uses top-level `await`, so it assumes a kernel that allows awaiting
# directly in a cell (as Jupyter/IPython kernels do).
resp = await search.search("Who is agent Mercer?")
0%| | 0/1 [00:00<?, ?it/s]
100%|██████████| 1/1 [00:13<00:00, 13.62s/it]
0%| | 0/3 [00:00<?, ?it/s]
33%|███▎ | 1/3 [00:14<00:29, 14.73s/it]
67%|██████▋ | 2/3 [00:16<00:06, 6.85s/it]
100%|██████████| 3/3 [00:21<00:00, 6.40s/it]
0%| | 0/3 [00:00<?, ?it/s]
33%|███▎ | 1/3 [00:10<00:20, 10.17s/it]
67%|██████▋ | 2/3 [00:10<00:04, 4.43s/it]
100%|██████████| 3/3 [00:11<00:00, 2.81s/it]
0%| | 0/3 [00:00<?, ?it/s]
33%|███▎ | 1/3 [00:09<00:18, 9.43s/it]
67%|██████▋ | 2/3 [00:12<00:05, 5.40s/it]
100%|██████████| 3/3 [00:13<00:00, 3.59s/it]
In [7]:
Copied!
# Show the answer text produced by the search (rendered as the cell output).
resp.response
# Show the answer text produced by the search (rendered as the cell output).
resp.response
Out[7]:
"Agent Alex Mercer is a pivotal member of the Paranormal Military Squad, playing a crucial role in Operation: Dulce. He is known for his leadership and mentorship abilities, which are essential to the success of the mission. Mercer's role involves guiding and supporting fellow team members, particularly Sam Rivera, and fostering trust and intuition within the team [Data: Reports (1); Sources (3, 0)].\n\nMercer is also noted for his determination and ability to maintain focus under pressure, as well as his reflective nature, which allows him to consider the broader implications of their mission beyond immediate objectives [Data: Sources (3)]. His interactions with other team members, such as Taylor Cruz and Jordan Hayes, highlight his ability to balance protocol with adaptability, encouraging open communication and collaboration [Data: Reports (1); Sources (0, 3)].\n\nOverall, Mercer's skills in leadership, mentorship, focus, and reflective thinking make him an invaluable asset to the Paranormal Military Squad, particularly in the complex and high-stakes environment of Operation: Dulce [Data: Reports (1); Sources (3)]."
In [8]:
Copied!
# Print the context gathered during the search. In the recorded run this is a
# dict keyed by query text, each entry holding 'reports', 'entities' and
# 'sources' tables, so the plain-text dump is long.
print(resp.context_data)
# Print the context gathered during the search. In the recorded run this is a
# dict keyed by query text, each entry holding 'reports', 'entities' and
# 'sources' tables, so the plain-text dump is long.
print(resp.context_data)
{'What specific skills does Agent Mercer bring to the Paranormal Military Squad?': {'reports': id title \ 0 1 Paranormal Military Squad and Operation: Dulce content 0 # Paranormal Military Squad and Operation: Dul... , 'entities': Empty DataFrame Columns: [in_context] Index: [], 'sources': id text 0 3 contrast to the rigid silence enveloping the ... 1 0 # Operation: Dulce\n\n## Chapter 1\n\nThe thru... 2 1 , the hollow echo of the bay a stark reminder ... 3 2 differently than praise from others. This was... 4 4 a mask of duty.\n\nIn the midst of the descen...}, 'What are the potential implications of the discoveries at the Dulce base?': {'reports': id title \ 0 1 Paranormal Military Squad and Operation: Dulce content 0 # Paranormal Military Squad and Operation: Dul... , 'entities': Empty DataFrame Columns: [in_context] Index: [], 'sources': id text 0 2 differently than praise from others. This was... 1 3 contrast to the rigid silence enveloping the ... 2 0 # Operation: Dulce\n\n## Chapter 1\n\nThe thru... 3 4 a mask of duty.\n\nIn the midst of the descen... 4 1 , the hollow echo of the bay a stark reminder ...}, "How do Agent Mercer's relationships with other team members influence the mission?": {'reports': id title \ 0 1 Paranormal Military Squad and Operation: Dulce content 0 # Paranormal Military Squad and Operation: Dul... , 'entities': Empty DataFrame Columns: [in_context] Index: [], 'sources': id text 0 2 differently than praise from others. This was... 1 3 contrast to the rigid silence enveloping the ... 2 0 # Operation: Dulce\n\n## Chapter 1\n\nThe thru... 3 1 , the hollow echo of the bay a stark reminder ... 4 4 a mask of duty.\n\nIn the midst of the descen...}, 'What specific technologies are being explored at the Dulce base?': {'reports': id title \ 0 1 Paranormal Military Squad and Operation: Dulce content 0 # Paranormal Military Squad and Operation: Dul... 
, 'entities': Empty DataFrame Columns: [in_context] Index: [], 'sources': id text 0 2 differently than praise from others. This was... 1 3 contrast to the rigid silence enveloping the ... 2 0 # Operation: Dulce\n\n## Chapter 1\n\nThe thru... 3 4 a mask of duty.\n\nIn the midst of the descen... 4 1 , the hollow echo of the bay a stark reminder ...}, 'What role does agent Mercer play in the exploration of the Dulce base?': {'reports': id title \ 0 1 Paranormal Military Squad and Operation: Dulce content 0 # Paranormal Military Squad and Operation: Dul... , 'entities': Empty DataFrame Columns: [in_context] Index: [], 'sources': id text 0 2 differently than praise from others. This was... 1 3 contrast to the rigid silence enveloping the ... 2 0 # Operation: Dulce\n\n## Chapter 1\n\nThe thru... 3 1 , the hollow echo of the bay a stark reminder ... 4 4 a mask of duty.\n\nIn the midst of the descen...}, "How does Mercer's mentorship impact Sam Rivera's performance during the mission?": {'reports': id title \ 0 1 Paranormal Military Squad and Operation: Dulce content 0 # Paranormal Military Squad and Operation: Dul... , 'entities': Empty DataFrame Columns: [in_context] Index: [], 'sources': id text 0 3 contrast to the rigid silence enveloping the ... 1 0 # Operation: Dulce\n\n## Chapter 1\n\nThe thru... 2 1 , the hollow echo of the bay a stark reminder ... 3 2 differently than praise from others. This was... 4 4 a mask of duty.\n\nIn the midst of the descen...}, "In what ways do Mercer's interactions with Taylor Cruz affect the team's decision-making process?": {'reports': id title \ 0 1 Paranormal Military Squad and Operation: Dulce content 0 # Paranormal Military Squad and Operation: Dul... , 'entities': Empty DataFrame Columns: [in_context] Index: [], 'sources': id text 0 3 contrast to the rigid silence enveloping the ... 1 0 # Operation: Dulce\n\n## Chapter 1\n\nThe thru... 2 1 , the hollow echo of the bay a stark reminder ... 3 2 differently than praise from others. 
This was... 4 4 a mask of duty.\n\nIn the midst of the descen...}, "What role does intuition play in Mercer's decision-making process?": {'reports': id title \ 0 1 Paranormal Military Squad and Operation: Dulce content 0 # Paranormal Military Squad and Operation: Dul... , 'entities': Empty DataFrame Columns: [in_context] Index: [], 'sources': id text 0 3 contrast to the rigid silence enveloping the ... 1 0 # Operation: Dulce\n\n## Chapter 1\n\nThe thru... 2 1 , the hollow echo of the bay a stark reminder ... 3 2 differently than praise from others. This was... 4 4 a mask of duty.\n\nIn the midst of the descen...}, "In what ways does Mercer's mentorship of Sam Rivera influence the team's operations?": {'reports': id title \ 0 1 Paranormal Military Squad and Operation: Dulce content 0 # Paranormal Military Squad and Operation: Dul... , 'entities': Empty DataFrame Columns: [in_context] Index: [], 'sources': id text 0 3 contrast to the rigid silence enveloping the ... 1 0 # Operation: Dulce\n\n## Chapter 1\n\nThe thru... 2 1 , the hollow echo of the bay a stark reminder ... 3 2 differently than praise from others. This was... 4 4 a mask of duty.\n\nIn the midst of the descen...}}