DRIFT Search
In [1]:
Copied!
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License.
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License.
In [2]:
Copied!
import os
from pathlib import Path
import pandas as pd
import tiktoken
from graphrag.config.enums import ModelType
from graphrag.config.models.drift_search_config import DRIFTSearchConfig
from graphrag.config.models.language_model_config import LanguageModelConfig
from graphrag.language_model.manager import ModelManager
from graphrag.query.indexer_adapters import (
read_indexer_entities,
read_indexer_relationships,
read_indexer_report_embeddings,
read_indexer_reports,
read_indexer_text_units,
)
from graphrag.query.structured_search.drift_search.drift_context import (
DRIFTSearchContextBuilder,
)
from graphrag.query.structured_search.drift_search.search import DRIFTSearch
from graphrag.vector_stores.lancedb import LanceDBVectorStore
# --- Configuration ----------------------------------------------------------
# Folder holding the parquet tables and the LanceDB directory read below.
INPUT_DIR = "./inputs/operation dulce"
LANCEDB_URI = f"{INPUT_DIR}/lancedb"

# Base names (without the ".parquet" suffix) of the tables under INPUT_DIR.
COMMUNITY_REPORT_TABLE = "community_reports"
COMMUNITY_TABLE = "communities"
ENTITY_TABLE = "entities"
RELATIONSHIP_TABLE = "relationships"
COVARIATE_TABLE = "covariates"  # declared for completeness; not used in this cell
TEXT_UNIT_TABLE = "text_units"

# Community level passed to the indexer adapters.
COMMUNITY_LEVEL = 2

# --- Entities ---------------------------------------------------------------
# Read the entity and community tables; both are inputs to read_indexer_entities.
entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet")
community_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_TABLE}.parquet")
print(f"Entity df columns: {entity_df.columns}")
entities = read_indexer_entities(entity_df, community_df, COMMUNITY_LEVEL)

# --- Vector stores ----------------------------------------------------------
# Connect to the LanceDB database located at LANCEDB_URI, once per collection.
# NOTE(review): the original comment said a remote db can be reached by
# specifying url and port values; that path is not exercised here.
description_embedding_store = LanceDBVectorStore(
    collection_name="default-entity-description",
)
description_embedding_store.connect(db_uri=LANCEDB_URI)

full_content_embedding_store = LanceDBVectorStore(
    collection_name="default-community-full_content",
)
full_content_embedding_store.connect(db_uri=LANCEDB_URI)

print(f"Entity count: {len(entity_df)}")

# --- Relationships ----------------------------------------------------------
relationship_df = pd.read_parquet(f"{INPUT_DIR}/{RELATIONSHIP_TABLE}.parquet")
relationships = read_indexer_relationships(relationship_df)
print(f"Relationship count: {len(relationship_df)}")

# --- Text units -------------------------------------------------------------
text_unit_df = pd.read_parquet(f"{INPUT_DIR}/{TEXT_UNIT_TABLE}.parquet")
text_units = read_indexer_text_units(text_unit_df)
print(f"Text unit records: {len(text_unit_df)}")

# A notebook cell renders only its final expression, so the bare
# `entity_df.head()` / `relationship_df.head()` statements that used to sit
# mid-cell never produced output and were removed. This preview is the one
# that is actually displayed.
text_unit_df.head()
import os
from pathlib import Path
import pandas as pd
import tiktoken
from graphrag.config.enums import ModelType
from graphrag.config.models.drift_search_config import DRIFTSearchConfig
from graphrag.config.models.language_model_config import LanguageModelConfig
from graphrag.language_model.manager import ModelManager
from graphrag.query.indexer_adapters import (
read_indexer_entities,
read_indexer_relationships,
read_indexer_report_embeddings,
read_indexer_reports,
read_indexer_text_units,
)
from graphrag.query.structured_search.drift_search.drift_context import (
DRIFTSearchContextBuilder,
)
from graphrag.query.structured_search.drift_search.search import DRIFTSearch
from graphrag.vector_stores.lancedb import LanceDBVectorStore
# --- Configuration ----------------------------------------------------------
# Folder holding the parquet tables and the LanceDB directory read below.
INPUT_DIR = "./inputs/operation dulce"
LANCEDB_URI = f"{INPUT_DIR}/lancedb"

# Base names (without the ".parquet" suffix) of the tables under INPUT_DIR.
COMMUNITY_REPORT_TABLE = "community_reports"
COMMUNITY_TABLE = "communities"
ENTITY_TABLE = "entities"
RELATIONSHIP_TABLE = "relationships"
COVARIATE_TABLE = "covariates"  # declared for completeness; not used in this cell
TEXT_UNIT_TABLE = "text_units"

# Community level passed to the indexer adapters.
COMMUNITY_LEVEL = 2

# --- Entities ---------------------------------------------------------------
# Read the entity and community tables; both are inputs to read_indexer_entities.
entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet")
community_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_TABLE}.parquet")
print(f"Entity df columns: {entity_df.columns}")
entities = read_indexer_entities(entity_df, community_df, COMMUNITY_LEVEL)

# --- Vector stores ----------------------------------------------------------
# Connect to the LanceDB database located at LANCEDB_URI, once per collection.
# NOTE(review): the original comment said a remote db can be reached by
# specifying url and port values; that path is not exercised here.
description_embedding_store = LanceDBVectorStore(
    collection_name="default-entity-description",
)
description_embedding_store.connect(db_uri=LANCEDB_URI)

full_content_embedding_store = LanceDBVectorStore(
    collection_name="default-community-full_content",
)
full_content_embedding_store.connect(db_uri=LANCEDB_URI)

print(f"Entity count: {len(entity_df)}")

# --- Relationships ----------------------------------------------------------
relationship_df = pd.read_parquet(f"{INPUT_DIR}/{RELATIONSHIP_TABLE}.parquet")
relationships = read_indexer_relationships(relationship_df)
print(f"Relationship count: {len(relationship_df)}")

# --- Text units -------------------------------------------------------------
text_unit_df = pd.read_parquet(f"{INPUT_DIR}/{TEXT_UNIT_TABLE}.parquet")
text_units = read_indexer_text_units(text_unit_df)
print(f"Text unit records: {len(text_unit_df)}")

# A notebook cell renders only its final expression, so the bare
# `entity_df.head()` / `relationship_df.head()` statements that used to sit
# mid-cell never produced output and were removed. This preview is the one
# that is actually displayed.
text_unit_df.head()
Entity df columns: Index(['id', 'human_readable_id', 'title', 'type', 'description', 'text_unit_ids', 'frequency', 'degree', 'x', 'y'], dtype='object') Entity count: 18 Relationship count: 54 Text unit records: 5
Out[2]:
| | id | human_readable_id | text | n_tokens | document_ids | entity_ids | relationship_ids | covariate_ids |
---|---|---|---|---|---|---|---|---|
0 | 8e938693af886bfd081acbbe8384c3671446bff84a134a... | 1 | # Operation: Dulce\n\n## Chapter 1\n\nThe thru... | 1200 | [6e81f882f89dd5596e1925dd3ae8a4f0a0edcb55b35a8... | [425a7862-0aef-4f69-a4c8-8bd42151c9d4, bcdbf1f... | [2bfad9f4-5abd-48d0-8db3-a9cad9120413, 6cbb838... | [745d28dd-be20-411b-85ff-1c69ca70e7b3, 9cba185... |
1 | fd1f46d32e1df6cd429542aeda3d64ddf3745ccb80f443... | 2 | , the hollow echo of the bay a stark reminder ... | 1200 | [6e81f882f89dd5596e1925dd3ae8a4f0a0edcb55b35a8... | [425a7862-0aef-4f69-a4c8-8bd42151c9d4, bcdbf1f... | [2bfad9f4-5abd-48d0-8db3-a9cad9120413, 6cbb838... | [4f9b461f-5e8f-465d-9586-e2fc81787062, 0f74618... |
2 | 7296d9a1f046854d59079dc183de8a054c27c4843d2979... | 3 | differently than praise from others. This was... | 1200 | [6e81f882f89dd5596e1925dd3ae8a4f0a0edcb55b35a8... | [425a7862-0aef-4f69-a4c8-8bd42151c9d4, bcdbf1f... | [2bfad9f4-5abd-48d0-8db3-a9cad9120413, 6cbb838... | [3ef1be9c-4080-4fac-99bd-c4a636248904, 8730b20... |
3 | ac72722a02ac71242a2a91fca323198d04197daf60515d... | 4 | contrast to the rigid silence enveloping the ... | 1200 | [6e81f882f89dd5596e1925dd3ae8a4f0a0edcb55b35a8... | [425a7862-0aef-4f69-a4c8-8bd42151c9d4, bcdbf1f... | [2bfad9f4-5abd-48d0-8db3-a9cad9120413, 6cbb838... | [2c292047-b79a-4958-ab57-7bf7d7a22c92, 3cbd18a... |
4 | 4c277337d461a16aaf8f9760ddb8b44ef220e948a2341d... | 5 | a mask of duty.\n\nIn the midst of the descen... | 35 | [6e81f882f89dd5596e1925dd3ae8a4f0a0edcb55b35a8... | [d084d615-3584-4ec8-9931-90aa6075c764, 4b84859... | [6efdc42e-69a2-47c0-97ec-4b296cd16d5e] | [db8da02f-f889-4bb5-8e81-ab2a72e380bb] |
In [3]:
Copied!
# Credentials and model names are read from the environment; a missing
# variable raises KeyError right here.
api_key = os.environ["GRAPHRAG_API_KEY"]
llm_model = os.environ["GRAPHRAG_LLM_MODEL"]
embedding_model = os.environ["GRAPHRAG_EMBEDDING_MODEL"]

# Chat model used by the search engine.
chat_config = LanguageModelConfig(
    api_key=api_key,
    type=ModelType.OpenAIChat,
    model=llm_model,
    max_retries=20,
)
# NOTE(review): the registry names below say "local_search" although this
# notebook runs DRIFT search; left unchanged, confirm whether that is intended.
chat_model = ModelManager().get_or_create_chat_model(
    name="local_search",
    model_type=ModelType.OpenAIChat,
    config=chat_config,
)

# tiktoken raises KeyError for model names it cannot map to an encoding.
# Fall back to a general-purpose encoding instead of aborting the notebook;
# token counts are then approximate for that model.
try:
    token_encoder = tiktoken.encoding_for_model(llm_model)
except KeyError:
    token_encoder = tiktoken.get_encoding("cl100k_base")
    print(f"tiktoken has no encoding for {llm_model!r}; falling back to cl100k_base")

# Embedding model used to embed query text.
embedding_config = LanguageModelConfig(
    api_key=api_key,
    type=ModelType.OpenAIEmbedding,
    model=embedding_model,
    max_retries=20,
)
text_embedder = ModelManager().get_or_create_embedding_model(
    name="local_search_embedding",
    model_type=ModelType.OpenAIEmbedding,
    config=embedding_config,
)
# Credentials and model names are read from the environment; a missing
# variable raises KeyError right here.
api_key = os.environ["GRAPHRAG_API_KEY"]
llm_model = os.environ["GRAPHRAG_LLM_MODEL"]
embedding_model = os.environ["GRAPHRAG_EMBEDDING_MODEL"]

# Chat model used by the search engine.
chat_config = LanguageModelConfig(
    api_key=api_key,
    type=ModelType.OpenAIChat,
    model=llm_model,
    max_retries=20,
)
# NOTE(review): the registry names below say "local_search" although this
# notebook runs DRIFT search; left unchanged, confirm whether that is intended.
chat_model = ModelManager().get_or_create_chat_model(
    name="local_search",
    model_type=ModelType.OpenAIChat,
    config=chat_config,
)

# tiktoken raises KeyError for model names it cannot map to an encoding.
# Fall back to a general-purpose encoding instead of aborting the notebook;
# token counts are then approximate for that model.
try:
    token_encoder = tiktoken.encoding_for_model(llm_model)
except KeyError:
    token_encoder = tiktoken.get_encoding("cl100k_base")
    print(f"tiktoken has no encoding for {llm_model!r}; falling back to cl100k_base")

# Embedding model used to embed query text.
embedding_config = LanguageModelConfig(
    api_key=api_key,
    type=ModelType.OpenAIEmbedding,
    model=embedding_model,
    max_retries=20,
)
text_embedder = ModelManager().get_or_create_embedding_model(
    name="local_search_embedding",
    model_type=ModelType.OpenAIEmbedding,
    config=embedding_config,
)
In [4]:
Copied!
def read_community_reports(
    input_dir: str,
    community_report_table: str = COMMUNITY_REPORT_TABLE,
) -> pd.DataFrame:
    """Load the community report table from a parquet file.

    This function only reads; it does not compute embeddings or write
    anything back to disk.

    Args:
        input_dir: Directory that contains the parquet file.
        community_report_table: File name without the ".parquet" suffix.
            Defaults to COMMUNITY_REPORT_TABLE.

    Returns:
        The DataFrame read from ``<input_dir>/<community_report_table>.parquet``.
    """
    input_path = Path(input_dir) / f"{community_report_table}.parquet"
    return pd.read_parquet(input_path)
# Raw community report table, read from INPUT_DIR.
report_df = read_community_reports(input_dir=INPUT_DIR)

# Convert the raw table into report objects for COMMUNITY_LEVEL.
reports = read_indexer_reports(
    report_df, community_df, COMMUNITY_LEVEL, content_embedding_col="full_content_embeddings"
)

# Presumably attaches the stored full-content embeddings to `reports`
# (nothing is displayed for this cell) -- TODO confirm against the adapter.
read_indexer_report_embeddings(reports, full_content_embedding_store)
def read_community_reports(
    input_dir: str,
    community_report_table: str = COMMUNITY_REPORT_TABLE,
) -> pd.DataFrame:
    """Load the community report table from a parquet file.

    This function only reads; it does not compute embeddings or write
    anything back to disk.

    Args:
        input_dir: Directory that contains the parquet file.
        community_report_table: File name without the ".parquet" suffix.
            Defaults to COMMUNITY_REPORT_TABLE.

    Returns:
        The DataFrame read from ``<input_dir>/<community_report_table>.parquet``.
    """
    input_path = Path(input_dir) / f"{community_report_table}.parquet"
    return pd.read_parquet(input_path)
# Raw community report table, read from INPUT_DIR.
report_df = read_community_reports(input_dir=INPUT_DIR)

# Convert the raw table into report objects for COMMUNITY_LEVEL.
reports = read_indexer_reports(
    report_df, community_df, COMMUNITY_LEVEL, content_embedding_col="full_content_embeddings"
)

# Presumably attaches the stored full-content embeddings to `reports`
# (nothing is displayed for this cell) -- TODO confirm against the adapter.
read_indexer_report_embeddings(reports, full_content_embedding_store)
In [5]:
Copied!
# Tunable DRIFT settings, passed unchanged to DRIFTSearchConfig.
# NOTE(review): the recorded run shows one primer pass followed by three
# rounds of three follow-ups, which presumably corresponds to primer_folds=1,
# n_depth=3 and drift_k_followups=3 -- confirm against the config docs.
drift_params = DRIFTSearchConfig(
    temperature=0,
    max_tokens=12_000,
    primer_folds=1,
    drift_k_followups=3,
    n_depth=3,
    n=1,
)

# Context builder: bundles the models, the indexed data and the settings.
context_builder = DRIFTSearchContextBuilder(
    # models and tokenizer
    model=chat_model,
    text_embedder=text_embedder,
    token_encoder=token_encoder,
    # indexed data loaded in the earlier cells
    entities=entities,
    relationships=relationships,
    reports=reports,
    text_units=text_units,
    entity_text_embeddings=description_embedding_store,
    # search settings
    config=drift_params,
)

# Search engine used by the query cell.
search = DRIFTSearch(
    model=chat_model,
    context_builder=context_builder,
    token_encoder=token_encoder,
)
# Tunable DRIFT settings, passed unchanged to DRIFTSearchConfig.
# NOTE(review): the recorded run shows one primer pass followed by three
# rounds of three follow-ups, which presumably corresponds to primer_folds=1,
# n_depth=3 and drift_k_followups=3 -- confirm against the config docs.
drift_params = DRIFTSearchConfig(
    temperature=0,
    max_tokens=12_000,
    primer_folds=1,
    drift_k_followups=3,
    n_depth=3,
    n=1,
)

# Context builder: bundles the models, the indexed data and the settings.
context_builder = DRIFTSearchContextBuilder(
    # models and tokenizer
    model=chat_model,
    text_embedder=text_embedder,
    token_encoder=token_encoder,
    # indexed data loaded in the earlier cells
    entities=entities,
    relationships=relationships,
    reports=reports,
    text_units=text_units,
    entity_text_embeddings=description_embedding_store,
    # search settings
    config=drift_params,
)

# Search engine used by the query cell.
search = DRIFTSearch(
    model=chat_model,
    context_builder=context_builder,
    token_encoder=token_encoder,
)
In [6]:
Copied!
# Run the DRIFT query. Top-level `await` works here because the notebook
# kernel awaits it; in a plain script this call would need an event loop.
resp = await search.search("Who is agent Mercer?")
# Run the DRIFT query. Top-level `await` works here because the notebook
# kernel awaits it; in a plain script this call would need an event loop.
resp = await search.search("Who is agent Mercer?")
0%| | 0/1 [00:00<?, ?it/s]
100%|██████████| 1/1 [00:07<00:00, 7.83s/it]
0%| | 0/3 [00:00<?, ?it/s]
Reached token limit - reverting to previous context state
Reached token limit - reverting to previous context state
Reached token limit - reverting to previous context state
33%|███▎ | 1/3 [00:14<00:29, 14.92s/it]
67%|██████▋ | 2/3 [00:17<00:07, 7.46s/it]
100%|██████████| 3/3 [00:21<00:00, 6.15s/it]
0%| | 0/3 [00:00<?, ?it/s]
Reached token limit - reverting to previous context state
Reached token limit - reverting to previous context state
Reached token limit - reverting to previous context state
33%|███▎ | 1/3 [00:06<00:13, 6.57s/it]
67%|██████▋ | 2/3 [00:10<00:05, 5.20s/it]
100%|██████████| 3/3 [00:14<00:00, 4.50s/it]
0%| | 0/3 [00:00<?, ?it/s]
Reached token limit - reverting to previous context state
Reached token limit - reverting to previous context state
Reached token limit - reverting to previous context state
33%|███▎ | 1/3 [00:08<00:16, 8.11s/it]
67%|██████▋ | 2/3 [00:08<00:03, 3.77s/it]
100%|██████████| 3/3 [00:14<00:00, 4.59s/it]
In [7]:
Copied!
# Show the answer text returned by the search (rendered as the cell output).
resp.response
# Show the answer text returned by the search (rendered as the cell output).
resp.response
Out[7]:
"Agent Alex Mercer is a pivotal member of the Paranormal Military Squad, a specialized team involved in Operation: Dulce, which focuses on exploring the Dulce base rumored to house advanced alien technology. Mercer's role is characterized by his leadership, mentorship, and collaborative efforts with other key agents [Data: Reports (1); Sources (1, 3)].\n\n### Leadership and Mentorship\nMercer is noted for his leadership qualities, particularly his ability to mentor fellow agents like Sam Rivera. He emphasizes the importance of intuition and trust, fostering a collaborative environment within the team [Data: Reports (1); Sources (1, 3)]. His mentorship style combines respect for technical abilities with guidance on broader strategic thinking [Data: Sources (1, 3)].\n\n### Collaborative Efforts\nMercer works closely with other team members, such as Agent Taylor Cruz and Dr. Jordan Hayes, balancing compliance with protocol and the pursuit of deeper understanding. His interactions often involve philosophical discussions about the mission's implications, indicating a reflective and strategic mindset [Data: Reports (1); Sources (0, 3)].\n\n### Role in Operation: Dulce\nMercer's contributions are integral to the mission's success, as he helps the team navigate the complexities of the Dulce base and understand the alien technology it may contain. His leadership and the interconnectedness of the team members are crucial for achieving the mission's objectives [Data: Reports (1); Sources (1, 3)].\n\nOverall, Agent Mercer is a key figure in the Paranormal Military Squad, known for his leadership, mentorship, and strategic thinking, which are vital to the success of Operation: Dulce."
In [8]:
Copied!
# Dump the context used for the answer. In the recorded run this is a dict
# keyed by follow-up question, each value holding 'reports', 'entities' and
# 'sources' DataFrames; print() shows their truncated plain-text form.
print(resp.context_data)
# Dump the context used for the answer. In the recorded run this is a dict
# keyed by follow-up question, each value holding 'reports', 'entities' and
# 'sources' DataFrames; print() shows their truncated plain-text form.
print(resp.context_data)
{'What specific skills does Agent Mercer bring to the Paranormal Military Squad?': {'reports': id title \ 0 1 Paranormal Military Squad and Operation: Dulce content 0 # Paranormal Military Squad and Operation: Dul... , 'entities': Empty DataFrame Columns: [in_context] Index: [], 'sources': id text 0 3 contrast to the rigid silence enveloping the ... 1 0 # Operation: Dulce\n\n## Chapter 1\n\nThe thru... 2 1 , the hollow echo of the bay a stark reminder ... 3 2 differently than praise from others. This was... 4 4 a mask of duty.\n\nIn the midst of the descen...}, "How does Agent Mercer's leadership style differ from Agent Cruz's?": {'reports': id title \ 0 1 Paranormal Military Squad and Operation: Dulce content 0 # Paranormal Military Squad and Operation: Dul... , 'entities': Empty DataFrame Columns: [in_context] Index: [], 'sources': id text 0 0 # Operation: Dulce\n\n## Chapter 1\n\nThe thru... 1 2 differently than praise from others. This was... 2 3 contrast to the rigid silence enveloping the ... 3 1 , the hollow echo of the bay a stark reminder ... 4 4 a mask of duty.\n\nIn the midst of the descen...}, 'How does the relationship between Agent Mercer and Sam Rivera impact the mission?': {'reports': id title \ 0 1 Paranormal Military Squad and Operation: Dulce content 0 # Paranormal Military Squad and Operation: Dul... , 'entities': Empty DataFrame Columns: [in_context] Index: [], 'sources': id text 0 3 contrast to the rigid silence enveloping the ... 1 0 # Operation: Dulce\n\n## Chapter 1\n\nThe thru... 2 1 , the hollow echo of the bay a stark reminder ... 3 2 differently than praise from others. This was... 4 4 a mask of duty.\n\nIn the midst of the descen...}, 'What challenges does the team face during Operation: Dulce?': {'reports': id title \ 0 1 Paranormal Military Squad and Operation: Dulce content 0 # Paranormal Military Squad and Operation: Dul... 
, 'entities': Empty DataFrame Columns: [in_context] Index: [], 'sources': id text 0 2 differently than praise from others. This was... 1 3 contrast to the rigid silence enveloping the ... 2 0 # Operation: Dulce\n\n## Chapter 1\n\nThe thru... 3 4 a mask of duty.\n\nIn the midst of the descen... 4 1 , the hollow echo of the bay a stark reminder ...}, 'What specific skills does Sam Rivera bring to the mission?': {'reports': id title \ 0 1 Paranormal Military Squad and Operation: Dulce content 0 # Paranormal Military Squad and Operation: Dul... , 'entities': Empty DataFrame Columns: [in_context] Index: [], 'sources': id text 0 0 # Operation: Dulce\n\n## Chapter 1\n\nThe thru... 1 2 differently than praise from others. This was... 2 1 , the hollow echo of the bay a stark reminder ... 3 3 contrast to the rigid silence enveloping the ... 4 4 a mask of duty.\n\nIn the midst of the descen...}, "How does the team's preparation and coordination impact their mission success?": {'reports': id title \ 0 1 Paranormal Military Squad and Operation: Dulce content 0 # Paranormal Military Squad and Operation: Dul... , 'entities': Empty DataFrame Columns: [in_context] Index: [], 'sources': id text 0 1 , the hollow echo of the bay a stark reminder ... 1 3 contrast to the rigid silence enveloping the ... 2 0 # Operation: Dulce\n\n## Chapter 1\n\nThe thru... 3 2 differently than praise from others. This was... 4 4 a mask of duty.\n\nIn the midst of the descen...}, "What specific instances demonstrate Mercer's mentorship of Sam Rivera?": {'reports': id title \ 0 1 Paranormal Military Squad and Operation: Dulce content 0 # Paranormal Military Squad and Operation: Dul... , 'entities': Empty DataFrame Columns: [in_context] Index: [], 'sources': id text 0 3 contrast to the rigid silence enveloping the ... 1 0 # Operation: Dulce\n\n## Chapter 1\n\nThe thru... 2 1 , the hollow echo of the bay a stark reminder ... 3 2 differently than praise from others. This was... 
4 4 a mask of duty.\n\nIn the midst of the descen...}, 'What are the implications of the alien technology found at the Dulce base?': {'reports': id title \ 0 1 Paranormal Military Squad and Operation: Dulce content 0 # Paranormal Military Squad and Operation: Dul... , 'entities': Empty DataFrame Columns: [in_context] Index: [], 'sources': id text 0 2 differently than praise from others. This was... 1 3 contrast to the rigid silence enveloping the ... 2 0 # Operation: Dulce\n\n## Chapter 1\n\nThe thru... 3 4 a mask of duty.\n\nIn the midst of the descen... 4 1 , the hollow echo of the bay a stark reminder ...}, "How did the team's preparation differ from other missions?": {'reports': id title \ 0 1 Paranormal Military Squad and Operation: Dulce content 0 # Paranormal Military Squad and Operation: Dul... , 'entities': Empty DataFrame Columns: [in_context] Index: [], 'sources': id text 0 1 , the hollow echo of the bay a stark reminder ... 1 2 differently than praise from others. This was... 2 0 # Operation: Dulce\n\n## Chapter 1\n\nThe thru... 3 3 contrast to the rigid silence enveloping the ... 4 4 a mask of duty.\n\nIn the midst of the descen...}}