agentchat.contrib.vectordb.mongodb
with_id_rename
def with_id_rename(docs: Iterable) -> List[Dict[str, Any]]
Utility that renames the `_id` field of each document from a Collection to `id`, as expected by Document.
MongoDBAtlasVectorDB
class MongoDBAtlasVectorDB(VectorDB)
A Collection object for MongoDB.
__init__
def __init__(connection_string: str = "",
database_name: str = "vector_db",
embedding_function: Callable = SentenceTransformer(
"all-MiniLM-L6-v2").encode,
collection_name: str = None,
index_name: str = "vector_index",
overwrite: bool = False,
wait_until_index_ready: float = None,
wait_until_document_ready: float = None)
Initialize the vector database.
Arguments:
connection_string
- str | The MongoDB connection string to connect to. Default is ''.
database_name
- str | The name of the database. Default is 'vector_db'.
embedding_function
- Callable | The embedding function used to generate the vector representation.
collection_name
- str | The name of the collection to create for this vector database. Defaults to None.
index_name
- str | Index name for the vector database. Defaults to 'vector_index'.
overwrite
- bool | Default is False.
wait_until_index_ready
- float | None | Blocking call to wait until the database indexes are ready. None, the default, means no wait.
wait_until_document_ready
- float | None | Blocking call to wait until the database indexes are ready. None, the default, means no wait.
list_collections
def list_collections()
List the collections in the vector database.
Returns:
List[str] | The list of collections.
create_collection
def create_collection(collection_name: str,
overwrite: bool = False,
get_or_create: bool = True) -> Collection
Create a collection in the vector database and create a vector search index in the collection.
Arguments:
collection_name
- str | The name of the collection.
overwrite
- bool | Whether to overwrite the collection if it exists. Default is False.
get_or_create
- bool | Whether to get or create the collection. Default is True.
create_index_if_not_exists
def create_index_if_not_exists(index_name: str = "vector_index",
collection: Collection = None) -> None
Creates a vector search index on the specified collection in MongoDB.
Arguments:
index_name
- str, optional | The name of the vector search index to create. Defaults to "vector_index".
collection
- Collection, optional | The MongoDB collection to create the index on. Defaults to None.
get_collection
def get_collection(collection_name: str = None) -> Collection
Get the collection from the vector database.
Arguments:
collection_name
- str | The name of the collection. Default is None. If None, return the current active collection.
Returns:
Collection | The collection object.
delete_collection
def delete_collection(collection_name: str) -> None
Delete the collection from the vector database.
Arguments:
collection_name
- str | The name of the collection.
create_vector_search_index
def create_vector_search_index(
collection: Collection,
index_name: Union[str, None] = "vector_index",
similarity: Literal["euclidean", "cosine",
"dotProduct"] = "cosine") -> None
Create a vector search index in the collection.
Arguments:
collection
- An existing Collection in the Atlas Database.
index_name
- Vector Search Index name.
similarity
- Algorithm used for measuring vector similarity.
kwargs
- Additional keyword arguments.
Returns:
None
insert_docs
def insert_docs(docs: List[Document],
collection_name: str = None,
upsert: bool = False,
batch_size=DEFAULT_INSERT_BATCH_SIZE,
**kwargs) -> None
Insert Documents and Vector Embeddings into the collection of the vector database.
For large numbers of Documents, insertion is performed in batches.
It is recommended that Documents not have an ID field, as the method will generate hashed IDs for them.
Arguments:
docs
- List[Document] | A list of documents. Each document is a TypedDict `Document`, which may contain an ID. Documents without IDs will have them generated.
collection_name
- str | The name of the collection. Default is None.
upsert
- bool | Whether to update the document if it exists. Default is False.
batch_size
- Number of documents to be inserted in each batch.
kwargs
- Additional keyword arguments. Use `hash_length` to set the length of the hash-generated IDs; use `overwrite_ids` to overwrite existing IDs with hashed values.
update_docs
def update_docs(docs: List[Document],
collection_name: str = None,
**kwargs: Any) -> None
Update documents, including their embeddings, in the Collection.
Optionally allow upsert as kwarg.
Uses deepcopy to avoid changing docs.
Arguments:
docs
- List[Document] | A list of documents, with ID, to ensure the correct document is updated.
collection_name
- str | The name of the collection. Default is None.
kwargs
- Any | Use `upsert=True` to insert documents whose ids are not present in the collection.
delete_docs
def delete_docs(ids: List[ItemID], collection_name: str = None, **kwargs)
Delete documents from the collection of the vector database.
Arguments:
ids
- List[ItemID] | A list of document ids. Each id is a typed `ItemID`.
collection_name
- str | The name of the collection. Default is None.
get_docs_by_ids
def get_docs_by_ids(ids: List[ItemID] = None,
collection_name: str = None,
include: List[str] = None,
**kwargs) -> List[Document]
Retrieve documents from the collection of the vector database based on the ids.
Arguments:
ids
- List[ItemID] | A list of document ids. If None, will return all the documents. Default is None.
collection_name
- str | The name of the collection. Default is None.
include
- List[str] | The fields to include. If None, will include ["metadata", "content"]; ids will always be included. Basically, use include to choose whether to include embedding and metadata.
kwargs
- dict | Additional keyword arguments.
Returns:
List[Document] | The results.
retrieve_docs
def retrieve_docs(queries: List[str],
collection_name: str = None,
n_results: int = 10,
distance_threshold: float = -1,
**kwargs) -> QueryResults
Retrieve documents from the collection of the vector database based on the queries.
Arguments:
queries
- List[str] | A list of queries. Each query is a string.
collection_name
- str | The name of the collection. Default is None.
n_results
- int | The number of relevant documents to return. Default is 10.
distance_threshold
- float | The threshold for the distance score; only results with a distance smaller than it will be returned. No filtering is applied if it is < 0. Default is -1.
kwargs
- Dict | Additional keyword arguments. Ones of importance follow:
oversampling_factor
- int | This times n_results is 'ef' in the HNSW algorithm. It determines the number of nearest neighbor candidates to consider during the search phase. A higher value leads to more accuracy, but is slower. Default is 10.
Returns:
QueryResults | For each query string, a list of nearest documents and their scores.