agentchat.contrib.vectordb.mongodb
with_id_rename
def with_id_rename(docs: Iterable) -> List[Dict[str, Any]]
Utility that renames the _id field of documents from a Collection to id, as used by Document.
MongoDBAtlasVectorDB
class MongoDBAtlasVectorDB(VectorDB)
A Collection object for MongoDB.
__init__
def __init__(connection_string: str = "",
database_name: str = "vector_db",
embedding_function: Callable = SentenceTransformer(
"all-MiniLM-L6-v2").encode,
collection_name: str = None,
index_name: str = "vector_index",
overwrite: bool = False,
wait_until_index_ready: float = None,
wait_until_document_ready: float = None)
Initialize the vector database.
Arguments:
connection_string
- str | The MongoDB connection string to connect to. Default is ''.
database_name
- str | The name of the database. Default is 'vector_db'.
embedding_function
- Callable | The embedding function used to generate the vector representation.
collection_name
- str | The name of the collection to create for this vector database. Defaults to None.
index_name
- str | Index name for the vector database. Defaults to 'vector_index'.
overwrite
- bool | Whether to overwrite the collection if it exists. Default is False.
wait_until_index_ready
- float | None | Blocking call to wait until the database indexes are ready. None, the default, means no wait.
wait_until_document_ready
- float | None | Blocking call to wait until the documents are ready. None, the default, means no wait.
list_collections
def list_collections()
List the collections in the vector database.
Returns:
List[str] | The list of collections.
create_collection
def create_collection(collection_name: str,
overwrite: bool = False,
get_or_create: bool = True) -> Collection
Create a collection in the vector database and create a vector search index in the collection.
Arguments:
collection_name
- str | The name of the collection.
overwrite
- bool | Whether to overwrite the collection if it exists. Default is False.
get_or_create
- bool | Whether to get or create the collection. Default is True.
create_index_if_not_exists
def create_index_if_not_exists(index_name: str = "vector_index",
collection: Collection = None) -> None
Creates a vector search index on the specified collection in MongoDB.
Arguments:
index_name
- str, optional | The name of the vector search index to create. Defaults to "vector_index".
collection
- Collection, optional | The MongoDB collection to create the index on. Defaults to None.
get_collection
def get_collection(collection_name: str = None) -> Collection
Get the collection from the vector database.
Arguments:
collection_name
- str | The name of the collection. Default is None. If None, return the current active collection.
Returns:
Collection | The collection object.
delete_collection
def delete_collection(collection_name: str) -> None
Delete the collection from the vector database.
Arguments:
collection_name
- str | The name of the collection.
create_vector_search_index
def create_vector_search_index(
collection: Collection,
index_name: Union[str, None] = "vector_index",
similarity: Literal["euclidean", "cosine",
"dotProduct"] = "cosine") -> None
Create a vector search index in the collection.
Arguments:
collection
- An existing Collection in the Atlas Database.
index_name
- Vector Search Index name.
similarity
- Algorithm used for measuring vector similarity.
kwargs
- Additional keyword arguments.
Returns:
None
insert_docs
def insert_docs(docs: List[Document],
collection_name: str = None,
upsert: bool = False,
batch_size=DEFAULT_INSERT_BATCH_SIZE,
**kwargs) -> None
Insert Documents and Vector Embeddings into the collection of the vector database.
For large numbers of Documents, insertion is performed in batches.
Arguments:
docs
- List[Document] | A list of documents. Each document is a TypedDict Document.
collection_name
- str | The name of the collection. Default is None.
upsert
- bool | Whether to update the document if it exists. Default is False.
batch_size
- Number of documents to be inserted in each batch. Default is DEFAULT_INSERT_BATCH_SIZE.
update_docs
def update_docs(docs: List[Document],
collection_name: str = None,
**kwargs: Any) -> None
Update documents, including their embeddings, in the Collection.
Optionally allow upsert as kwarg.
Uses deepcopy to avoid changing docs.
Arguments:
docs
- List[Document] | A list of documents.
collection_name
- str | The name of the collection. Default is None.
kwargs
- Any | Use upsert=True to insert documents whose ids are not present in the collection.
delete_docs
def delete_docs(ids: List[ItemID], collection_name: str = None, **kwargs)
Delete documents from the collection of the vector database.
Arguments:
ids
- List[ItemID] | A list of document ids. Each id is a typed ItemID.
collection_name
- str | The name of the collection. Default is None.
get_docs_by_ids
def get_docs_by_ids(ids: List[ItemID] = None,
collection_name: str = None,
include: List[str] = None,
**kwargs) -> List[Document]
Retrieve documents from the collection of the vector database based on the ids.
Arguments:
ids
- List[ItemID] | A list of document ids. If None, will return all the documents. Default is None.
collection_name
- str | The name of the collection. Default is None.
include
- List[str] | The fields to include. If None, will include ["metadata", "content"]; ids will always be included. Basically, use include to choose whether to include embedding and metadata.
kwargs
- dict | Additional keyword arguments.
Returns:
List[Document] | The results.
retrieve_docs
def retrieve_docs(queries: List[str],
collection_name: str = None,
n_results: int = 10,
distance_threshold: float = -1,
**kwargs) -> QueryResults
Retrieve documents from the collection of the vector database based on the queries.
Arguments:
queries
- List[str] | A list of queries. Each query is a string.
collection_name
- str | The name of the collection. Default is None.
n_results
- int | The number of relevant documents to return. Default is 10.
distance_threshold
- float | The threshold for the distance score; only results with a distance smaller than it will be returned. No filtering is applied if it is < 0. Default is -1.
kwargs
- Dict | Additional keyword arguments. Ones of importance follow:
oversampling_factor
- int | This times n_results is 'ef' in the HNSW algorithm. It determines the number of nearest neighbor candidates to consider during the search phase. A higher value leads to more accuracy, but is slower. Default is 10.
Returns:
QueryResults | For each query string, a list of nearest documents and their scores.