autogen.retrieve_utils

num_tokens_from_text

```python
def num_tokens_from_text(text: str, model: str = "gpt-3.5-turbo-0613", return_tokens_per_name_and_message: bool = False) -> Union[int, Tuple[int, int, int]]
```

Return the number of tokens used by a text. If `return_tokens_per_name_and_message` is True, return a tuple of `(tokens, tokens_per_message, tokens_per_name)` instead, as reflected in the return annotation.
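
A minimal usage sketch; the sample string is illustrative:

```python
from autogen.retrieve_utils import num_tokens_from_text

# Count the tokens a prompt will consume before sending it to the model.
n = num_tokens_from_text("How does chunking work?", model="gpt-3.5-turbo-0613")
print(n)
```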

num_tokens_from_messages

```python
def num_tokens_from_messages(messages: dict, model: str = "gpt-3.5-turbo-0613")
```

Return the number of tokens used by a list of messages.
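
A usage sketch; note that although the parameter is annotated `dict`, the docstring describes a list of OpenAI-style message dicts, which is what this example passes:

```python
from autogen.retrieve_utils import num_tokens_from_messages

# OpenAI-style chat messages (contents are illustrative).
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Summarize the attached notes."},
]
print(num_tokens_from_messages(messages, model="gpt-3.5-turbo-0613"))
```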

split_text_to_chunks

```python
def split_text_to_chunks(text: str, max_tokens: int = 4000, chunk_mode: str = "multi_lines", must_break_at_empty_line: bool = True, overlap: int = 10)
```

Split a long text into chunks of at most `max_tokens` tokens each.
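
A sketch of chunking a long document; the input text is synthetic:

```python
from autogen.retrieve_utils import split_text_to_chunks

# Build a long text with empty lines between paragraphs, so the default
# must_break_at_empty_line=True has natural break points.
long_text = "\n\n".join(f"Paragraph {i}: some content." for i in range(500))
chunks = split_text_to_chunks(long_text, max_tokens=1000, chunk_mode="multi_lines")
print(len(chunks), "chunks")
```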

split_files_to_chunks

```python
def split_files_to_chunks(files: list, max_tokens: int = 4000, chunk_mode: str = "multi_lines", must_break_at_empty_line: bool = True)
```

Split a list of files into chunks of at most `max_tokens` tokens each.
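
A sketch that pairs this with `get_files_from_dir`; the `./docs` directory is hypothetical, and the return value is assumed to be the list of chunks:

```python
from autogen.retrieve_utils import get_files_from_dir, split_files_to_chunks

files = get_files_from_dir("./docs")  # hypothetical source directory
chunks = split_files_to_chunks(files, max_tokens=2000)
print(f"{len(files)} files")
```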

get_files_from_dir

```python
def get_files_from_dir(dir_path: str, types: list = TEXT_FORMATS, recursive: bool = True)
```

Return a list of all the files of the given `types` in a directory, searching subdirectories when `recursive` is True.
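
For example, to collect every supported text file under a project folder (the path is illustrative):

```python
from autogen.retrieve_utils import get_files_from_dir

# Recursively collect files whose extensions are in the default TEXT_FORMATS.
files = get_files_from_dir("./my_project", recursive=True)
for f in files:
    print(f)
```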

get_file_from_url

```python
def get_file_from_url(url: str, save_path: str = None)
```

Download a file from a URL.
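
A sketch; the URL and save path are placeholders, and the file is assumed to be written to `save_path`:

```python
from autogen.retrieve_utils import get_file_from_url

# Both the URL and the save path below are placeholders.
get_file_from_url("https://example.com/whitepaper.pdf", save_path="./whitepaper.pdf")
```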

is_url

```python
def is_url(string: str)
```

Return True if the string is a valid URL.
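
For example:

```python
from autogen.retrieve_utils import is_url

print(is_url("https://example.com/docs"))  # True
print(is_url("not-a-url"))                 # False
```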

create_vector_db_from_dir

```python
def create_vector_db_from_dir(
    dir_path: str,
    max_tokens: int = 4000,
    client: API = None,
    db_path: str = "/tmp/chromadb.db",
    collection_name: str = "all-my-documents",
    get_or_create: bool = False,
    chunk_mode: str = "multi_lines",
    must_break_at_empty_line: bool = True,
    embedding_model: str = "all-MiniLM-L6-v2"
)
```

Create a vector db from all the files in a given directory.
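
A sketch of indexing a folder. It assumes that when `client` is left as None, the function creates a Chroma client at `db_path` itself; the source directory is illustrative:

```python
from autogen.retrieve_utils import create_vector_db_from_dir

create_vector_db_from_dir(
    dir_path="./docs",                   # illustrative source directory
    max_tokens=2000,
    db_path="/tmp/chromadb.db",          # where the Chroma db is persisted
    collection_name="all-my-documents",
    get_or_create=True,                  # reuse the collection if it exists
)
```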

query_vector_db

```python
def query_vector_db(
    query_texts: List[str],
    n_results: int = 10,
    client: API = None,
    db_path: str = "/tmp/chromadb.db",
    collection_name: str = "all-my-documents",
    search_string: str = "",
    embedding_model: str = "all-MiniLM-L6-v2"
) -> Dict[str, List[str]]
```

Query a vector db for the chunks most similar to each query text.
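
A sketch querying the collection built above; the query text is illustrative, and the result is printed as-is since the exact shape of the returned dict is not specified here:

```python
from autogen.retrieve_utils import query_vector_db

results = query_vector_db(
    query_texts=["How do I split long files into chunks?"],
    n_results=5,
    db_path="/tmp/chromadb.db",
    collection_name="all-my-documents",
)
print(results)
```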