
autogen.retrieve_utils

num_tokens_from_text

def num_tokens_from_text(
    text: str,
    model: str = "gpt-3.5-turbo-0613",
    return_tokens_per_name_and_message: bool = False
) -> Union[int, Tuple[int, int, int]]

Return the number of tokens used by a text. If return_tokens_per_name_and_message is True, return a tuple of the token count together with the tokens counted per message and per name, as the return annotation indicates.
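
Example usage (a minimal sketch; the sample string is illustrative):

from autogen.retrieve_utils import num_tokens_from_text

# Count tokens for a single string with the default model.
n_tokens = num_tokens_from_text("What is the weather like today?")
print(n_tokens)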

num_tokens_from_messages

def num_tokens_from_messages(
    messages: dict,
    model: str = "gpt-3.5-turbo-0613"
)

Return the number of tokens used by a list of messages in the OpenAI chat format.
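
Example usage (a sketch; note that although the parameter is annotated dict, the description indicates a list of role/content message dicts):

from autogen.retrieve_utils import num_tokens_from_messages

# Messages in the OpenAI chat format, as described above.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Summarize the attached report."},
]
print(num_tokens_from_messages(messages))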

split_text_to_chunks

def split_text_to_chunks(
    text: str,
    max_tokens: int = 4000,
    chunk_mode: str = "multi_lines",
    must_break_at_empty_line: bool = True,
    overlap: int = 10
)

Split a long text into chunks of at most max_tokens tokens each. chunk_mode can be "multi_lines" or "one_line"; must_break_at_empty_line only takes effect in "multi_lines" mode.
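
Example usage (a sketch with artificial input text):

from autogen.retrieve_utils import split_text_to_chunks

# Build a long text with empty lines between paragraphs, so the
# default must_break_at_empty_line=True has break points to use.
long_text = "\n\n".join("sentence " * 200 for _ in range(5))
chunks = split_text_to_chunks(long_text, max_tokens=500)
print(len(chunks))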

split_files_to_chunks

def split_files_to_chunks(
    files: list,
    max_tokens: int = 4000,
    chunk_mode: str = "multi_lines",
    must_break_at_empty_line: bool = True
)

Split a list of files into chunks of at most max_tokens tokens each.
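
Example usage (a sketch; the file paths are hypothetical and should point to real documents):

from autogen.retrieve_utils import split_files_to_chunks

# Chunk two local documents; paths are illustrative.
chunks = split_files_to_chunks(["./docs/intro.md", "./docs/faq.txt"], max_tokens=1000)
print(len(chunks))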

get_files_from_dir

def get_files_from_dir(
    dir_path: str,
    types: list = TEXT_FORMATS,
    recursive: bool = True
)

Return a list of all the files in a given directory whose extensions match types, descending into subdirectories when recursive is True.
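
Example usage (a sketch; the directory and extension list are illustrative):

from autogen.retrieve_utils import get_files_from_dir

# Collect markdown and text files under ./docs, including subdirectories.
files = get_files_from_dir("./docs", types=["md", "txt"], recursive=True)
print(files)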

get_file_from_url

def get_file_from_url(url: str, save_path: str = None)

Download a file from a URL and save it to save_path.
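
Example usage (a sketch; the URL and save path are illustrative):

from autogen.retrieve_utils import get_file_from_url

# Fetch a remote file and store it locally.
get_file_from_url(
    "https://raw.githubusercontent.com/microsoft/autogen/main/README.md",
    save_path="/tmp/README.md",
)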

is_url

def is_url(string: str)

Return True if the string is a valid URL.
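
Example usage:

from autogen.retrieve_utils import is_url

print(is_url("https://example.com"))  # True
print(is_url("not a url"))            # False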

create_vector_db_from_dir

def create_vector_db_from_dir(
    dir_path: str,
    max_tokens: int = 4000,
    client: API = None,
    db_path: str = "/tmp/chromadb.db",
    collection_name: str = "all-my-documents",
    get_or_create: bool = False,
    chunk_mode: str = "multi_lines",
    must_break_at_empty_line: bool = True,
    embedding_model: str = "all-MiniLM-L6-v2"
)

Create a Chroma vector database from all the files in a given directory, splitting each file into chunks and embedding them with embedding_model.
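
Example usage (a sketch; the directory and collection name are illustrative, and chromadb must be installed):

from autogen.retrieve_utils import create_vector_db_from_dir

# Index every supported file under ./docs into a Chroma collection.
create_vector_db_from_dir(
    dir_path="./docs",
    max_tokens=2000,
    db_path="/tmp/chromadb.db",
    collection_name="docs-collection",
    get_or_create=True,
)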

query_vector_db

def query_vector_db(
    query_texts: List[str],
    n_results: int = 10,
    client: API = None,
    db_path: str = "/tmp/chromadb.db",
    collection_name: str = "all-my-documents",
    search_string: str = "",
    embedding_model: str = "all-MiniLM-L6-v2"
) -> Dict[str, List[str]]

Query a vector database, returning the n_results most similar chunks for each query text.
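
Example usage (a sketch querying the collection built above; the question is illustrative, and the "documents" key assumes Chroma's standard query-result format):

from autogen.retrieve_utils import query_vector_db

# Retrieve the closest chunks for a single query string.
results = query_vector_db(
    query_texts=["How do I configure an agent?"],
    n_results=5,
    db_path="/tmp/chromadb.db",
    collection_name="docs-collection",
)
print(results["documents"])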