# autogen.retrieve_utils
#### num_tokens_from_text

```python
def num_tokens_from_text(
    text: str,
    model: str = "gpt-3.5-turbo-0613",
    return_tokens_per_name_and_message: bool = False,
) -> Union[int, Tuple[int, int, int]]
```

Return the number of tokens used by a text. When `return_tokens_per_name_and_message` is `True`, a 3-tuple is returned instead: the token count plus the model's per-message and per-name token overheads.
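A minimal usage sketch, assuming `autogen` and its tokenizer dependency are installed; the sample text is illustrative.

```python
from autogen.retrieve_utils import num_tokens_from_text

text = "The quick brown fox jumps over the lazy dog."
print(num_tokens_from_text(text))  # plain int token count

# With the flag set, the 3-tuple from the signature above comes back
# (assumed order: token_count, tokens_per_message, tokens_per_name).
count, per_message, per_name = num_tokens_from_text(
    text, return_tokens_per_name_and_message=True
)
```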
#### num_tokens_from_messages

```python
def num_tokens_from_messages(messages: dict, model: str = "gpt-3.5-turbo-0613")
```

Return the number of tokens used by a list of messages.
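Note the type hint says `dict` while the docstring describes a list; a hedged sketch passing OpenAI-style chat messages (the messages themselves are made up):

```python
from autogen.retrieve_utils import num_tokens_from_messages

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Summarize the design doc."},
]
print(num_tokens_from_messages(messages))
```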
#### split_text_to_chunks

```python
def split_text_to_chunks(
    text: str,
    max_tokens: int = 4000,
    chunk_mode: str = "multi_lines",
    must_break_at_empty_line: bool = True,
    overlap: int = 10,
)
```

Split a long text into chunks of at most `max_tokens` tokens each.
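A quick chunking sketch using the defaults shown above; the synthetic text just guarantees there are blank lines to break on.

```python
from autogen.retrieve_utils import split_text_to_chunks

long_text = "\n\n".join(
    f"Paragraph {i}: filler content for the example." for i in range(300)
)
chunks = split_text_to_chunks(
    long_text,
    max_tokens=500,
    must_break_at_empty_line=True,  # only break chunks at blank lines
    overlap=10,
)
print(len(chunks))
```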
#### split_files_to_chunks

```python
def split_files_to_chunks(
    files: list,
    max_tokens: int = 4000,
    chunk_mode: str = "multi_lines",
    must_break_at_empty_line: bool = True,
)
```

Split a list of files into chunks of at most `max_tokens` tokens each.
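The file paths below are placeholders; this sketch assumes the function returns the combined list of chunks across all files.

```python
from autogen.retrieve_utils import split_files_to_chunks

chunks = split_files_to_chunks(
    ["./docs/intro.md", "./docs/faq.txt"],  # hypothetical paths
    max_tokens=2000,
)
```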
#### get_files_from_dir

```python
def get_files_from_dir(dir_path: str, types: list = TEXT_FORMATS, recursive: bool = True)
```

Return a list of all the files in a given directory whose extensions match `types`, descending into subdirectories when `recursive` is `True`.
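A sketch that collects files and feeds them to `split_files_to_chunks`; `./docs` is a placeholder directory, and `TEXT_FORMATS` is the module's default extension filter.

```python
from autogen.retrieve_utils import get_files_from_dir, split_files_to_chunks

files = get_files_from_dir("./docs", recursive=True)
chunks = split_files_to_chunks(files)
```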
#### get_file_from_url

```python
def get_file_from_url(url: str, save_path: str = None)
```

Download a file from a URL.
#### is_url

```python
def is_url(string: str)
```

Return `True` if the string is a valid URL.
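The two helpers compose naturally: validate a string, then fetch it. The URL points at the project's own README, and the save path is illustrative.

```python
from autogen.retrieve_utils import get_file_from_url, is_url

source = "https://raw.githubusercontent.com/microsoft/autogen/main/README.md"
if is_url(source):
    get_file_from_url(source, save_path="./downloads/README.md")
```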
#### create_vector_db_from_dir

```python
def create_vector_db_from_dir(
    dir_path: str,
    max_tokens: int = 4000,
    client: API = None,
    db_path: str = "/tmp/chromadb.db",
    collection_name: str = "all-my-documents",
    get_or_create: bool = False,
    chunk_mode: str = "multi_lines",
    must_break_at_empty_line: bool = True,
    embedding_model: str = "all-MiniLM-L6-v2",
)
```

Create a vector database from all the files in a given directory: the files are split into chunks, embedded with `embedding_model`, and stored in the collection `collection_name`.
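A hedged end-to-end sketch: with `client=None`, the function is assumed to open a Chroma client at `db_path` itself (so `chromadb` must be installed), and the corpus directory is a placeholder.

```python
from autogen.retrieve_utils import create_vector_db_from_dir

create_vector_db_from_dir(
    dir_path="./docs",                  # hypothetical corpus directory
    db_path="/tmp/chromadb.db",
    collection_name="all-my-documents",
    get_or_create=True,                 # reuse the collection on re-runs
    embedding_model="all-MiniLM-L6-v2",
)
```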
#### query_vector_db

```python
def query_vector_db(
    query_texts: List[str],
    n_results: int = 10,
    client: API = None,
    db_path: str = "/tmp/chromadb.db",
    collection_name: str = "all-my-documents",
    search_string: str = "",
    embedding_model: str = "all-MiniLM-L6-v2",
) -> Dict[str, List[str]]
```

Query a vector database and return the `n_results` closest chunks for each query text.
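Querying the collection built above; the keys used on the result assume Chroma's query-result layout (`"documents"`, `"ids"`, ...), which is an assumption rather than something the signature guarantees.

```python
from autogen.retrieve_utils import query_vector_db

results = query_vector_db(
    query_texts=["How do I create a vector db from a directory?"],
    n_results=5,
    db_path="/tmp/chromadb.db",
    collection_name="all-my-documents",
)
# Assumed Chroma-style layout: results["documents"][i] holds the
# matches for query_texts[i].
for doc in results["documents"][0]:
    print(doc[:80])
```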