autogen_ext.models.llama_cpp#

class LlamaCppChatCompletionClient(model_info: ModelInfo | None = None, **kwargs: Unpack)[source]#

Bases: ChatCompletionClient

Chat completion client for LlamaCpp models. To use this client, you must install the llama-cpp extra:

pip install "autogen-ext[llama-cpp]"

This client allows you to interact with LlamaCpp models, either by specifying a local model path or by downloading a model from Hugging Face Hub.

Parameters:
  • model_info (optional, ModelInfo) – The information about the model. Defaults to DEFAULT_MODEL_INFO.

  • model_path (optional, str) – The path to the LlamaCpp model file. Required if repo_id and filename are not provided.

  • repo_id (optional, str) – The Hugging Face Hub repository ID. Required if model_path is not provided.

  • filename (optional, str) – The filename of the model within the Hugging Face Hub repository. Required if model_path is not provided.

  • n_gpu_layers (optional, int) – The number of model layers to offload to the GPU (-1 offloads all layers).

  • n_ctx (optional, int) – The context size.

  • n_batch (optional, int) – The batch size.

  • verbose (optional, bool) – Whether to print verbose output.

  • **kwargs – Additional parameters to pass to the Llama class.

Examples

The following code snippet shows how to use the client with a local model file:

import asyncio

from autogen_core.models import UserMessage
from autogen_ext.models.llama_cpp import LlamaCppChatCompletionClient


async def main():
    llama_client = LlamaCppChatCompletionClient(model_path="/path/to/your/model.gguf")
    result = await llama_client.create([UserMessage(content="What is the capital of France?", source="user")])
    print(result)


asyncio.run(main())

The following code snippet shows how to use the client with a model from Hugging Face Hub:

import asyncio

from autogen_core.models import UserMessage
from autogen_ext.models.llama_cpp import LlamaCppChatCompletionClient


async def main():
    llama_client = LlamaCppChatCompletionClient(
        repo_id="unsloth/phi-4-GGUF", filename="phi-4-Q2_K_L.gguf", n_gpu_layers=-1, seed=1337, n_ctx=5000
    )
    result = await llama_client.create([UserMessage(content="What is the capital of France?", source="user")])
    print(result)


asyncio.run(main())
DEFAULT_MODEL_INFO: ModelInfo = {'family': 'unknown', 'function_calling': True, 'json_output': True, 'structured_output': True, 'vision': False}#
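
If the loaded model's capabilities differ from DEFAULT_MODEL_INFO, a custom ModelInfo can be supplied at construction time. A minimal sketch, assuming a model without function-calling support (the field values here are illustrative, not tied to any particular model):

from autogen_core.models import ModelInfo
from autogen_ext.models.llama_cpp import LlamaCppChatCompletionClient

# Illustrative capability declaration for a model that cannot do function calling.
custom_info = ModelInfo(
    family="unknown",
    function_calling=False,
    json_output=True,
    structured_output=True,
    vision=False,
)

llama_client = LlamaCppChatCompletionClient(
    model_path="/path/to/your/model.gguf",
    model_info=custom_info,
)
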
async create(messages: Sequence[Annotated[SystemMessage | UserMessage | AssistantMessage | FunctionExecutionResultMessage, FieldInfo(annotation=NoneType, required=True, discriminator='type')]], *, tools: Sequence[Tool | ToolSchema] = [], json_output: bool | type[BaseModel] | None = None, extra_create_args: Mapping[str, Any] = {}, cancellation_token: CancellationToken | None = None) CreateResult[source]#

Creates a single response from the model.

Parameters:
  • messages (Sequence[LLMMessage]) – The messages to send to the model.

  • tools (Sequence[Tool | ToolSchema], optional) – The tools to use with the model. Defaults to [].

  • json_output (Optional[bool | type[BaseModel]], optional) – Whether to use JSON mode, structured output, or neither. Defaults to None. If set to a Pydantic BaseModel type, it is used as the output schema for structured output. If set to a boolean, it determines whether JSON mode is used; when set to True, make sure to instruct the model to produce JSON output in the instruction or prompt.

  • extra_create_args (Mapping[str, Any], optional) – Extra arguments to pass to the underlying client. Defaults to {}.

  • cancellation_token (Optional[CancellationToken], optional) – A token for cancellation. Defaults to None.

Returns:

CreateResult – The result of the model call.
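
The following is a minimal sketch of create() with structured output via a Pydantic model; CityAnswer and the model path are illustrative assumptions, and the loaded model must be able to follow the schema:

import asyncio

from pydantic import BaseModel

from autogen_core.models import UserMessage
from autogen_ext.models.llama_cpp import LlamaCppChatCompletionClient


class CityAnswer(BaseModel):
    # Hypothetical output schema used only for illustration.
    city: str
    country: str


async def main():
    llama_client = LlamaCppChatCompletionClient(model_path="/path/to/your/model.gguf")
    result = await llama_client.create(
        [UserMessage(content="What is the capital of France?", source="user")],
        json_output=CityAnswer,  # request output conforming to the schema
    )
    print(result.content)  # JSON string matching CityAnswer
    await llama_client.close()


asyncio.run(main())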

async create_stream(messages: Sequence[Annotated[SystemMessage | UserMessage | AssistantMessage | FunctionExecutionResultMessage, FieldInfo(annotation=NoneType, required=True, discriminator='type')]], *, tools: Sequence[Tool | ToolSchema] = [], json_output: bool | type[BaseModel] | None = None, extra_create_args: Mapping[str, Any] = {}, cancellation_token: CancellationToken | None = None) AsyncGenerator[str | CreateResult, None][source]#

Creates a stream of string chunks from the model, ending with a CreateResult.

Parameters:
  • messages (Sequence[LLMMessage]) – The messages to send to the model.

  • tools (Sequence[Tool | ToolSchema], optional) – The tools to use with the model. Defaults to [].

  • json_output (Optional[bool | type[BaseModel]], optional) – Whether to use JSON mode, structured output, or neither. Defaults to None. If set to a Pydantic BaseModel type, it is used as the output schema for structured output. If set to a boolean, it determines whether JSON mode is used; when set to True, make sure to instruct the model to produce JSON output in the instruction or prompt.

  • extra_create_args (Mapping[str, Any], optional) – Extra arguments to pass to the underlying client. Defaults to {}.

  • cancellation_token (Optional[CancellationToken], optional) – A token for cancellation. Defaults to None.

Returns:

AsyncGenerator[Union[str, CreateResult], None] – A generator that yields string chunks and ends with a CreateResult.
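
A minimal sketch of create_stream(), printing text chunks as they arrive and reading usage from the final CreateResult; it reuses the placeholder local-model setup from the examples above:

import asyncio

from autogen_core.models import CreateResult, UserMessage
from autogen_ext.models.llama_cpp import LlamaCppChatCompletionClient


async def main():
    llama_client = LlamaCppChatCompletionClient(model_path="/path/to/your/model.gguf")
    stream = llama_client.create_stream(
        [UserMessage(content="Write a haiku about Paris.", source="user")]
    )
    async for chunk in stream:
        if isinstance(chunk, str):
            print(chunk, end="", flush=True)  # partial text chunk
        elif isinstance(chunk, CreateResult):
            print("\nUsage:", chunk.usage)  # final result with token usage
    await llama_client.close()


asyncio.run(main())
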

actual_usage() RequestUsage[source]#
property capabilities: ModelInfo#
count_tokens(messages: Sequence[SystemMessage | UserMessage | AssistantMessage | FunctionExecutionResultMessage], **kwargs: Any) int[source]#
property model_info: ModelInfo#
remaining_tokens(messages: Sequence[SystemMessage | UserMessage | AssistantMessage | FunctionExecutionResultMessage], **kwargs: Any) int[source]#
total_usage() RequestUsage[source]#
async close() None[source]#

Close the LlamaCpp client.
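
A minimal sketch combining the token-accounting helpers with close(); the model path is a placeholder as in the examples above:

import asyncio

from autogen_core.models import UserMessage
from autogen_ext.models.llama_cpp import LlamaCppChatCompletionClient


async def main():
    llama_client = LlamaCppChatCompletionClient(model_path="/path/to/your/model.gguf")
    messages = [UserMessage(content="What is the capital of France?", source="user")]

    # Token accounting before the call.
    print("Prompt tokens:", llama_client.count_tokens(messages))
    print("Remaining context:", llama_client.remaining_tokens(messages))

    result = await llama_client.create(messages)
    print(result.content)

    # Cumulative usage across all calls made with this client instance.
    print("Total usage:", llama_client.total_usage())

    # Release the underlying llama.cpp resources.
    await llama_client.close()


asyncio.run(main())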