autogen_ext.models.llama_cpp
- class LlamaCppChatCompletionClient(model_info: ModelInfo | None = None, **kwargs: Unpack)
Bases: ChatCompletionClient
Chat completion client for LlamaCpp models. To use this client, you must install the llama-cpp extra:
pip install "autogen-ext[llama-cpp]"
This client allows you to interact with LlamaCpp models, either by specifying a local model path or by downloading a model from Hugging Face Hub.
- Parameters:
model_path (optional, str) – The path to the LlamaCpp model file. Required if repo_id and filename are not provided.
repo_id (optional, str) – The Hugging Face Hub repository ID. Required if model_path is not provided.
filename (optional, str) – The filename of the model within the Hugging Face Hub repository. Required if model_path is not provided.
n_gpu_layers (optional, int) – The number of layers to put on the GPU.
n_ctx (optional, int) – The context size.
n_batch (optional, int) – The batch size.
verbose (optional, bool) – Whether to print verbose output.
model_info (optional, ModelInfo) – The capabilities of the model. Defaults to a ModelInfo instance with function_calling set to True.
**kwargs – Additional parameters to pass to the Llama class.
Examples
The following code snippet shows how to use the client with a local model file:
import asyncio

from autogen_core.models import UserMessage
from autogen_ext.models.llama_cpp import LlamaCppChatCompletionClient


async def main():
    llama_client = LlamaCppChatCompletionClient(model_path="/path/to/your/model.gguf")
    result = await llama_client.create([UserMessage(content="What is the capital of France?", source="user")])
    print(result)


asyncio.run(main())
The following code snippet shows how to use the client with a model from Hugging Face Hub:
import asyncio

from autogen_core.models import UserMessage
from autogen_ext.models.llama_cpp import LlamaCppChatCompletionClient


async def main():
    llama_client = LlamaCppChatCompletionClient(
        repo_id="unsloth/phi-4-GGUF",
        filename="phi-4-Q2_K_L.gguf",
        n_gpu_layers=-1,
        seed=1337,
        n_ctx=5000,
    )
    result = await llama_client.create([UserMessage(content="What is the capital of France?", source="user")])
    print(result)


asyncio.run(main())
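The advertised capabilities can also be overridden by passing a custom model_info. The following is a minimal sketch, not taken from the package's examples; it assumes the vision, function_calling, json_output, and family fields of autogen_core.models.ModelInfo, which may differ across versions:

from autogen_core.models import ModelInfo
from autogen_ext.models.llama_cpp import LlamaCppChatCompletionClient

# Sketch: declare the model's capabilities explicitly instead of relying on the default.
# The exact ModelInfo fields are assumptions; check autogen_core.models.ModelInfo for your version.
custom_info = ModelInfo(
    vision=False,
    function_calling=True,
    json_output=True,
    family="unknown",
)

llama_client = LlamaCppChatCompletionClient(
    model_path="/path/to/your/model.gguf",
    model_info=custom_info,
)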
- async create(messages: Sequence[SystemMessage | UserMessage | AssistantMessage | FunctionExecutionResultMessage], *, tools: Sequence[Tool | ToolSchema] = [], json_output: bool | None = None, extra_create_args: Mapping[str, Any] = {}, cancellation_token: CancellationToken | None = None) → CreateResult
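Tools can be offered to the model via the tools argument. The following is a minimal sketch rather than an official example; the get_weather helper is hypothetical, and it assumes the loaded model supports function calling:

import asyncio

from autogen_core.models import UserMessage
from autogen_core.tools import FunctionTool
from autogen_ext.models.llama_cpp import LlamaCppChatCompletionClient


# Hypothetical helper used only for illustration.
def get_weather(city: str) -> str:
    """Return a canned weather report for the given city."""
    return f"The weather in {city} is sunny."


async def main():
    llama_client = LlamaCppChatCompletionClient(model_path="/path/to/your/model.gguf")
    weather_tool = FunctionTool(get_weather, description="Get the weather for a city.")
    # If the model decides to call the tool, result.content contains the function call(s).
    result = await llama_client.create(
        [UserMessage(content="What is the weather in Paris?", source="user")],
        tools=[weather_tool],
    )
    print(result.content)


asyncio.run(main())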
- async create_stream(messages: Sequence[SystemMessage | UserMessage | AssistantMessage | FunctionExecutionResultMessage], *, tools: Sequence[Tool | ToolSchema] = [], json_output: bool | None = None, extra_create_args: Mapping[str, Any] = {}, cancellation_token: CancellationToken | None = None) → AsyncGenerator[str | CreateResult, None]
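A minimal streaming sketch, assuming the same local model setup as above: per the return type, the generator yields incremental string chunks and finishes with a CreateResult.

import asyncio

from autogen_core.models import UserMessage
from autogen_ext.models.llama_cpp import LlamaCppChatCompletionClient


async def main():
    llama_client = LlamaCppChatCompletionClient(model_path="/path/to/your/model.gguf")
    stream = llama_client.create_stream(
        [UserMessage(content="Write a haiku about Paris.", source="user")]
    )
    async for chunk in stream:
        if isinstance(chunk, str):
            print(chunk, end="", flush=True)  # incremental text chunk
        else:
            print()
            print(chunk.usage)  # final CreateResult with the complete message and usage


asyncio.run(main())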
- actual_usage() → RequestUsage
- count_tokens(messages: Sequence[SystemMessage | UserMessage | AssistantMessage | FunctionExecutionResultMessage], **kwargs: Any) → int
- remaining_tokens(messages: Sequence[SystemMessage | UserMessage | AssistantMessage | FunctionExecutionResultMessage], **kwargs: Any) → int
- total_usage() → RequestUsage
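The usage and token-counting helpers can be combined to budget the context window. A minimal sketch, assuming the same local model setup as above; the printed values depend on the loaded model and its tokenizer:

import asyncio

from autogen_core.models import UserMessage
from autogen_ext.models.llama_cpp import LlamaCppChatCompletionClient


async def main():
    llama_client = LlamaCppChatCompletionClient(model_path="/path/to/your/model.gguf", n_ctx=4096)
    messages = [UserMessage(content="What is the capital of France?", source="user")]

    # Tokens the prompt would consume and how many remain within the context window.
    print("prompt tokens:", llama_client.count_tokens(messages))
    print("remaining tokens:", llama_client.remaining_tokens(messages))

    result = await llama_client.create(messages)
    print(result.content)

    # Usage of the most recent request and cumulative usage across all requests.
    print("last request:", llama_client.actual_usage())
    print("total:", llama_client.total_usage())


asyncio.run(main())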