# Source code for autogen_ext.models._reply_chat_completion_client

from __future__ import annotations

import logging
from typing import Any, AsyncGenerator, List, Mapping, Optional, Sequence, Union

from autogen_core.application.logging import EVENT_LOGGER_NAME
from autogen_core.base import CancellationToken
from autogen_core.components.models import (
    ChatCompletionClient,
    CreateResult,
    LLMMessage,
    ModelCapabilities,
    RequestUsage,
)
from autogen_core.components.tools import Tool, ToolSchema

logger = logging.getLogger(EVENT_LOGGER_NAME)


class ReplayChatCompletionClient:
    """
    A mock chat completion client that replays predefined responses using an index-based approach.

    This class simulates a chat completion client by replaying a predefined list of responses.
    It supports both single completion and streaming responses. The client keeps an index into
    the list of responses, which allows its state to be reset with :meth:`reset`.

    Token accounting is a simple approximation: a "token" is a whitespace-separated word of a
    string ``content``. Non-string content is skipped (a warning is logged) and contributes
    zero tokens.

    .. note::
        The responses can be either strings or CreateResult objects.

        When a string response is streamed, it is split on whitespace and the pieces are
        yielded separated by a single space, so runs of whitespace in the original string are
        not preserved. The response index only advances once a stream has been fully consumed.

    Args:
        chat_completions (Sequence[Union[str, CreateResult]]): A list of predefined responses to replay.

    Raises:
        ValueError("No more mock responses available"): If the list of provided outputs are exhausted.

    Examples:

    Simple chat completion client to return pre-defined responses.

        .. code-block:: python

            chat_completions = [
                "Hello, how can I assist you today?",
                "I'm happy to help with any questions you have.",
                "Is there anything else I can assist you with?",
            ]
            client = ReplayChatCompletionClient(chat_completions)
            messages = [LLMMessage(content="What can you do?")]
            response = await client.create(messages)
            print(response.content)  # Output: "Hello, how can I assist you today?"

    Simple streaming chat completion client to return pre-defined responses

        .. code-block:: python

            chat_completions = [
                "Hello, how can I assist you today?",
                "I'm happy to help with any questions you have.",
                "Is there anything else I can assist you with?",
            ]
            client = ReplayChatCompletionClient(chat_completions)
            messages = [LLMMessage(content="What can you do?")]

            async for token in client.create_stream(messages):
                print(token, end="")  # Output: "Hello, how can I assist you today?"

            async for token in client.create_stream(messages):
                print(token, end="")  # Output: "I'm happy to help with any questions you have."

    Using `.reset` to reset the chat client state

        .. code-block:: python

            chat_completions = [
                "Hello, how can I assist you today?",
            ]
            client = ReplayChatCompletionClient(chat_completions)
            messages = [LLMMessage(content="What can you do?")]
            response = await client.create(messages)
            print(response.content)  # Output: "Hello, how can I assist you today?"

            response = await client.create(messages)  # Raises ValueError("No more mock responses available")

            client.reset()  # Reset the client state (current index of message and token usages)
            response = await client.create(messages)
            print(response.content)  # Output: "Hello, how can I assist you today?" again
    """

    __protocol__: ChatCompletionClient

    # TODO: Support FunctionCall in responses
    # TODO: Support logprobs in Responses
    # TODO: Support model capabilities
    def __init__(
        self,
        chat_completions: Sequence[Union[str, CreateResult]],
    ):
        """Store a private copy of ``chat_completions`` and zero all counters."""
        # Copy so later changes to the caller's sequence do not affect replay.
        self.chat_completions = list(chat_completions)
        self.provided_message_count = len(self.chat_completions)
        self._model_capabilities = ModelCapabilities(vision=False, function_calling=False, json_output=False)
        # Fixed budget that remaining_tokens() subtracts the accumulated usage from.
        self._total_available_tokens = 10000
        self._cur_usage = RequestUsage(prompt_tokens=0, completion_tokens=0)
        self._total_usage = RequestUsage(prompt_tokens=0, completion_tokens=0)
        self._current_index = 0

    async def create(
        self,
        messages: Sequence[LLMMessage],
        # The mutable defaults are kept for signature compatibility; they are never mutated here.
        tools: Sequence[Tool | ToolSchema] = [],
        json_output: Optional[bool] = None,
        extra_create_args: Mapping[str, Any] = {},
        cancellation_token: Optional[CancellationToken] = None,
    ) -> CreateResult:
        """Return the next completion from the list.

        A string response is wrapped in a ``CreateResult`` with ``finish_reason="stop"`` and
        ``cached=True``; a ``CreateResult`` response is returned as provided. Only ``messages``
        is used; the remaining parameters are accepted and ignored.

        Raises:
            ValueError: If every predefined response has already been replayed.
        """
        response = self._peek_response()
        _, prompt_token_count = self._tokenize(messages)
        if isinstance(response, str):
            _, output_token_count = self._tokenize(response)
            self._cur_usage = RequestUsage(prompt_tokens=prompt_token_count, completion_tokens=output_token_count)
            response = CreateResult(finish_reason="stop", content=response, usage=self._cur_usage, cached=True)
        else:
            # Prompt tokens are recounted from ``messages``; completion tokens come from the result.
            self._cur_usage = RequestUsage(
                prompt_tokens=prompt_token_count, completion_tokens=response.usage.completion_tokens
            )

        self._update_total_usage()
        self._current_index += 1
        return response

    async def create_stream(
        self,
        messages: Sequence[LLMMessage],
        # The mutable defaults are kept for signature compatibility; they are never mutated here.
        tools: Sequence[Tool | ToolSchema] = [],
        json_output: Optional[bool] = None,
        extra_create_args: Mapping[str, Any] = {},
        cancellation_token: Optional[CancellationToken] = None,
    ) -> AsyncGenerator[Union[str, CreateResult], None]:
        """Return the next completion as a stream.

        A string response is yielded word by word (every word except the last is followed by a
        single space); a ``CreateResult`` response is yielded once, unchanged. Usage totals and
        the response index are updated only after the last item has been consumed. Only
        ``messages`` is used; the remaining parameters are accepted and ignored.

        Raises:
            ValueError: On first iteration, if every predefined response has already been replayed.
        """
        response = self._peek_response()
        _, prompt_token_count = self._tokenize(messages)
        if isinstance(response, str):
            output_tokens, output_token_count = self._tokenize(response)
            self._cur_usage = RequestUsage(prompt_tokens=prompt_token_count, completion_tokens=output_token_count)

            last_index = len(output_tokens) - 1
            for i, token in enumerate(output_tokens):
                yield token if i == last_index else token + " "
        else:
            self._cur_usage = RequestUsage(
                prompt_tokens=prompt_token_count, completion_tokens=response.usage.completion_tokens
            )
            yield response

        self._update_total_usage()
        self._current_index += 1

    def actual_usage(self) -> RequestUsage:
        """Return the usage recorded for the most recent request."""
        return self._cur_usage

    def total_usage(self) -> RequestUsage:
        """Return the usage accumulated since construction or the last :meth:`reset`."""
        return self._total_usage

    def count_tokens(self, messages: Sequence[LLMMessage], tools: Sequence[Tool | ToolSchema] = []) -> int:
        """Return the number of whitespace-separated words in the string contents of ``messages``."""
        _, token_count = self._tokenize(messages)
        return token_count

    def remaining_tokens(self, messages: Sequence[LLMMessage], tools: Sequence[Tool | ToolSchema] = []) -> int:
        """Return the token budget left after the accumulated usage, never less than zero.

        ``messages`` and ``tools`` are not taken into account.
        """
        return max(
            0, self._total_available_tokens - self._total_usage.prompt_tokens - self._total_usage.completion_tokens
        )

    def _peek_response(self) -> Union[str, CreateResult]:
        """Return the response at the current index without advancing it.

        Raises:
            ValueError: If every predefined response has already been replayed.
        """
        if self._current_index >= len(self.chat_completions):
            raise ValueError("No more mock responses available")
        return self.chat_completions[self._current_index]

    def _tokenize(self, messages: Union[str, LLMMessage, Sequence[LLMMessage]]) -> tuple[list[str], int]:
        """Split text into whitespace-separated tokens.

        Args:
            messages: A plain string, a single object with a ``content`` attribute, or a
                sequence of such objects.

        Returns:
            A ``(tokens, count)`` tuple. Items whose ``content`` is not a string are skipped
            with a logged warning; any other kind of input yields ``([], 0)``.
        """
        if isinstance(messages, str):
            tokens = messages.split()
            return tokens, len(tokens)

        # Normalise "one message" and "many messages" to a single iterable.
        if hasattr(messages, "content"):
            items: Sequence[Any] = [messages]
        elif isinstance(messages, Sequence):
            items = messages
        else:
            items = []

        all_tokens: List[str] = []
        for message in items:
            content = message.content
            if isinstance(content, str):
                all_tokens.extend(content.split())
            else:
                # Do not pass extra positional arguments here: logging treats them as
                # %-format arguments, and this message has no placeholders.
                logger.warning("Token count has been done only on string content")
        return all_tokens, len(all_tokens)

    def _update_total_usage(self) -> None:
        """Add the most recent request's usage to the running totals."""
        self._total_usage.completion_tokens += self._cur_usage.completion_tokens
        self._total_usage.prompt_tokens += self._cur_usage.prompt_tokens

    @property
    def capabilities(self) -> ModelCapabilities:
        """Return mock capabilities."""
        return self._model_capabilities

    def reset(self) -> None:
        """Reset the client state and usage to its initial state."""
        self._cur_usage = RequestUsage(prompt_tokens=0, completion_tokens=0)
        self._total_usage = RequestUsage(prompt_tokens=0, completion_tokens=0)
        self._current_index = 0