# Source code for autogen_ext.agents.video_surfer._video_surfer

from typing import Any, Awaitable, Callable, List, Optional

from autogen_agentchat.agents import AssistantAgent
from autogen_core.components.models import ChatCompletionClient
from autogen_core.components.tools import Tool

from .tools import (
    extract_audio,
    get_screenshot_at,
    get_video_length,
    save_screenshot,
    transcribe_audio_with_timestamps,
    transcribe_video_screenshot,
)



class VideoSurfer(AssistantAgent):
    """
    VideoSurfer is a specialized agent designed to answer questions about a local video file.

    This agent utilizes various tools to extract information from the video, such as its length, screenshots at specific timestamps, and audio transcriptions. It processes these elements to provide detailed answers to user queries.

    Available tools:

    - :func:`~autogen_ext.agents.video_surfer.tools.extract_audio`
    - :func:`~autogen_ext.agents.video_surfer.tools.get_video_length`
    - :func:`~autogen_ext.agents.video_surfer.tools.transcribe_audio_with_timestamps`
    - :func:`~autogen_ext.agents.video_surfer.tools.get_screenshot_at`
    - :func:`~autogen_ext.agents.video_surfer.tools.save_screenshot`
    - :func:`~autogen_ext.agents.video_surfer.tools.transcribe_video_screenshot`

    Example usage:

        The following example demonstrates how to create a video surfing agent with
        a model client and generate a response to a simple query about a local video
        called video.mp4.

        .. code-block:: python


            import asyncio
            from autogen_agentchat.ui import Console
            from autogen_agentchat.conditions import TextMentionTermination
            from autogen_agentchat.teams import RoundRobinGroupChat
            from autogen_ext.models import OpenAIChatCompletionClient
            from autogen_ext.agents.video_surfer import VideoSurfer

            async def main() -> None:
                \"\"\"
                Main function to run the video agent.
                \"\"\"
                # Define an agent
                video_agent = VideoSurfer(
                    name="VideoSurfer",
                    model_client=OpenAIChatCompletionClient(model="gpt-4o-2024-08-06")
                    )

                # Define termination condition
                termination = TextMentionTermination("TERMINATE")

                # Define a team
                agent_team = RoundRobinGroupChat([video_agent], termination_condition=termination)

                # Run the team and stream messages to the console
                stream = agent_team.run_stream(task="How does Adam define complex tasks in video.mp4? What concrete example of a complex task does he use? Can you save this example to disk as well?")
                await Console(stream)

            asyncio.run(main())

        The following example demonstrates how to create and use a VideoSurfer and UserProxyAgent with MagenticOneGroupChat.

        .. code-block:: python

            import asyncio

            from autogen_agentchat.ui import Console
            from autogen_agentchat.teams import MagenticOneGroupChat
            from autogen_agentchat.agents import UserProxyAgent
            from autogen_ext.models import OpenAIChatCompletionClient
            from autogen_ext.agents.video_surfer import VideoSurfer

            async def main() -> None:
                \"\"\"
                Main function to run the video agent.
                \"\"\"

                model_client = OpenAIChatCompletionClient(model="gpt-4o-2024-08-06")

                # Define an agent
                video_agent = VideoSurfer(
                    name="VideoSurfer",
                    model_client=model_client
                    )

                user_proxy_agent = UserProxyAgent(
                    name="User"
                )

                # Define a team
                agent_team = MagenticOneGroupChat([user_proxy_agent, video_agent], model_client=model_client,)

                # Run the team and stream messages to the console
                stream = agent_team.run_stream(task="Find the latest video about magentic one on youtube and extract quotes from it that make sense.")
                await Console(stream)

            asyncio.run(main())
    """

    # Description used when the caller does not supply one.
    DEFAULT_DESCRIPTION = "An agent that can answer questions about a local video."

    # System message used when the caller does not supply one. The literal is
    # kept verbatim (including its leading indentation) because it is passed
    # unchanged to AssistantAgent as the system message.
    DEFAULT_SYSTEM_MESSAGE = """
    You are a helpful agent that is an expert at answering questions from a video.
    When asked to answer a question about a video, you should:
    1. Check if that video is available locally.
    2. Use the transcription to find which part of the video the question is referring to.
    3. Optionally use screenshots from those timestamps
    4. Provide a detailed answer to the question.
    Reply with TERMINATE when the task has been completed.
    """

    def __init__(
        self,
        name: str,
        model_client: ChatCompletionClient,
        *,
        tools: List[Tool | Callable[..., Any] | Callable[..., Awaitable[Any]]] | None = None,
        description: Optional[str] = None,
        system_message: Optional[str] = None,
    ) -> None:
        """
        Initialize the VideoSurfer.

        Args:
            name (str): The name of the agent.
            model_client (ChatCompletionClient): The model client used for generating responses.
            tools (List[Tool | Callable[..., Any] | Callable[..., Awaitable[Any]]] | None, optional):
                A list of tools or functions the agent can use. If ``None`` or empty, defaults to the
                built-in video tools (video length, screenshots, screenshot transcription, audio
                extraction and audio transcription).
            description (str | None, optional): A brief description of the agent. If ``None`` or empty,
                defaults to :attr:`DEFAULT_DESCRIPTION`.
            system_message (str | None, optional): The system message guiding the agent's behavior.
                If ``None`` or empty, defaults to :attr:`DEFAULT_SYSTEM_MESSAGE`.
        """
        # NOTE: the fallbacks below use truthiness (`or`), so an explicitly
        # passed empty list / empty string selects the default as well.
        super().__init__(
            name=name,
            model_client=model_client,
            tools=tools
            or [
                get_video_length,
                get_screenshot_at,
                save_screenshot,
                # Bound-method wrapper so the screenshot transcription tool
                # receives this agent's model client without exposing it as a
                # tool argument.
                self.vs_transribe_video_screenshot,
                extract_audio,
                transcribe_audio_with_timestamps,
            ],
            description=description or self.DEFAULT_DESCRIPTION,
            system_message=system_message or self.DEFAULT_SYSTEM_MESSAGE,
        )

    # NOTE: the method name is misspelled ("transribe"); it is kept as-is
    # because it is part of the public interface. The docstring is left
    # untouched as well -- presumably it is surfaced as the tool description
    # when the method is registered as a tool (TODO confirm).
    async def vs_transribe_video_screenshot(self, video_path: str, timestamp: float) -> str:
        """
        Transcribes the video screenshot at a specific timestamp.

        Args:
            video_path (str): Path to the video file.
            timestamp (float): Timestamp to take the screenshot.

        Returns:
            str: Transcription of the video screenshot.
        """
        # `_model_client` is not assigned in this class; presumably it is set
        # by AssistantAgent.__init__ from the `model_client` argument.
        return await transcribe_video_screenshot(video_path, timestamp, self._model_client)