8. Basic Prompt Engineering

SAMMO has a variety of tools that make trying out different sets of prompts easy. Let’s start by loading the same task as before.

# %load -r 3:25 _init.py
import pathlib
import sammo
from sammo.runners import OpenAIChat
from sammo.base import Template, EvaluationScore
from sammo.components import Output, GenerateText, ForEach, Union
from sammo.extractors import ExtractRegex
from sammo.data import DataTable
import json
import requests
import os

if 'OPENAI_API_KEY' not in os.environ:
    raise ValueError("Please set the environment variable 'OPENAI_API_KEY'.")

_ = sammo.setup_logger("WARNING")  # we're only interested in warnings for now

runner = OpenAIChat(
    model_id="gpt-3.5-turbo",
    api_config={"api_key": os.environ['OPENAI_API_KEY']},
    cache=os.getenv("CACHE_FILE", "cache.tsv"),
    timeout=30,
)
import json
import requests


def load_data(
    url="https://github.com/google/BIG-bench/raw/main/bigbench/benchmark_tasks/implicatures/task.json",
):
    task = json.loads(requests.get(url).content)
    # convert label to single string
    for x in task["examples"]:
        x["output"] = max(x["target_scores"], key=x["target_scores"].get)

    return DataTable.from_records(
        task["examples"],
        input_fields="input",
        constants={"instructions": task["task_prefix"]},
    )


mydata = load_data()

Let’s say we want to try out different instructions. For that, let’s define an objective.

# %load -s accuracy _init.py
def accuracy(y_true: DataTable, y_pred: DataTable) -> EvaluationScore:
    y_true = y_true.outputs.values
    y_pred = y_pred.outputs.normalized_values()
    n_correct = sum([y_p == y_t for y_p, y_t in zip(y_pred, y_true)])

    return EvaluationScore(n_correct / len(y_true))

Nothing special here: we count how many predicted labels match the gold labels, divide by the total, and wrap the result in an EvaluationScore object.
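
As a quick sanity check, here is the same computation on a pair of toy label lists (plain Python rather than DataTables, just to make the arithmetic explicit; the lists are made up for illustration):

y_true = ["yes", "no", "yes", "no"]
y_pred = ["yes", "no", "no", "no"]

# three of the four predictions match, so the score is 0.75
sum(p == t for p, t in zip(y_pred, y_true)) / len(y_true)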

To try out different prompts, we need to describe the space of possible candidates. SAMMO does that by offering a number of operators, such as one_of.

from sammo.search import EnumerativeSearch
from sammo.search_op import one_of
from sammo.base import Template
from sammo.components import Output, GenerateText


def labeling_prompt_space():
    instructions = one_of(
        [
            "Does the reply mean yes or no?",
            "Does Speaker 2's answer mean yes or no? "
        ],
        reference_id="instr",
    )
    prompt = GenerateText(
        Template(
            "Instructions:{{{instructions}}}\nOutput labels: yes, no\nInput: {{{input}}}\nOutput:",
            instructions=instructions,
        )
    )
    return Output(prompt)

With the search space defined, we can now kick off the search:

sample = mydata.sample(25, seed=42)
searcher = EnumerativeSearch(runner, labeling_prompt_space, accuracy)
y_pred = searcher.fit_transform(sample)
searcher.show_report()
candidate[###################################]2/2[00:00<00:00] >> minibatches (total)[#######################]50/50[00:00<00:00]

Fitting log (2 entries):
iteration    action                                            objective    costs                          parse_errors
-----------  ------------------------------------------------  -----------  -----------------------------  --------------
0            {'instr': "'Does the reply mean yes or no?'"}     0.68         {'input': 1348, 'output': 25}  0.0
1            {'instr': '"Does Speaker 2\'s answer mean yes or  0.8          {'input': 1448, 'output': 25}  0.0
             no? "'}

Okay, we are doing a bit better! Let's see whether changing the temperature (exposed as the randomness parameter of GenerateText) affects the result.

def labeling_prompt_space():
    instructions = one_of(
        [
            "Does Speakers 2 answer mean yes or no to Speaker 1?",
            "Does Speaker 2's answer mean yes or no? "
        ],
        reference_id="instr",
    )
    prompt = GenerateText(
        Template(
            "Instructions:{{{instructions}}}\nOutput labels: yes, no\nInput: {{{input}}}\nOutput:",
            instructions=instructions,
        ),
        randomness=one_of([0.7, 1.0], reference_id="randomness"),
    )
    return Output(prompt)


searcher = EnumerativeSearch(runner, labeling_prompt_space, accuracy)
searcher.fit(sample)
searcher.show_report()
candidate[###################################]4/4[00:00<00:00] >> minibatches (total)[#####################]100/100[00:00<00:00]

Fitting log (4 entries):
iteration    action                                             objective    costs                          parse_errors
-----------  -------------------------------------------------  -----------  -----------------------------  --------------
0            {'instr': "'Does Speakers 2 answer mean yes or no  0.64         {'input': 1498, 'output': 25}  0.0
             to Speaker 1?'", 'randomness': 0.7}
1            {'instr': "'Does Speakers 2 answer mean yes or no  0.76         {'input': 1498, 'output': 25}  0.0
             to Speaker 1?'", 'randomness': 1.0}
2            {'instr': '"Does Speaker 2\'s answer mean yes or   0.8          {'input': 1448, 'output': 25}  0.0
             no? "', 'randomness': 0.7}
3            {'instr': '"Does Speaker 2\'s answer mean yes or   0.64         {'input': 1448, 'output': 25}  0.0
             no? "', 'randomness': 1.0}

Not bad! With SAMMO, we can quickly try out several alternatives if we want to manually tinker with different prompts.
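
Because one_of calls compose, extending the manual search space is just a matter of stacking more of them. The sketch below adds a third axis for how the output labels are announced; the extra wording variants are purely illustrative (not part of the tutorial's task), and it assumes Template accepts additional keyword placeholders the same way it accepts instructions above. EnumerativeSearch would then enumerate 2 x 2 x 2 = 8 candidates.

def larger_prompt_space():
    instructions = one_of(
        [
            "Does Speaker 2's answer mean yes or no?",
            "Answer yes or no: does Speaker 2 agree with Speaker 1?",
        ],
        reference_id="instr",
    )
    # Illustrative second text axis: how the output labels are announced.
    labels = one_of(
        ["Output labels: yes, no", "Answer with exactly one word: yes or no"],
        reference_id="labels",
    )
    prompt = GenerateText(
        Template(
            "Instructions:{{{instructions}}}\n{{{labels}}}\nInput: {{{input}}}\nOutput:",
            instructions=instructions,
            labels=labels,
        ),
        randomness=one_of([0.0, 0.7], reference_id="randomness"),
    )
    return Output(prompt)

Running EnumerativeSearch(runner, larger_prompt_space, accuracy).fit(sample) would then evaluate all eight combinations in a single pass, exactly like the four above.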

However, SAMMO offers a much more powerful way of automatically optimizing prompts, which we cover in the section after next.