8. Basic Prompt Engineering
SAMMO has a variety of tools that make trying out different sets of prompts easy. Let's start by loading the same task as before.
# %load -r 3:25 _init.py
import pathlib
import sammo
from sammo.runners import OpenAIChat
from sammo.base import Template, EvaluationScore
from sammo.components import Output, GenerateText, ForEach, Union
from sammo.extractors import ExtractRegex
from sammo.data import DataTable
import json
import requests
import os

if not 'OPENAI_API_KEY' in os.environ:
    raise ValueError("Please set the environment variable 'OPENAI_API_KEY'.")

_ = sammo.setup_logger("WARNING")  # we're only interested in warnings for now

runner = OpenAIChat(
    model_id="gpt-3.5-turbo",
    api_config={"api_key": os.environ['OPENAI_API_KEY']},
    cache=os.getenv("CACHE_FILE", "cache.tsv"),
    timeout=30,
)
import json
import requests


def load_data(
    url="https://github.com/google/BIG-bench/raw/main/bigbench/benchmark_tasks/implicatures/task.json",
):
    task = json.loads(requests.get(url).content)
    # convert label to single string
    for x in task["examples"]:
        x["output"] = max(x["target_scores"], key=x["target_scores"].get)
    return DataTable.from_records(
        task["examples"],
        input_fields="input",
        constants={"instructions": task["task_prefix"]},
    )


mydata = load_data()
Let’s say we want to try out different instructions. For that, let’s define an objective.
# %load -s accuracy _init.py
def accuracy(y_true: DataTable, y_pred: DataTable) -> EvaluationScore:
    y_true = y_true.outputs.values
    y_pred = y_pred.outputs.normalized_values()
    n_correct = sum([y_p == y_t for y_p, y_t in zip(y_pred, y_true)])
    return EvaluationScore(n_correct / len(y_true))
Nothing special here - we simply count the number of correct labels and return an EvaluationScore object.
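To see the arithmetic in isolation, here is a quick sanity check of the same logic on plain Python lists (the labels below are made up for illustration and are not part of the task data):

y_true_labels = ["yes", "no", "yes", "no"]  # hypothetical gold labels
y_pred_labels = ["yes", "no", "no", "no"]   # hypothetical model outputs
n_correct = sum(p == t for p, t in zip(y_pred_labels, y_true_labels))
print(n_correct / len(y_true_labels))  # 3 out of 4 correct -> 0.75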
To try out different prompts, we need to describe the space of possible candidates. SAMMO does that by offering a number of operators, such as one_of.
from sammo.search import EnumerativeSearch
from sammo.search_op import one_of
from sammo.base import Template
from sammo.components import Output, GenerateText


def labeling_prompt_space():
    instructions = one_of(
        [
            "Does the reply mean yes or no?",
            "Does Speaker 2's answer mean yes or no? ",
        ],
        reference_id="instr",
    )
    prompt = GenerateText(
        Template(
            "Instructions:{{{instructions}}}\nOutput labels: yes, no\nInput: {{{input}}}\nOutput:",
            instructions=instructions,
        )
    )
    return Output(prompt)
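For intuition, once the template's placeholders are filled in, each candidate sends the model a prompt shaped roughly like this (the dialogue below is invented for illustration; the real inputs come from the implicatures dataset):

Instructions:Does the reply mean yes or no?
Output labels: yes, no
Input: Speaker 1: 'Are you coming tonight?' Speaker 2: 'I have to work late.'
Output: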
With the search space defined, we can now kick off the search:
sample = mydata.sample(25, seed=42)  # evaluate on a small subset to keep costs down
searcher = EnumerativeSearch(runner, labeling_prompt_space, accuracy)
y_pred = searcher.fit_transform(sample)
searcher.show_report()
candidate[###################################]2/2[00:00<00:00] >> minibatches (total)[#######################]50/50[00:00<00:00]
Fitting log (2 entries):
iteration  action                                                     objective  costs                          parse_errors
---------  ---------------------------------------------------------  ---------  -----------------------------  ------------
        0  {'instr': "'Does the reply mean yes or no?'"}                   0.68   {'input': 1348, 'output': 25}           0.0
        1  {'instr': '"Does Speaker 2\'s answer mean yes or no? "'}        0.8    {'input': 1448, 'output': 25}           0.0
Okay, we are doing a bit better! Let’s see if changing the temperature would impact the result.
def labeling_prompt_space():
    instructions = one_of(
        [
            "Does Speakers 2 answer mean yes or no to Speaker 1?",
            "Does Speaker 2's answer mean yes or no? ",
        ],
        reference_id="instr",
    )
    prompt = GenerateText(
        Template(
            "Instructions:{{{instructions}}}\nOutput labels: yes, no\nInput: {{{input}}}\nOutput:",
            instructions=instructions,
        ),
        randomness=one_of([0.7, 1.0], reference_id="randomness"),
    )
    return Output(prompt)
searcher = EnumerativeSearch(runner, labeling_prompt_space, accuracy)
searcher.fit(sample)
searcher.show_report()
candidate[###################################]4/4[00:00<00:00] >> minibatches (total)[#####################]100/100[00:00<00:00]
Fitting log (4 entries):
iteration  action                                                                                  objective  costs                          parse_errors
---------  --------------------------------------------------------------------------------------  ---------  -----------------------------  ------------
        0  {'instr': "'Does Speakers 2 answer mean yes or no to Speaker 1?'", 'randomness': 0.7}         0.64  {'input': 1498, 'output': 25}           0.0
        1  {'instr': "'Does Speakers 2 answer mean yes or no to Speaker 1?'", 'randomness': 1.0}         0.76  {'input': 1498, 'output': 25}           0.0
        2  {'instr': '"Does Speaker 2\'s answer mean yes or no? "', 'randomness': 0.7}                   0.8   {'input': 1448, 'output': 25}           0.0
        3  {'instr': '"Does Speaker 2\'s answer mean yes or no? "', 'randomness': 1.0}                   0.64  {'input': 1448, 'output': 25}           0.0
Not bad! With SAMMO, we can quickly try out several alternatives if we want to manually tinker with different prompts. However, SAMMO offers a much more powerful way of automatically optimizing prompts, which we cover in the section after the next one.