Tune - HuggingFace
This example uses FLAML to fine-tune a transformer model from the Hugging Face transformers library.
Note: `flaml.AutoML` has built-in support for certain fine-tuning tasks through a higher-level API, which may be easier to use unless you have special requirements that it does not handle.
Requirements
This example requires a GPU. Install the dependencies:

```bash
pip install torch transformers datasets "flaml[blendsearch,ray]"
```
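Before starting, it can help to confirm that PyTorch actually sees a GPU. The short check below is an optional sanity check, not part of the original example:

```python
import torch

# Optional sanity check: confirm that a CUDA device is visible to PyTorch.
print("CUDA available:", torch.cuda.is_available())
print("GPU count:", torch.cuda.device_count())
```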
Prepare for tuning
Tokenizer
```python
from transformers import AutoTokenizer

MODEL_NAME = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
COLUMN_NAME = "sentence"


def tokenize(examples):
    return tokenizer(examples[COLUMN_NAME], truncation=True)
```
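To see what the tokenizer produces, you can call `tokenize` on a hand-made batch in the same format the dataset will provide. The sample sentence below is purely illustrative:

```python
# Illustrative only: a tiny fake batch shaped like one batch of the dataset.
sample = {"sentence": ["The quick brown fox jumps over the lazy dog."]}
encoded = tokenize(sample)
print(encoded.keys())  # typically 'input_ids' and 'attention_mask' for DistilBERT
```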
Define the training method
```python
import flaml
import datasets
import numpy as np
from transformers import (
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)

TASK = "cola"
NUM_LABELS = 2


def train_distilbert(config: dict):
    # Load the CoLA dataset and apply the tokenizer
    cola_raw = datasets.load_dataset("glue", TASK)
    cola_encoded = cola_raw.map(tokenize, batched=True)
    train_dataset, eval_dataset = cola_encoded["train"], cola_encoded["validation"]

    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME, num_labels=NUM_LABELS
    )

    metric = datasets.load_metric("glue", TASK)

    def compute_metrics(eval_pred):
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)
        return metric.compute(predictions=predictions, references=labels)

    training_args = TrainingArguments(
        output_dir=".",
        do_eval=False,
        disable_tqdm=True,
        logging_steps=20000,
        save_total_limit=0,
        **config,
    )

    trainer = Trainer(
        model,
        training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    # train the model
    trainer.train()

    # evaluate the model
    eval_output = trainer.evaluate()

    # report the metric to optimize and the metric to log
    flaml.tune.report(
        loss=eval_output["eval_loss"],
        matthews_correlation=eval_output["eval_matthews_correlation"],
    )
```
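The tuner passes a plain dict of hyperparameters to `train_distilbert`, and `**config` unpacks it directly into `TrainingArguments`. The config below only illustrates that shape; its values are made up, not tuning results:

```python
# Illustrative config (made-up values) with the keys used by the search space
# defined below. Each key becomes a TrainingArguments argument via **config,
# e.g. "learning_rate" here becomes TrainingArguments(learning_rate=3e-5).
example_config = {
    "num_train_epochs": 1,
    "learning_rate": 3e-5,
    "adam_epsilon": 1e-8,
    "adam_beta1": 0.9,
    "adam_beta2": 0.999,
}
# During tuning, flaml samples configs of this shape and calls
# train_distilbert(sampled_config) once per trial.
```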
Define the search
We are now ready to define our search. This includes:

- The `search_space` for our hyperparameters
- The `metric` and the `mode` ('max' or 'min') for optimization
- The constraints (`num_cpus`, `num_gpus`, `num_samples`, and `time_budget_s`)
```python
max_num_epoch = 64
search_space = {
    # You can mix constants with search space objects.
    "num_train_epochs": flaml.tune.loguniform(1, max_num_epoch),
    "learning_rate": flaml.tune.loguniform(1e-6, 1e-4),
    "adam_epsilon": flaml.tune.loguniform(1e-9, 1e-7),
    "adam_beta1": flaml.tune.uniform(0.8, 0.99),
    "adam_beta2": flaml.tune.loguniform(98e-2, 9999e-4),
}

# optimization objective
HP_METRIC, MODE = "matthews_correlation", "max"

# resources
num_cpus = 4
num_gpus = 4  # change according to your GPU resources

# constraints
num_samples = -1  # number of trials, -1 means unlimited
time_budget_s = 3600  # time budget in seconds
```
Launch the tuning
We are now ready to launch the tuning using `flaml.tune.run`:
```python
import time

import ray

ray.init(num_cpus=num_cpus, num_gpus=num_gpus)

print("Tuning started...")
start_time = time.time()  # record the start time to report the total tuning time later
analysis = flaml.tune.run(
    train_distilbert,
    search_alg=flaml.CFO(
        space=search_space,
        metric=HP_METRIC,
        mode=MODE,
        low_cost_partial_config={"num_train_epochs": 1},
    ),
    resources_per_trial={"gpu": num_gpus, "cpu": num_cpus},
    local_dir="logs/",
    num_samples=num_samples,
    time_budget_s=time_budget_s,
    use_ray=True,
)
```
This will run the tuning for one hour. At the end, we will see a summary like the following:
```
== Status ==
Memory usage on this node: 32.0/251.6 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/4 CPUs, 0/4 GPUs, 0.0/150.39 GiB heap, 0.0/47.22 GiB objects (0/1.0 accelerator_type:V100)
Result logdir: /home/chiw/FLAML/notebook/logs/train_distilbert_2021-05-07_02-35-58
Number of trials: 22/infinite (22 TERMINATED)

Trial name                 status      loc  adam_beta1  adam_beta2  adam_epsilon  learning_rate  num_train_epochs  iter  total time (s)  loss      matthews_correlation
train_distilbert_a0c303d0  TERMINATED       0.939079    0.991865    7.96945e-08   5.61152e-06    1                 1     55.6909         0.587986  0
train_distilbert_a0c303d1  TERMINATED       0.811036    0.997214    2.05111e-09   2.05134e-06    1.44427           1     71.7663         0.603018  0
train_distilbert_c39b2ef0  TERMINATED       0.909395    0.993715    1e-07         5.26543e-06    1                 1     53.7619         0.586518  0
train_distilbert_f00776e2  TERMINATED       0.968763    0.990019    4.38943e-08   5.98035e-06    1.02723           1     56.8382         0.581313  0
train_distilbert_11ab3900  TERMINATED       0.962198    0.991838    7.09296e-08   5.06608e-06    1                 1     54.0231         0.585576  0
train_distilbert_353025b6  TERMINATED       0.91596     0.991892    8.95426e-08   6.21568e-06    2.15443           1     98.3233         0.531632  0.388893
train_distilbert_5728a1de  TERMINATED       0.926933    0.993146    1e-07         1.00902e-05    1                 1     55.3726         0.538505  0.280558
train_distilbert_9394c2e2  TERMINATED       0.928106    0.990614    4.49975e-08   3.45674e-06    2.72935           1     121.388         0.539177  0.327295
train_distilbert_b6543fec  TERMINATED       0.876896    0.992098    1e-07         7.01176e-06    1.59538           1     76.0244         0.527516  0.379177
train_distilbert_0071f998  TERMINATED       0.955024    0.991687    7.39776e-08   5.50998e-06    2.90939           1     126.871         0.516225  0.417157
train_distilbert_2f830be6  TERMINATED       0.886931    0.989628    7.6127e-08    4.37646e-06    1.53338           1     73.8934         0.551629  0.0655887
train_distilbert_7ce03f12  TERMINATED       0.984053    0.993956    8.70144e-08   7.82557e-06    4.08775           1     174.027         0.523732  0.453549
train_distilbert_aaab0508  TERMINATED       0.940707    0.993946    1e-07         8.91979e-06    3.40243           1     146.249         0.511288  0.45085
train_distilbert_14262454  TERMINATED       0.99        0.991696    4.60093e-08   4.83405e-06    3.4954            1     152.008         0.53506   0.400851
train_distilbert_6d211fe6  TERMINATED       0.959277    0.994556    5.40791e-08   1.17333e-05    6.64995           1     271.444         0.609851  0.526802
train_distilbert_c980bae4  TERMINATED       0.99        0.993355    1e-07         5.21929e-06    2.51275           1     111.799         0.542276  0.324968
train_distilbert_6d0d29d6  TERMINATED       0.965773    0.995182    9.9752e-08    1.15549e-05    13.694            1     527.944         0.923802  0.549474
train_distilbert_b16ea82a  TERMINATED       0.952781    0.993931    2.93182e-08   1.19145e-05    3.2293            1     139.844         0.533466  0.451307
train_distilbert_eddf7cc0  TERMINATED       0.99        0.997109    8.13498e-08   1.28515e-05    15.5807           1     614.789         0.983285  0.56993
train_distilbert_43008974  TERMINATED       0.929089    0.993258    1e-07         1.03892e-05    12.0357           1     474.387         0.857461  0.520022
train_distilbert_b3408a4e  TERMINATED       0.99        0.993809    4.67441e-08   1.10418e-05    11.9165           1     474.126         0.828205  0.526164
train_distilbert_cfbfb220  TERMINATED       0.979454    0.9999      1e-07         1.49578e-05    20.3715
```
Retrieve the results
```python
best_trial = analysis.get_best_trial(HP_METRIC, MODE, "all")
metric = best_trial.metric_analysis[HP_METRIC][MODE]
print(f"n_trials={len(analysis.trials)}")
print(f"time={time.time() - start_time}")
print(f"Best model eval {HP_METRIC}: {metric:.4f}")
print(f"Best model parameters: {best_trial.config}")
# n_trials=22
# time=3999.769361972809
# Best model eval matthews_correlation: 0.5699
# Best model parameters: {'num_train_epochs': 15.580684188655825, 'learning_rate': 1.2851507818900338e-05, 'adam_epsilon': 8.134982521948352e-08, 'adam_beta1': 0.99, 'adam_beta2': 0.9971094424784387}
```
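As an optional follow-up that is not part of the original example, you can persist the best configuration for a later final training run and shut down Ray once tuning is finished:

```python
import json

# Save the best hyperparameters so a final training run can reuse them.
with open("best_config.json", "w") as f:
    json.dump(best_trial.config, f, indent=2)

# Release the resources claimed by ray.init() earlier.
ray.shutdown()
```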