Natural Language Processing#

DeepSpeed#

Trainer#

class archai.trainers.nlp.ds_trainer.StatefulDistributedSampler(dataset: Dataset, num_replicas: int | None = None, rank: int | None = None, shuffle: bool | None = True, seed: int | None = 0, drop_last: bool | None = False, total_consumed_samples: int | None = 0)[source]#

Distributed sampler that supports resuming from a given step.
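
A minimal usage sketch, assuming the sampler behaves as a drop-in replacement for torch.utils.data.DistributedSampler; the toy dataset, batch size, and the 256 consumed samples are illustrative.

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

from archai.trainers.nlp.ds_trainer import StatefulDistributedSampler

# Toy dataset of 1,024 samples (illustrative).
dataset = TensorDataset(torch.arange(1024))

# Suppose a previous run was interrupted after consuming 256 samples.
sampler = StatefulDistributedSampler(
    dataset,
    num_replicas=1,  # single process, for illustration
    rank=0,
    shuffle=True,
    seed=0,
    total_consumed_samples=256,
)

# The resumed epoch skips the samples that were already consumed.
dataloader = DataLoader(dataset, batch_size=8, sampler=sampler)
```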

class archai.trainers.nlp.ds_trainer.DsTrainer(model: Module, args: DsTrainingArguments | None = None, optimizer: Optimizer | None = None, model_parameters: Iterable[Tensor] | Dict[str, Tensor] | None = None, lr_scheduler: _LRScheduler | None = None, mpu: Any | None = None, dist_init_required: bool | None = None, train_dataset: Dataset | None = None, eval_dataset: Dataset | None = None)[source]#

DeepSpeed trainer.

property data_parallel_world_size: int#

Return the data parallel world size.

property data_parallel_rank: int#

Return the data parallel rank of the current process.

train_batch_without_pipe_parallel(data_iter: Iterator | None = None) Tensor[source]#

Train a batch without pipeline parallelism.

Parameters:

data_iter – Data iterator.

Returns:

Loss tensor.

eval_batch_without_pipe_parallel(data_iter: Iterator | None = None) Tensor[source]#

Evaluate a batch without pipeline parallelism.

Parameters:

data_iter – Data iterator.

Returns:

Loss tensor.
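
A hedged sketch of stepping these methods by hand, assuming a DsTrainer instance named trainer already exists (see the training example below) and that train_iter and eval_iter are iterators over the corresponding data loaders.

```python
# Hypothetical manual loop; `trainer`, `train_iter`, and `eval_iter` are assumed
# to have been created elsewhere.
for step in range(10):
    train_loss = trainer.train_batch_without_pipe_parallel(data_iter=train_iter)
    print(f"step {step}: train loss = {train_loss.item():.4f}")

eval_loss = trainer.eval_batch_without_pipe_parallel(data_iter=eval_iter)
print(f"eval loss = {eval_loss.item():.4f}")
```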

train(resume_from_checkpoint: str | None = None, resume_optimizer_state: bool | None = True, resume_lr_scheduler_state: bool | None = True) None[source]#

Train a model.

Parameters:
  • resume_from_checkpoint – Path to checkpoint to resume training from.

  • resume_optimizer_state – Whether to resume optimizer state.

  • resume_lr_scheduler_state – Whether to resume learning rate scheduler state.
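
A minimal end-to-end sketch, assuming model, train_dataset, and eval_dataset are defined elsewhere and that a DeepSpeed JSON config exists at the illustrative path below (DsTrainingArguments is documented in the next subsection).

```python
from archai.trainers.nlp.ds_trainer import DsTrainer
from archai.trainers.nlp.ds_training_args import DsTrainingArguments

args = DsTrainingArguments(
    output_dir="./ds-output",    # illustrative output folder
    ds_config="ds_config.json",  # path to a DeepSpeed JSON config (assumed to exist)
    max_steps=1000,
    logging_steps=10,
    save_steps=500,
)

trainer = DsTrainer(
    model=model,  # any torch.nn.Module, assumed defined elsewhere
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Resume from a previously saved checkpoint, restoring optimizer and scheduler state.
trainer.train(
    resume_from_checkpoint="./ds-output/step-500",  # illustrative checkpoint path
    resume_optimizer_state=True,
    resume_lr_scheduler_state=True,
)
```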

evaluate(eval_dataset: Dataset) Tuple[float, float, float, float][source]#

Evaluate a model.

Parameters:

eval_dataset – Evaluation dataset.

Returns:

Evaluation loss, evaluation time, samples per second, and steps per second.
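
A short sketch of consuming the returned tuple, reusing the trainer and eval_dataset from the example above; the unpacking order follows the documented return value.

```python
eval_loss, eval_time, samples_per_second, steps_per_second = trainer.evaluate(eval_dataset)
print(f"eval loss: {eval_loss:.4f} | {samples_per_second:.1f} samples/s | {steps_per_second:.1f} steps/s")
```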

predict() None[source]#

Predict with a model.

Training Arguments#

class archai.trainers.nlp.ds_training_args.DsTrainingArguments(output_dir: str, ds_config: dict | str = <factory>, do_eval: bool = True, max_steps: int = 1, logging_steps: int = 10, save_steps: int = 500, seed: int = 42, local_rank: int = -1, backend: str = 'nccl', eval_steps: int = 500, eval_max_steps: int | None = None, pipe_parallel_size: int = 1, pipe_parallel_loss_fn: callable | None = None, pipe_parallel_partition_method: str = 'parameters', pipe_parallel_activation_checkpoint_steps: int = 0, dataloader_pin_memory: bool = True, dataloader_num_workers: int = 0)[source]#

Define arguments used in the DeepSpeed training pipeline.

Parameters:
  • output_dir – Output folder.

  • ds_config – DeepSpeed configuration (dictionary or path to JSON file).

  • do_eval – Whether to enable evaluation.

  • max_steps – Maximum number of training steps.

  • logging_steps – Number of steps between logs.

  • save_steps – Number of steps between checkpoints.

  • seed – Random seed.

  • local_rank – Local rank of the process.

  • backend – Distributed training backend.

  • eval_steps – Number of steps between evaluations.

  • pipe_parallel – Whether to use pipeline parallelism.

  • pipe_parallel_size – Size of pipeline parallelism.

  • pipe_parallel_loss_fn – Loss function for pipeline parallelism.

  • pipe_parallel_partition_method – Partition method for pipeline parallelism.

  • pipe_parallel_activation_checkpoint_steps – Number of steps between pipeline parallelism activation checkpoints.

  • eval_max_steps – Maximum number of steps per evaluation run.

  • dataloader_pin_memory – Whether to pin the data loaders' memory.

  • dataloader_num_workers – Number of subprocesses used by the data loaders.

output_dir: str#
ds_config: dict | str#
do_eval: bool = True#
max_steps: int = 1#
logging_steps: int = 10#
save_steps: int = 500#
seed: int = 42#
local_rank: int = -1#
backend: str = 'nccl'#
eval_steps: int = 500#
eval_max_steps: int = None#
pipe_parallel_size: int = 1#
pipe_parallel_loss_fn: callable = None#
pipe_parallel_partition_method: str = 'parameters'#
pipe_parallel_activation_checkpoint_steps: int = 0#
dataloader_pin_memory: bool = True#
dataloader_num_workers: int = 0#
to_dict() Dict[str, Any][source]#

Convert attributes into a dictionary representation.

Returns:

Attributes encoded as a dictionary.
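
A hedged sketch of building the arguments from an in-memory DeepSpeed configuration dictionary rather than a JSON file; the configuration keys shown are standard DeepSpeed options and the values are illustrative.

```python
from archai.trainers.nlp.ds_training_args import DsTrainingArguments

ds_config = {
    "train_batch_size": 256,
    "gradient_accumulation_steps": 1,
    "fp16": {"enabled": True},
}

args = DsTrainingArguments(
    output_dir="./ds-output",
    ds_config=ds_config,
    do_eval=True,
    eval_steps=500,
    pipe_parallel_size=1,
)

# Dump every attribute as a plain dictionary, e.g., for experiment logging.
print(args.to_dict())
```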

Hugging Face#

Callbacks#

class archai.trainers.nlp.hf_callbacks.BPCTrainerCallback(*args, **kwargs)[source]#

A TrainerCallback that adds bits per character metrics to the logs.

on_log(args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs) None[source]#

Add bits per character metrics to the training logs.

Parameters:
  • args – The training arguments.

  • state – The trainer state.

  • control – The trainer control.

on_evaluate(args: TrainingArguments, state: TrainerState, control: TrainerControl, metrics: Dict[str, float] | None = None, **kwargs) None[source]#

Add bits per character metrics to the evaluation metrics.

Parameters:
  • args – The training arguments.

  • state – The trainer state.

  • control – The trainer control.

  • metrics – The evaluation metrics.

class archai.trainers.nlp.hf_callbacks.PerplexityTrainerCallback(*args, **kwargs)[source]#

A TrainerCallback that adds perplexity metrics to the logs.

on_log(args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs) None[source]#

Add perplexity metrics to the training logs.

Parameters:
  • args – The training arguments.

  • state – The trainer state.

  • control – The trainer control.

on_evaluate(args: TrainingArguments, state: TrainerState, control: TrainerControl, metrics: Dict[str, float] | None = None, **kwargs) None[source]#

Add perplexity metrics to the evaluation metrics.

Parameters:
  • args – The training arguments.

  • state – The trainer state.

  • control – The trainer control.

  • metrics – The evaluation metrics.

Trainer#

class archai.trainers.nlp.hf_trainer.HfTrainer(model: PreTrainedModel | Module | None = None, args: TrainingArguments | None = None, data_collator: DataCollator | None = None, train_dataset: Dataset | None = None, eval_dataset: Dataset | Dict[str, Dataset] | None = None, tokenizer: PreTrainedTokenizerBase | None = None, model_init: Callable[[], PreTrainedModel] | None = None, compute_metrics: Callable[[EvalPrediction], Dict] | None = None, callbacks: List[TrainerCallback] | None = None, optimizers: Tuple[Optimizer, LambdaLR] = (None, None), preprocess_logits_for_metrics: Callable[[Tensor, Tensor], Tensor] | None = None)[source]#

Hugging Face trainer.
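
A hedged sketch combining HfTrainer with the callbacks from the previous subsection; model and the datasets are assumed to be defined elsewhere, and the callbacks are assumed to be constructible without arguments.

```python
from transformers import TrainingArguments

from archai.trainers.nlp.hf_callbacks import (
    BPCTrainerCallback,
    PerplexityTrainerCallback,
)
from archai.trainers.nlp.hf_trainer import HfTrainer

training_args = TrainingArguments(
    output_dir="./hf-output",  # illustrative output folder
    max_steps=1000,
    evaluation_strategy="steps",
    eval_steps=500,
)

trainer = HfTrainer(
    model=model,  # assumed defined elsewhere
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    callbacks=[BPCTrainerCallback(), PerplexityTrainerCallback()],
)

trainer.train()  # bits-per-character and perplexity entries are added to the logged metrics
```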

class archai.trainers.nlp.hf_trainer.HfDistillerTrainer(teacher_model: Module, **kwargs)[source]#

Hugging Face distillation-based trainer.

compute_loss(model: Module, inputs: Dict[str, Tensor], return_outputs: bool | None = False) Tuple[Tensor, ...][source]#

Override the computation of the loss function.

The loss is a weighted sum of the student’s loss, as computed by the original HfTrainer, and the KL divergence between the student and teacher models.

Parameters:
  • model – Student model.

  • inputs – Input tensors.

  • return_outputs – Whether outputs should be returned.

Returns:

(loss, outputs) or the loss tensor.

Training Arguments#

class archai.trainers.nlp.hf_training_args.DistillerTrainingArguments(output_dir: str, overwrite_output_dir: bool = False, do_train: bool = False, do_eval: bool = False, do_predict: bool = False, evaluation_strategy: IntervalStrategy | str = 'no', prediction_loss_only: bool = False, per_device_train_batch_size: int = 8, per_device_eval_batch_size: int = 8, per_gpu_train_batch_size: int | None = None, per_gpu_eval_batch_size: int | None = None, gradient_accumulation_steps: int = 1, eval_accumulation_steps: int | None = None, eval_delay: float | None = 0, learning_rate: float = 5e-05, weight_decay: float = 0.0, adam_beta1: float = 0.9, adam_beta2: float = 0.999, adam_epsilon: float = 1e-08, max_grad_norm: float = 1.0, num_train_epochs: float = 3.0, max_steps: int = -1, lr_scheduler_type: SchedulerType | str = 'linear', warmup_ratio: float = 0.0, warmup_steps: int = 0, log_level: str | None = 'passive', log_level_replica: str | None = 'warning', log_on_each_node: bool = True, logging_dir: str | None = None, logging_strategy: IntervalStrategy | str = 'steps', logging_first_step: bool = False, logging_steps: int = 500, logging_nan_inf_filter: bool = True, save_strategy: IntervalStrategy | str = 'steps', save_steps: int = 500, save_total_limit: int | None = None, save_on_each_node: bool = False, no_cuda: bool = False, use_mps_device: bool = False, seed: int = 42, data_seed: int | None = None, jit_mode_eval: bool = False, use_ipex: bool = False, bf16: bool = False, fp16: bool = False, fp16_opt_level: str = 'O1', half_precision_backend: str = 'auto', bf16_full_eval: bool = False, fp16_full_eval: bool = False, tf32: bool | None = None, local_rank: int = -1, xpu_backend: str | None = None, tpu_num_cores: int | None = None, tpu_metrics_debug: bool = False, debug: str = '', dataloader_drop_last: bool = False, eval_steps: int | None = None, dataloader_num_workers: int = 0, past_index: int = -1, run_name: str | None = None, disable_tqdm: bool | None = None, remove_unused_columns: bool | None = True, label_names: List[str] | None = None, load_best_model_at_end: bool | None = False, metric_for_best_model: str | None = None, greater_is_better: bool | None = None, ignore_data_skip: bool = False, sharded_ddp: str = '', fsdp: str = '', fsdp_min_num_params: int = 0, fsdp_config: str | None = None, fsdp_transformer_layer_cls_to_wrap: str | None = None, deepspeed: str | None = None, label_smoothing_factor: float = 0.0, optim: OptimizerNames | str = 'adamw_hf', optim_args: str | None = None, adafactor: bool = False, group_by_length: bool = False, length_column_name: str | None = 'length', report_to: List[str] | None = None, ddp_find_unused_parameters: bool | None = None, ddp_bucket_cap_mb: int | None = None, dataloader_pin_memory: bool = True, skip_memory_metrics: bool = True, use_legacy_prediction_loop: bool = False, push_to_hub: bool = False, resume_from_checkpoint: str | None = None, hub_model_id: str | None = None, hub_strategy: HubStrategy | str = 'every_save', hub_token: str | None = None, hub_private_repo: bool = False, gradient_checkpointing: bool = False, include_inputs_for_metrics: bool = False, fp16_backend: str = 'auto', push_to_hub_model_id: str | None = None, push_to_hub_organization: str | None = None, push_to_hub_token: str | None = None, mp_parameters: str = '', auto_find_batch_size: bool = False, full_determinism: bool = False, torchdynamo: str | None = None, ray_scope: str | None = 'last', ddp_timeout: int | None = 1800, torch_compile: bool = False, torch_compile_backend: str | None = None, torch_compile_mode: str | None = None, alpha: float = 0.5, temperature: float = 1.0)[source]#

Training arguments for distillation-based training.

This class extends TrainingArguments and provides additional arguments specific to distillation-based training.

Parameters:
  • alpha – Weight ratio between the student and KD losses. This should be a value in the range [0, 1].

  • temperature – Temperature (annealing ratio) applied to the softmax activations. This value should be greater than 0.

alpha: float = 0.5#
temperature: float = 1.0#
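
A hedged sketch of distillation-based training; the teacher and student models and the dataset are assumed to be defined elsewhere, and the extra keyword arguments are assumed to be forwarded to the underlying HfTrainer.

```python
from archai.trainers.nlp.hf_trainer import HfDistillerTrainer
from archai.trainers.nlp.hf_training_args import DistillerTrainingArguments

args = DistillerTrainingArguments(
    output_dir="./distill-output",  # illustrative output folder
    max_steps=1000,
    alpha=0.5,        # equal weight for the student and KD losses
    temperature=2.0,  # softens the distributions compared by the KL term
)

trainer = HfDistillerTrainer(
    teacher_model=teacher_model,  # assumed defined elsewhere
    model=student_model,          # forwarded to HfTrainer (assumption)
    args=args,
    train_dataset=train_dataset,
)

trainer.train()
```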

NVIDIA#

Trainer#

archai.trainers.nlp.nvidia_trainer.save_checkpoint(output_dir: str, model: Module, optimizer: Optimizer, scheduler: _LRScheduler, scaler: GradScaler, trainer_state: Dict[str, Any], fp16: bool, prefix: str | None = '', save_all_checkpoints: bool | None = False, is_best_model: bool | None = False) None[source]#

Save a checkpoint that holds enough information to resume the training.

The checkpoint contains the model’s configuration and state, the optimizer’s state, the scheduler’s state, the scaler’s state (if FP16 precision is used), and the trainer’s state.

If is_best_model is True, the function will also save a copy of the checkpoint with the prefix “checkpoint-best”.

If save_all_checkpoints is True, the function will also save a copy of the checkpoint with the step number in the file name.

Parameters:
  • output_dir – Folder where checkpoint should be saved.

  • model – Instance of model.

  • optimizer – Instance of optimizer.

  • scheduler – Instance of scheduler.

  • scaler – Instance of scaler.

  • trainer_state – Current trainer state.

  • fp16 – Whether fp16 precision is used or not.

  • prefix – Prefix which should be added to the checkpoint’s file name.

  • save_all_checkpoints – Whether a step-numbered checkpoint should be saved at every eval_steps interval.

  • is_best_model – Whether best model should be saved.
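
An illustrative call, assuming the model, optimizer, scheduler, scaler, and step counter below were produced during training.

```python
from archai.trainers.nlp.nvidia_trainer import save_checkpoint

save_checkpoint(
    output_dir="./nvidia-output",  # illustrative folder
    model=model,
    optimizer=optimizer,
    scheduler=scheduler,
    scaler=scaler,
    trainer_state={"step": 1000},  # illustrative trainer state
    fp16=False,                    # scaler state is only saved when this is True
    prefix="",
    save_all_checkpoints=False,
    is_best_model=True,            # also writes a "checkpoint-best" copy
)
```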

class archai.trainers.nlp.nvidia_trainer.NvidiaTrainer(model: Module, args: NvidiaTrainingArguments | None = None)[source]#

NVIDIA-based trainer.

load_checkpoint(checkpoint_file_path: str) Tuple[int, int, int, int][source]#

Load states from a checkpoint file.

Parameters:

checkpoint_file_path – Path to the checkpoint file.

Returns:

Current iterator, epoch, batch, and step values.

train(checkpoint_file_path: str | None = '') Dict[str, Any][source]#

Train a model.

Parameters:

checkpoint_file_path – Path to the checkpoint that will be used to resume the training.

Returns:

Training-related metrics.

evaluate(eval_dataloader: Iterator | None = None) Dict[str, Any][source]#

Evaluate a model.

Parameters:

eval_dataloader – Evaluation data loader. If not supplied, it defaults to the one built from the pre-loaded dataset.

Returns:

Evaluation-related metrics.
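
A minimal usage sketch, assuming model is a compatible torch.nn.Module defined elsewhere; most fields of NvidiaTrainingArguments (documented below) keep their defaults.

```python
from archai.trainers.nlp.nvidia_trainer import NvidiaTrainer
from archai.trainers.nlp.nvidia_training_args import NvidiaTrainingArguments

args = NvidiaTrainingArguments(
    experiment_name="my-experiment",  # illustrative name
    dataset_name="wt103",
    max_steps=1000,
    eval_steps=100,
)

trainer = NvidiaTrainer(model=model, args=args)

train_metrics = trainer.train()
eval_metrics = trainer.evaluate()  # falls back to the pre-loaded dataset's loader
```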

predict() None[source]#

Predict with a model.

fine_tune_qat(model: Module | None = None, checkpoint_file_path: str | None = '') None[source]#

Fine-tune a model with QAT.

Users may pass a different model (e.g., one without dropout) than the one supplied to NvidiaTrainer, as well as a pre-trained checkpoint file to load weights from a previous training run.

Parameters:
  • model – Model to be fine-tuned.

  • checkpoint_file_path – Path to the checkpoint used to resume training.
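
A hedged sketch, reusing the trainer from the example above; qat_model and the checkpoint path are illustrative placeholders.

```python
# Fine-tune a dropout-free variant of the model with QAT, starting from a
# previously saved checkpoint (both names are placeholders).
trainer.fine_tune_qat(
    model=qat_model,
    checkpoint_file_path="./nvidia-output/checkpoint.pt",
)
```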

Training Arguments#

class archai.trainers.nlp.nvidia_training_args.NvidiaTrainingArguments(experiment_name: str, checkpoint_file_path: str = '', output_dir: str = '~/logdir', seed: int = 42, no_cuda: bool = False, logging_steps: int = 10, do_eval: bool = True, eval_steps: int = 100, save_all_checkpoints: bool = False, dataset_name: str = 'wt103', dataset_dir: str = '', dataset_cache_dir: str = 'cache', dataset_refresh_cache: bool = False, vocab_type: str = 'gpt2', vocab_size: int = 10000, iterator_roll: bool = True, global_batch_size: int = 256, per_device_global_batch_size: int | None = None, seq_len: int = 192, strategy: str = 'ddp', local_rank: int = 0, find_unused_parameters: bool = False, max_steps: int = 40000, gradient_accumulation_steps: int = 1, fp16: bool = False, optim: str = 'jitlamb', learning_rate: float = 0.01, weight_decay: float = 0.0, momentum: float = 0.0, max_grad_norm: float = 0.25, lr_scheduler_type: str = 'cosine', lr_qat_scheduler_type: str = 'cosine', lr_scheduler_max_steps: int | None = None, lr_scheduler_warmup_steps: int = 1000, lr_scheduler_patience: float = 0, lr_scheduler_min_lr: float = 0.001, lr_scheduler_decay_rate: float = 0.5, qat: bool = False, mixed_qat: bool = False)[source]#

Define arguments used in the NVIDIA training pipeline.

Parameters:
  • experiment_name – Name of the experiment.

  • checkpoint_file_path – Path to the checkpoint file.

  • output_dir – Output folder.

  • seed – Random seed.

  • no_cuda – Whether to disable CUDA.

  • logging_steps – Number of steps between logs.

  • do_eval – Whether to enable evaluation.

  • eval_steps – Number of steps between evaluations.

  • save_all_checkpoints – Whether to save a step-numbered checkpoint at every eval_steps interval.

  • dataset_name – Name of the dataset.

  • dataset_dir – Dataset folder.

  • dataset_cache_dir – Dataset cache folder.

  • dataset_refresh_cache – Whether cache should be refreshed.

  • vocab_type – Name of the vocabulary/tokenizer.

  • vocab_size – Size of the vocabulary.

  • iterator_roll – Whether iterator should be rolled.

  • global_batch_size – Global batch size.

  • per_device_global_batch_size – Individual GPU batch size.

  • seq_len – Sequence length.

  • strategy – Distributed training strategy.

  • local_rank – Local rank of process.

  • find_unused_parameters – Whether DDP should find unused parameters.

  • max_steps – Maximum number of training steps.

  • gradient_accumulation_steps – Number of gradient accumulation steps.

  • fp16 – Whether FP16 precision should be used.

  • optim – Name of the optimizer.

  • learning_rate – Optimizer learning rate.

  • weight_decay – Optimizer weight decay.

  • momentum – Optimizer momentum.

  • max_grad_norm – Maximum gradient norm used for clipping.

  • lr_scheduler_type – Name of the scheduler.

  • lr_qat_scheduler_type – Name of the QAT-based scheduler.

  • lr_scheduler_max_steps – Maximum number of scheduler steps.

  • lr_scheduler_warmup_steps – Number of warmup steps for the scheduler.

  • lr_scheduler_patience – Scheduler patience.

  • lr_scheduler_min_lr – Scheduler minimum learning rate.

  • lr_scheduler_decay_rate – Scheduler decay rate.

  • qat – Whether QAT should be used during training.

  • mixed_qat – Whether MixedQAT should be used during training.

experiment_name: str#
checkpoint_file_path: str = ''#
output_dir: str = '~/logdir'#
seed: int = 42#
no_cuda: bool = False#
logging_steps: int = 10#
do_eval: bool = True#
eval_steps: int = 100#
save_all_checkpoints: bool = False#
dataset_name: str = 'wt103'#
dataset_dir: str = ''#
dataset_cache_dir: str = 'cache'#
dataset_refresh_cache: bool = False#
vocab_type: str = 'gpt2'#
vocab_size: int = 10000#
iterator_roll: bool = True#
global_batch_size: int = 256#
per_device_global_batch_size: int = None#
seq_len: int = 192#
strategy: str = 'ddp'#
local_rank: int = 0#
find_unused_parameters: bool = False#
max_steps: int = 40000#
gradient_accumulation_steps: int = 1#
fp16: bool = False#
optim: str = 'jitlamb'#
learning_rate: float = 0.01#
weight_decay: float = 0.0#
momentum: float = 0.0#
max_grad_norm: float = 0.25#
lr_scheduler_type: str = 'cosine'#
lr_qat_scheduler_type: str = 'cosine'#
lr_scheduler_max_steps: int = None#
lr_scheduler_warmup_steps: int = 1000#
lr_scheduler_patience: float = 0#
lr_scheduler_min_lr: float = 0.001#
lr_scheduler_decay_rate: float = 0.5#
qat: bool = False#
mixed_qat: bool = False#
property device: device#

Return a PyTorch device instance.

to_dict() Dict[str, Any][source]#

Convert attributes into a dictionary representation.

Returns:

Attributes encoded as a dictionary.
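
A short sketch of overriding a few defaults and inspecting the resulting arguments; the values are illustrative.

```python
from archai.trainers.nlp.nvidia_training_args import NvidiaTrainingArguments

args = NvidiaTrainingArguments(
    experiment_name="qat-run",
    global_batch_size=256,
    seq_len=192,
    optim="jitlamb",
    qat=True,
)

print(args.device)     # presumably resolved from no_cuda and CUDA availability
print(args.to_dict())  # attributes as a plain dictionary
```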