Source code for archai.trainers.nlp.nvidia_training_args

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import os
from dataclasses import asdict, dataclass, field
from typing import Any, Dict, Optional, Tuple

import numpy as np
import torch

from archai.common.distributed_utils import (
    get_world_size,
    init_distributed,
    sync_workers,
)
from archai.common.file_utils import get_full_path


def _get_amlt_dirs() -> Tuple[str, str]:
    data_dir = os.environ.get("AMLT_DATA_DIR", "")
    output_dir = os.environ.get("AMLT_OUTPUT_DIR", "")

    return data_dir, output_dir


def _get_default_dataroot() -> str:
    is_amlt_available = os.environ.get("AMLT_OUTPUT_DIR", None)

    return "/var/tmp/dataroot" if is_amlt_available else "~/dataroot"


def _create_dirs(
    dataroot: str,
    dataset_name: str,
    experiment_name: Optional[str] = "tmp",
    output_dir: Optional[str] = "~/logdir",
    pretrained_path: Optional[str] = "",
    cache_dir: Optional[str] = "",
) -> Tuple[str, str, str, str]:
    def _get_dataset_dir_name(dataset_name: str) -> str:
        if dataset_name == "wt2":
            return "wikitext-2"
        if dataset_name == "wt103":
            return "wikitext-103"
        if dataset_name == "lm1b":
            return "one-billion-words"
        if dataset_name.startswith("olx_"):
            return dataset_name

        raise RuntimeError(f"Dataset: {dataset_name} is not supported yet.")

    pt_data_dir, pt_output_dir = _get_amlt_dirs()
    if pt_output_dir:
        pt_output_dir = os.path.join(pt_output_dir, experiment_name)

    dataroot = dataroot or pt_data_dir or _get_default_dataroot()
    dataroot = get_full_path(dataroot)

    dataset_dir = get_full_path(os.path.join(dataroot, "textpred", _get_dataset_dir_name(dataset_name)))
    output_dir = get_full_path(pt_output_dir or os.path.join(output_dir, experiment_name), create_folder=True)

    if not os.path.isabs(cache_dir):
        cache_dir = os.path.join(dataset_dir, cache_dir)
    cache_dir = get_full_path(cache_dir, create_folder=True)

    if not os.path.isabs(pretrained_path) and pretrained_path:
        pretrained_path = os.path.join(os.path.dirname(output_dir), pretrained_path)

    return dataset_dir, output_dir, pretrained_path, cache_dir
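

# Illustrative sketch (assumes the AMLT_* environment variables are unset): a call
# such as
#
#     dataset_dir, output_dir, pretrained_path, cache_dir = _create_dirs(
#         "", "wt103", experiment_name="my_exp", output_dir="~/logdir", cache_dir="cache"
#     )
#
# resolves `dataset_dir` to "~/dataroot/textpred/wikitext-103" (expanded to a full path),
# `output_dir` to "~/logdir/my_exp", and `cache_dir` to a "cache" folder inside
# `dataset_dir`; the output and cache folders are created if they do not exist.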


[docs]@dataclass
class NvidiaTrainingArguments:
    """Define arguments used in the NVIDIA training pipeline.

    Args:
        experiment_name: Name of the experiment.
        checkpoint_file_path: Path to the checkpoint file.
        output_dir: Output folder.
        seed: Random seed.
        no_cuda: Whether CUDA should not be used.
        logging_steps: Number of steps between logs.
        do_eval: Whether to enable evaluation.
        eval_steps: Number of steps between evaluations.
        save_all_checkpoints: Whether to save all checkpoints from `eval_steps` steps.
        dataset_name: Name of the dataset.
        dataset_dir: Dataset folder.
        dataset_cache_dir: Dataset cache folder.
        dataset_refresh_cache: Whether cache should be refreshed.
        vocab_type: Name of the vocabulary/tokenizer.
        vocab_size: Size of the vocabulary.
        iterator_roll: Whether iterator should be rolled.
        global_batch_size: Global batch size.
        per_device_global_batch_size: Individual GPU batch size.
        seq_len: Sequence length.
        strategy: Distributed training strategy.
        local_rank: Local rank of process.
        find_unused_parameters: Whether unused parameters should be found.
        max_steps: Maximum number of training steps.
        gradient_accumulation_steps: Number of gradient accumulation steps.
        fp16: Whether FP16 precision should be used.
        optim: Name of the optimizer.
        learning_rate: Optimizer learning rate.
        weight_decay: Optimizer weight decay.
        momentum: Optimizer momentum.
        max_grad_norm: Optimizer gradients clipping value.
        lr_scheduler_type: Name of the scheduler.
        lr_qat_scheduler_type: Name of the QAT-based scheduler.
        lr_scheduler_max_steps: Maximum number of scheduler steps.
        lr_scheduler_warmup_steps: Number of warmup steps for the scheduler.
        lr_scheduler_patience: Scheduler patience.
        lr_scheduler_min_lr: Scheduler minimum learning rate.
        lr_scheduler_decay_rate: Scheduler decay rate.
        qat: Whether QAT should be used during training.
        mixed_qat: Whether MixedQAT should be used during training.
""" experiment_name: str = field(metadata={"help": "Name of the experiment."}) checkpoint_file_path: str = field(default="", metadata={"help": "Path to the checkpoint file."}) output_dir: str = field(default="~/logdir", metadata={"help": "Output folder."}) seed: int = field(default=42, metadata={"help": "Random seed."}) no_cuda: bool = field(default=False, metadata={"help": "Whether CUDA should not be used."}) logging_steps: int = field(default=10, metadata={"help": "Number of steps between logs."}) do_eval: bool = field(default=True, metadata={"help": "Whether to enable evaluation."}) eval_steps: int = field(default=100, metadata={"help": "Number of steps between evaluations."}) save_all_checkpoints: bool = field( default=False, metadata={"help": "Whether to save all checkpoints from `eval_steps` steps."} ) dataset_name: str = field(default="wt103", metadata={"help": "Name of the dataset."}) dataset_dir: str = field(default="", metadata={"help": "Dataset folder."}) dataset_cache_dir: str = field(default="cache", metadata={"help": "Dataset cache folder."}) dataset_refresh_cache: bool = field(default=False, metadata={"help": "Whether cache should be refreshed."}) vocab_type: str = field(default="gpt2", metadata={"help": "Type of the tokenizer."}) vocab_size: int = field(default=10000, metadata={"help": "Size of the vocabulary"}) iterator_roll: bool = field(default=True, metadata={"help": "Whether iterator should be rolled."}) global_batch_size: int = field(default=256, metadata={"help": "Global batch size."}) per_device_global_batch_size: int = field(default=None, metadata={"help": "Individual GPU batch size."}) seq_len: int = field(default=192, metadata={"help": "Sequence length."}) strategy: str = field(default="ddp", metadata={"help": "Multi-GPU strategy."}) local_rank: int = field(default=os.getenv("LOCAL_RANK", 0), metadata={"help": "Local rank of process."}) find_unused_parameters: bool = field(default=False, metadata={"help": "Whether unused parameters should be found."}) max_steps: int = field(default=40000, metadata={"help": "Maximum number of training steps."}) gradient_accumulation_steps: int = field(default=1, metadata={"help": "Number of gradient accumulation steps."}) fp16: bool = field(default=False, metadata={"help": "Whether FP16 precision should be used."}) optim: str = field(default="jitlamb", metadata={"help": "Name of the optimizer."}) learning_rate: float = field(default=0.01, metadata={"help": "Optimizer learning rate."}) weight_decay: float = field(default=0.0, metadata={"help": "Optimizer weight decay."}) momentum: float = field(default=0.0, metadata={"help": "Optimizer momentum."}) max_grad_norm: float = field(default=0.25, metadata={"help": "Optimizer gradients clipping value."}) lr_scheduler_type: str = field(default="cosine", metadata={"help": "Name of the scheduler."}) lr_qat_scheduler_type: str = field(default="cosine", metadata={"help": "Name of the QAT-based scheduler."}) lr_scheduler_max_steps: int = field(default=None, metadata={"help": "Maximum number of scheduler steps."}) lr_scheduler_warmup_steps: int = field(default=1000, metadata={"help": "Number of scheduler warmup steps."}) lr_scheduler_patience: float = field(default=0, metadata={"help": "Scheduler patience."}) lr_scheduler_min_lr: float = field(default=0.001, metadata={"help": "Scheduler minimum learning rate."}) lr_scheduler_decay_rate: float = field(default=0.5, metadata={"help": "Scheduler decay rate."}) qat: bool = field(default=False, metadata={"help": "Whether QAT should be used during 
training."}) mixed_qat: bool = field(default=False, metadata={"help": "Whether MixedQAT should be used during training."}) @property def device(self) -> torch.device: """Return a PyTorch device instance.""" return torch.device("cuda" if not self.no_cuda else "cpu") def __post_init__(self) -> None: """Override post-initialization with custom instructions. Ensure that `qat` and `mixed_qat` are not used together, set the random seed, initialize distributed training, create necessary directories, and set the global batch size. """ assert not (self.qat and self.mixed_qat), "`qat` and `mixed_qat` should not be used together." np.random.seed(self.seed) torch.manual_seed(self.seed) self.local_rank = int(self.local_rank) if not self.no_cuda: torch.cuda.set_device(self.local_rank) init_distributed(True) (self.dataset_dir, self.output_dir, self.checkpoint_file_path, self.dataset_cache_dir,) = _create_dirs( self.dataset_dir, self.dataset_name, self.experiment_name, self.output_dir, self.checkpoint_file_path, self.dataset_cache_dir, ) with sync_workers() as rank: if rank == 0: os.makedirs(self.output_dir, exist_ok=True) if self.per_device_global_batch_size is not None: world_size = get_world_size() self.global_batch_size = world_size * self.per_device_global_batch_size
[docs]    def to_dict(self) -> Dict[str, Any]:
        """Convert attributes into a dictionary representation.

        Returns:
            Attributes encoded as a dictionary.

        """

        return asdict(self)
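
A minimal usage sketch, assuming a single-process, CPU-only environment where it is acceptable to create folders under the default dataroot and log locations ("my_experiment" is a placeholder name):

    from archai.trainers.nlp.nvidia_training_args import NvidiaTrainingArguments

    # Instantiating the arguments triggers `__post_init__`: it seeds the RNGs and
    # resolves/creates the dataset, output, and cache folders.
    args = NvidiaTrainingArguments("my_experiment", dataset_name="wt103", no_cuda=True)

    print(args.device)        # cpu, since no_cuda=True
    print(args.output_dir)    # resolved output folder, e.g. under ~/logdir/my_experiment
    print(args.to_dict()["global_batch_size"])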