"""tasks.py

This contains all the configuration dataclasses needed to configure AzureML pipelines.
"""
from dataclasses import dataclass
from omegaconf import MISSING
from typing import Any, Optional
@dataclass
class data_input_spec:
    """Reference to one input dataset of an AzureML pipeline.

    Exactly one of the three addressing schemes below is expected to be
    filled in. Union types are not supported by Hydra/OmegaConf, hence
    the flat list of mutually exclusive optional fields:
      1. registered dataset: name (+ optional version),
      2. non-registered dataset: uuid,
      3. raw location: datastore + path.
    """
    # scheme 1: registered dataset name and version
    name: Optional[str] = None
    version: Optional[str] = None
    # scheme 2: uuid of a non-registered dataset
    uuid: Optional[str] = None
    # scheme 3: datastore name plus path within that datastore
    datastore: Optional[str] = None
    path: Optional[str] = None
    # whether to validate the reference -- presumably at pipeline build time, TODO confirm
    validate: bool = True
@dataclass
class inferencing_task:
    """Configuration of one inferencing task: the data to score and the model to score with."""
    # dataset to run inferencing on (see data_input_spec for the addressing schemes)
    data: data_input_spec = MISSING
    # trained model to load for scoring (also addressed as a data input)
    model: data_input_spec = MISSING
    # optional key for internal tagging + reporting
    task_key: Optional[str] = None
    # skip shape validation during predict -- presumably LightGBM's predict_disable_shape_check, TODO confirm consumer
    predict_disable_shape_check: bool = False
@dataclass
class inferencing_variants:
    """One inferencing variant: which framework (and optionally which build of it) to run."""
    # inferencing framework identifier (required)
    framework: str = MISSING
    # optional specific build of the framework -- format not visible here (docker tag? path?), TODO confirm
    build: Optional[str] = None
@dataclass
class data_generation_task:
    """Configuration for generating synthetic train/test/inferencing datasets."""
    # generation task type (required) -- accepted values not visible here, TODO confirm against generation script
    task: str = MISSING
    # optional key for internal tagging + reporting
    task_key: Optional[str] = None
    # number of samples and number of output partitions for each split
    train_samples: int = MISSING
    train_partitions: int = 1
    test_samples: int = MISSING
    test_partitions: int = 1
    inferencing_samples: int = MISSING
    inferencing_partitions: int = 1
    # feature-space parameters (required: total feature count)
    n_features: int = MISSING
    n_informative: Optional[int] = None
    n_label_classes: Optional[int] = None
    # presumably used for ranking-style tasks only -- verify against generation script
    docs_per_query: Optional[int] = None
    # output format: delimiter is a symbolic name ("comma"), not the literal character
    delimiter: str = "comma"
    header: bool = False
@dataclass
class training_task:
    """Configuration of one training task: the train and test datasets to use."""
    # training dataset (see data_input_spec for the addressing schemes)
    train: data_input_spec = MISSING
    # test dataset
    test: data_input_spec = MISSING
    # provide a key for internal tagging + reporting
    task_key: Optional[str] = None
@dataclass
class sweep_early_termination_settings:
    """Early-termination policy settings for an AzureML hyperparameter sweep."""
    # policy name: default | truncation_selection | median_stopping | bandit
    # (double quotes for consistency with every other string default in this file)
    policy_type: str = "default"
    # settings shared by the policies
    evaluation_interval: Optional[int] = None
    delay_evaluation: Optional[int] = None
    # truncation settings
    truncation_percentage: Optional[int] = None  # for truncation_selection
    # bandit settings
    slack_factor: Optional[float] = None
@dataclass
class sweep_limits_settings:
    """Resource limits for an AzureML hyperparameter sweep."""
    # total number of trials to run (required)
    max_total_trials: int = MISSING
    max_concurrent_trials: Optional[int] = None # must be between 1 and 100
    # overall sweep timeout, in minutes
    timeout_minutes: Optional[int] = None
@dataclass
class sweep_settings:
    """Top-level AzureML sweep configuration: goal, search algorithm, policy and limits."""
    # TODO: add all parameters from shrike https://github.com/Azure/shrike/blob/387fadb47d69e46bd7e5ac6f243250dc6044afaa/shrike/pipeline/pipeline_helper.py#L809
    # goal settings
    primary_metric: Optional[str] = None
    goal: Optional[str] = None  # presumably "minimize"/"maximize" -- TODO confirm accepted values
    # search algorithm used by the sweep (e.g. "random")
    algorithm: str = "random"
    # optional early-termination policy and resource limits
    early_termination: Optional[sweep_early_termination_settings] = None
    limits: Optional[sweep_limits_settings] = None
@dataclass
class lightgbm_training_variant_parameters:
    """LightGBM training parameters for one variant of the training pipeline."""
    # fixed training parameters
    objective: str = MISSING
    metric: str = MISSING
    boosting: str = MISSING
    tree_learner: str = MISSING
    # sweepable training parameters
    # NOTE: need to be str so they can be parsed (ex: 'choice(100,200)')
    num_iterations: str = MISSING
    num_leaves: str = MISSING
    min_data_in_leaf: str = MISSING
    learning_rate: str = MISSING
    max_bin: str = MISSING
    feature_fraction: str = MISSING
    # presumably only relevant for ranking objectives -- verify against LightGBM docs
    label_gain: Optional[str] = None
    # free-form extra parameters passed through to training -- schema not visible here, TODO confirm consumer
    custom_params: Optional[Any] = None
    # COMPUTE
    device_type: str = "cpu"
    multinode_driver: str = "socket"
    verbose: bool = False
@dataclass
class lightgbm_training_data_variant_parameters:
    """Data-handling options for one LightGBM training variant.

    Covers how input files are partitioned/converted, how columns are
    interpreted, and the declared formats of the train/test inputs.
    """
    # FILE OPTIONS
    auto_partitioning: bool = True
    pre_convert_to_binary: bool = False # doesn't work with partitioned data (yet)
    # input parameters: column layout of the input files
    header: bool = False
    label_column: Optional[str] = "0"
    group_column: Optional[str] = None
    # whether to use dataset construction -- TODO confirm how the training script consumes this
    construct: bool = True
    # declared formats of the train/test data inputs
    train_data_format: Optional[str] = None
    test_data_format: Optional[str] = None
@dataclass
class lightgbm_training_environment_variant_parameters:
    """Compute environment for one LightGBM training variant."""
    # COMPUTE: cluster size (nodes) and processes per... -- per node, presumably; TODO confirm
    nodes: int = 1
    processes: int = 1
    # optional compute target name and environment build
    target: Optional[str] = None
    build: Optional[str] = None
@dataclass
class lightgbm_training_output_variant_parameters:
    """Model-registration options for one LightGBM training variant."""
    # if True, register the trained model under the name
    # "{register_model_prefix}-{task_key}-{num_iterations}trees-{num_leaves}leaves-{register_model_suffix}"
    register_model: bool = False
    # optional prefix/suffix used to build the registered model name
    register_model_prefix: Optional[str] = None
    register_model_suffix: Optional[str] = None
@dataclass
class training_variant:
    """One complete training variant: framework plus its data/training/runtime parameter groups."""
    # below are mandatory sections of the variant config
    framework: str = MISSING
    data: lightgbm_training_data_variant_parameters = MISSING
    training: lightgbm_training_variant_parameters = MISSING
    runtime: lightgbm_training_environment_variant_parameters = MISSING
    # below are optional
    raytune: Optional[Any] = None  # free-form raytune config -- schema not visible here, TODO confirm
    sweep: Optional[sweep_settings] = None
    # PEP8/consistency fix: space after colon (was `output:Optional[...]`)
    output: Optional[lightgbm_training_output_variant_parameters] = None