# Source code for archai.datasets.nlp.tokenizer_utils.token_config

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from collections import OrderedDict
from enum import Enum
from typing import List, Optional


def _dedup_list(input_list: List[str]) -> List[str]:
    return list(OrderedDict.fromkeys(input_list))


class SpecialTokenEnum(Enum):
    """Enumerate special tokens."""

    UNK = 0
    BOS = 1
    EOS = 2
    PAD = 3
    MASK = 4


class TokenConfig:
    """Store and access configuration options for special tokens, such as BOS, EOS, UNK, and PAD."""

    def __init__(
        self,
        bos_token: Optional[str] = "<|endoftext|>",
        eos_token: Optional[str] = "<|endoftext|>",
        unk_token: Optional[str] = "<|endoftext|>",
        pad_token: Optional[str] = None,
        add_prefix_space: Optional[bool] = False,
        add_prefix_new_line: Optional[bool] = False,
        lower_case: Optional[bool] = False,
    ) -> None:
        """Initialize the `TokenConfig` class by setting the specified attributes.

        Args:
            bos_token: Begin-of-sentence token.
            eos_token: End-of-sentence token.
            unk_token: Unknown token.
            pad_token: Padding token.
            add_prefix_space: Whether a prefix space token should be added.
            add_prefix_new_line: Whether a prefix new line token should be added.
            lower_case: Whether lower case should be applied.

        """

        self.bos_token = bos_token
        self.eos_token = eos_token
        self.unk_token = unk_token
        self.pad_token = pad_token
        self.add_prefix_space = add_prefix_space
        self.add_prefix_new_line = add_prefix_new_line
        self.lower_case = lower_case

    def get_special_tokens(self) -> List[str]:
        """Return a list of all available special tokens.

        Tokens are considered in the order UNK, BOS, EOS, PAD. Unset tokens
        (`None` or empty string) are skipped, and tokens sharing the same
        string are reported once, at the position of their first occurrence.

        Returns:
            Special tokens.

        """

        candidates = (self.unk_token, self.bos_token, self.eos_token, self.pad_token)

        return _dedup_list([stok for stok in candidates if stok])

    def special_token_name(self, sp: SpecialTokenEnum) -> Optional[str]:
        """Return the name of a special token.

        Args:
            sp: Special token enumerator.

        Returns:
            Special token name, or `None` when the corresponding token is unset
            or `sp` has no associated attribute in this configuration
            (e.g., `SpecialTokenEnum.MASK`).

        """

        if sp == SpecialTokenEnum.BOS:
            return self.bos_token
        if sp == SpecialTokenEnum.EOS:
            return self.eos_token
        if sp == SpecialTokenEnum.UNK:
            return self.unk_token
        if sp == SpecialTokenEnum.PAD:
            return self.pad_token

        # Any other member (such as MASK) is not stored by this class.
        return None