Tokenization Utilities#

Tokenizer (Base Class)#

class archai.datasets.nlp.tokenizer_utils.tokenizer_base.TokenizerBase[source]#

Abstract class for tokenizers.

This class serves as a base for training, encoding, and decoding. Subclasses must implement nine methods: __len__, train, is_trained, load, encode_text, decode_text, special_token_id, token_to_id, and id_to_token.

Note

This class inherits from EnforceOverrides, and any method overridden in a subclass should be decorated with @overrides to ensure it is properly overridden.
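
For illustration, here is a minimal sketch of a concrete subclass that satisfies the abstract interface. The WhitespaceTokenizer class and its in-memory vocabulary are invented for this example, and the @overrides decorator is assumed to come from the overrides package (which also provides EnforceOverrides):

from typing import List

from overrides import overrides

from archai.datasets.nlp.tokenizer_utils.tokenizer_base import TokenizerBase
from archai.datasets.nlp.tokenizer_utils.token_config import SpecialTokenEnum


class WhitespaceTokenizer(TokenizerBase):
    # Hypothetical tokenizer that splits text on whitespace.

    def __init__(self) -> None:
        self.vocab = {}      # token -> identifier
        self.inv_vocab = {}  # identifier -> token

    @overrides
    def __len__(self) -> int:
        return len(self.vocab)

    @overrides
    def train(self, filepaths: List[str]) -> None:
        # Build the vocabulary from whitespace-split tokens.
        for path in filepaths:
            with open(path, encoding="utf-8") as f:
                for token in f.read().split():
                    if token not in self.vocab:
                        idx = len(self.vocab)
                        self.vocab[token] = idx
                        self.inv_vocab[idx] = token

    @overrides
    def is_trained(self) -> bool:
        return len(self.vocab) > 0

    @overrides
    def load(self) -> None:
        pass  # no-op: this sketch keeps everything in memory

    @overrides
    def encode_text(self, text: str) -> List[int]:
        return [self.vocab[t] for t in text.split()]

    @overrides
    def decode_text(self, ids: List[int]) -> str:
        return " ".join(self.inv_vocab[i] for i in ids)

    @overrides
    def special_token_id(self, sp: SpecialTokenEnum) -> int:
        return -1  # this sketch defines no special tokens

    @overrides
    def token_to_id(self, t: str) -> int:
        return self.vocab[t]

    @overrides
    def id_to_token(self, id: int) -> str:
        return self.inv_vocab[id]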

abstract train(filepaths: List[str]) → None[source]#

Train the tokenizer on a list of files.

Parameters:

filepaths – A list of paths to input files.

abstract is_trained() → bool[source]#

Check if the vocabulary has been trained.

Returns:

True if the vocabulary has been trained, False otherwise.

abstract load() → None[source]#

Load a pre-trained tokenizer.

abstract encode_text(text: str) → List[int][source]#

Encode text into tokens.

Parameters:

text – The input text to encode.

Returns:

The encoded text (tokens).

abstract decode_text(ids: List[int]) → str[source]#

Decode tokens into text.

Parameters:

ids – The tokens to decode.

Returns:

The decoded tokens (text).

abstract special_token_id(sp: SpecialTokenEnum) → int[source]#

Get the identifier of a special token.

Parameters:

sp – The special token’s enumerator.

Returns:

The special token’s identifier.

abstract token_to_id(t: str) → int[source]#

Convert a string-based token to its identifier.

Parameters:

t – The string-based token.

Returns:

The token’s identifier.

abstract id_to_token(id: int) → str[source]#

Convert a token identifier to its string-based representation.

Parameters:

id – The token’s identifier.

Returns:

The string-based token.

tokens_to_ids(ts: List[str]) → List[int][source]#

Convert a list of string-based tokens to their corresponding identifiers.

Parameters:

ts – A list of string-based tokens.

Returns:

The identifiers corresponding to the input tokens.

ids_to_tokens(ids: List[int]) → List[str][source]#

Convert a list of tokens’ identifiers to their string-based representations.

Parameters:

ids – A list of tokens’ identifiers.

Returns:

The string-based representations of the input tokens.
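
As a quick illustration of these two helpers, continuing with the hypothetical WhitespaceTokenizer sketched above (corpus.txt is an invented file name):

tok = WhitespaceTokenizer()
tok.train(["corpus.txt"])  # hypothetical corpus containing "hello" and "world"

ids = tok.tokens_to_ids(["hello", "world"])
assert tok.ids_to_tokens(ids) == ["hello", "world"]  # lossless round trip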

encode_file(path: str, verbose: bool | None = True) → Tensor[source]#

Encode text from an input file.

This method reads text from the specified file and encodes it using the encode_text method. It supports verbose logging and handles large files efficiently by converting the accumulated tokens to a torch.Tensor every 500,000 lines.

Parameters:
  • path – The path to the input file.

  • verbose – Whether to add verbosity to the logger.

Returns:

The encoded tokens.
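
A hedged usage sketch, reusing the trained tokenizer tok from the previous example (the file path is hypothetical):

tokens = tok.encode_file("data/train.txt", verbose=False)
print(tokens.numel())  # total number of encoded token identifiers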

Token Configuration#

class archai.datasets.nlp.tokenizer_utils.token_config.SpecialTokenEnum(value)[source]#

Enumerate special tokens.

UNK = 0#
BOS = 1#
EOS = 2#
PAD = 3#
MASK = 4#
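
The enumeration can be used directly, or passed to a tokenizer's special_token_id method; the trained tokenizer instance tok below is assumed to exist (see TokenizerBase above):

from archai.datasets.nlp.tokenizer_utils.token_config import SpecialTokenEnum

print(SpecialTokenEnum.UNK.value)  # 0
print(SpecialTokenEnum.PAD.name)   # 'PAD'

unk_id = tok.special_token_id(SpecialTokenEnum.UNK)
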
class archai.datasets.nlp.tokenizer_utils.token_config.TokenConfig(bos_token: str | None = '<|endoftext|>', eos_token: str | None = '<|endoftext|>', unk_token: str | None = '<|endoftext|>', pad_token: str | None = None, add_prefix_space: bool | None = False, add_prefix_new_line: bool | None = False, lower_case: bool | None = False)[source]#

Store and access configuration options for special tokens, such as BOS, EOS, UNK, and PAD.

get_special_tokens() → List[str][source]#

Return a list of all available special tokens.

Returns:

Special tokens.

special_token_name(sp: SpecialTokenEnum) → str[source]#

Return the name of a special token.

Parameters:

sp – Special token enumerator.

Returns:

Special token name.
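
A hedged sketch of how these accessors fit together; the pad token value is an arbitrary choice for the example, and the exact ordering and de-duplication of get_special_tokens is not documented here:

from archai.datasets.nlp.tokenizer_utils.token_config import SpecialTokenEnum, TokenConfig

# BOS, EOS, and UNK all default to '<|endoftext|>'.
config = TokenConfig(pad_token="<|pad|>")

print(config.get_special_tokens())
print(config.special_token_name(SpecialTokenEnum.PAD))  # expected: '<|pad|>'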

BBPE Tokenizer#

class archai.datasets.nlp.tokenizer_utils.bbpe_tokenizer.BbpeTokenizer(save_path: str, vocab_size: int, pad_vocab_size: bool | None = False, bos_token: str | None = '_BOS_', eos_token: str | None = None, unk_token: str | None = '_OOV_', pad_token: str | None = None, min_frequency: int | None = None, model_max_length: int | None = None, add_prefix_space: bool | None = True, add_prefix_new_line: bool | None = False, sorted_vocab: bool | None = False, encode_special_tokens: bool | None = False, decode_special_tokens: bool | None = False)[source]#

Byte-level BPE (BBPE) tokenizer.

train(filepaths: List[str]) → None[source]#

Train the tokenizer on a list of files.

Parameters:

filepaths – A list of paths to input files.

is_trained() → bool[source]#

Check if the vocabulary has been trained.

Returns:

True if the vocabulary has been trained, False otherwise.

load() → None[source]#

Load a pre-trained tokenizer.

encode_text(text: List[str] | str) → List[int][source]#

Encode text into tokens.

Parameters:

text – The input text to encode.

Returns:

The encoded text (tokens).

decode_text(ids: List[int]) → str[source]#

Decode tokens into text.

Parameters:

ids – The tokens to decode.

Returns:

The decoded tokens (text).

special_token_id(sp: SpecialTokenEnum) → int[source]#

Get the identifier of a special token.

Parameters:

sp – The special token’s enumerator.

Returns:

The special token’s identifier.

token_to_id(t: str) → int[source]#

Convert a string-based token to its identifier.

Parameters:

t – The string-based token.

Returns:

The token’s identifier.

id_to_token(id: int) → str[source]#

Convert a token identifier to its string-based representation.

Parameters:

id – The token’s identifier.

Returns:

The string-based token.
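
Putting the interface together, a minimal end-to-end sketch (the save path, corpus file, and vocabulary size are hypothetical choices):

from archai.datasets.nlp.tokenizer_utils.bbpe_tokenizer import BbpeTokenizer

tok = BbpeTokenizer(save_path="tokenizers/bbpe", vocab_size=10000)

if not tok.is_trained():
    tok.train(["corpus.txt"])  # train from scratch on the corpus
else:
    tok.load()  # reuse the tokenizer previously saved under save_path

ids = tok.encode_text("hello world")
print(tok.decode_text(ids))  # round trip may differ in leading whitespace,
                             # since add_prefix_space defaults to True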

GPT-2 Tokenizer#

class archai.datasets.nlp.tokenizer_utils.gpt2_tokenizer.Gpt2Tokenizer(save_path: str, vocab_size: int | None = 50257, pad_vocab_size: bool | None = True, bos_token: str | None = '<|endoftext|>', eos_token: str | None = '<|endoftext|>', unk_token: str | None = '<|unk|>', pad_token: str | None = None, min_frequency: int | None = None, model_max_length: int | None = 1024, add_prefix_space: bool | None = True, add_prefix_new_line: bool | None = True, sorted_vocab: bool | None = False)[source]#

GPT-2-based tokenizer.
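
No additional methods are documented for this class, which suggests it follows the same TokenizerBase interface as BbpeTokenizer above. A hedged instantiation sketch (the save path and corpus file are hypothetical):

from archai.datasets.nlp.tokenizer_utils.gpt2_tokenizer import Gpt2Tokenizer

# Defaults mirror GPT-2: vocab_size=50257, model_max_length=1024,
# and '<|endoftext|>' as both BOS and EOS.
tok = Gpt2Tokenizer(save_path="tokenizers/gpt2")
tok.train(["corpus.txt"])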

Word-Based Tokenizer#

class archai.datasets.nlp.tokenizer_utils.word_tokenizer.WordTokenizer(save_path: str, vocab_size: int | None = None, bos_token: str | None = None, eos_token: str | None = '<eos>', unk_token: str | None = '<unk>', min_frequency: int | None = 0, lower_case: bool | None = False, delimiter: str | None = None, encode_special_tokens: bool | None = True, decode_special_tokens: bool | None = True)[source]#

Word-based tokenizer.

train(filepaths: List[str]) → None[source]#

Train the tokenizer on a list of files.

Parameters:

filepaths – A list of paths to input files.

is_trained() → bool[source]#

Check if the vocabulary has been trained.

Returns:

True if the vocabulary has been trained, False otherwise.

load() → None[source]#

Load a pre-trained tokenizer.

encode_text(text: str) → List[int][source]#

Encode text into tokens.

Parameters:

text – The input text to encode.

Returns:

The encoded text (tokens).

decode_text(ids: List[int]) → str[source]#

Decode tokens into text.

Parameters:

ids – The tokens to decode.

Returns:

The decoded tokens (text).

special_token_id(sp: SpecialTokenEnum) → int[source]#

Get the identifier of a special token.

Parameters:

sp – The special token’s enumerator.

Returns:

The special token’s identifier.

token_to_id(t: str) → int[source]#

Convert a string-based token to its identifier.

Parameters:

t – The string-based token.

Returns:

The token’s identifier.

id_to_token(id: int) → str[source]#

Convert a token identifier to its string-based representation.

Parameters:

id – The token’s identifier.

Returns:

The string-based token.

tokens_to_ids(ts: List[str]) → List[int][source]#

Convert a list of string-based tokens to their corresponding identifiers.

Parameters:

ts – A list of string-based tokens.

Returns:

The identifiers corresponding to the input tokens.

ids_to_tokens(ids: List[int]) → List[str][source]#

Convert a list of tokens’ identifiers to their string-based representations.

Parameters:

ids – A list of tokens’ identifiers.

Returns:

The string-based representations of the input tokens.
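
A minimal usage sketch for the word-level tokenizer (the paths are hypothetical; the defaults provide '<eos>' and '<unk>' special tokens):

from archai.datasets.nlp.tokenizer_utils.word_tokenizer import WordTokenizer

tok = WordTokenizer(save_path="tokenizers/word")
tok.train(["corpus.txt"])

ids = tok.encode_text("hello world")
print(tok.decode_text(ids))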