Source code for archai.datasets.nlp.tokenizer_utils.gpt2_tokenizer
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from typing import Optional

from archai.datasets.nlp.tokenizer_utils.bbpe_tokenizer import BbpeTokenizer


class Gpt2Tokenizer(BbpeTokenizer):
    """GPT-2 based tokenizer."""

    def __init__(
        self,
        save_path: str,
        vocab_size: Optional[int] = 50257,
        pad_vocab_size: Optional[bool] = True,
        bos_token: Optional[str] = "<|endoftext|>",
        eos_token: Optional[str] = "<|endoftext|>",
        unk_token: Optional[str] = "<|unk|>",
        pad_token: Optional[str] = None,
        min_frequency: Optional[int] = None,
        model_max_length: Optional[int] = 1024,
        add_prefix_space: Optional[bool] = True,
        add_prefix_new_line: Optional[bool] = True,
        sorted_vocab: Optional[bool] = False,
    ) -> None:
        """Define the tokenization pipeline.

        Args:
            save_path: Path to save the tokenizer.
            vocab_size: Maximum size of the vocabulary.
            pad_vocab_size: Whether the vocabulary size should be padded to a multiple of 8.
            bos_token: Beginning-of-sentence token.
            eos_token: End-of-sentence token.
            unk_token: Unknown token.
            pad_token: Padding token.
            min_frequency: Minimum frequency of tokens.
            model_max_length: Maximum length of a sequence.
            add_prefix_space: Whether a prefix space token should be added.
            add_prefix_new_line: Whether a prefix new-line token should be added.
            sorted_vocab: Whether the vocabulary should be sorted.

        """
        # Reference defaults from Hugging Face's GPT2Tokenizer:
        #   vocab_size: 50257
        #   bos = eos = unk = '<|endoftext|>'
        #   sep_token = None
        #   max_model_input_sizes: {'gpt2': 1024, 'gpt2-medium': 1024, 'gpt2-large': 1024}
        #   max_len = max_len_sentence_pair = max_len_single_sentence = 1024
        #   mask_token = None
        # The default vocabulary size for GPT-2 is 50257.
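        # With pad_vocab_size=True, the vocabulary size is padded to a multiple of 8
        # (for the default 50257, the next multiple is 50264), which is commonly done
        # for efficiency on modern accelerators.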
        super().__init__(
            save_path=save_path,
            vocab_size=vocab_size,
            pad_vocab_size=pad_vocab_size,
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            min_frequency=min_frequency,
            model_max_length=model_max_length,
            add_prefix_space=add_prefix_space,
            add_prefix_new_line=add_prefix_new_line,
            sorted_vocab=sorted_vocab,
        )
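

# Minimal usage sketch: instantiate the tokenizer with its GPT-2 style defaults.
# The save path is an illustrative placeholder; only the constructor defined above
# is exercised here.
if __name__ == "__main__":
    tokenizer = Gpt2Tokenizer(save_path="gpt2-tokenizer")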