from tiktoken.load import data_gym_to_mergeable_bpe_ranks, load_tiktoken_bpe
# Special-token strings shared by the encoding constructors below.
ENDOFTEXT = "<|endoftext|>"
# Fill-in-the-middle (FIM) sentinel tokens: mark the prefix, middle, and
# suffix segments when a model is asked to infill code/text.
FIM_PREFIX = "<|fim_prefix|>"
FIM_MIDDLE = "<|fim_middle|>"
FIM_SUFFIX = "<|fim_suffix|>"
ENDOFPROMPT = "<|endofprompt|>"
def gpt2():
    """Return the parameter dict describing the GPT-2 BPE encoding.

    Fetches the GPT-2 vocab/encoder files from the `az://openaipublic`
    blob store, so the first call requires network access.

    Returns:
        dict with the fields tiktoken expects for an encoding:
        name, explicit_n_vocab, pat_str, mergeable_ranks, special_tokens.
    """
    mergeable_ranks = data_gym_to_mergeable_bpe_ranks(
        vocab_bpe_file="az://openaipublic/gpt-2/encodings/main/vocab.bpe",
        encoder_json_file="az://openaipublic/gpt-2/encodings/main/encoder.json",
    )
    return {
        "name": "gpt2",
        # 50,256 ordinary tokens + 1 special token (<|endoftext|>).
        "explicit_n_vocab": 50257,
        # Pre-tokenization split pattern; uses \p{...} Unicode classes,
        # which need the third-party `regex` module, not stdlib `re`.
        "pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
        "mergeable_ranks": mergeable_ranks,
        # Use the module-level constant for consistency with cl100k_base
        # (same string as the previous hard-coded "<|endoftext|>" literal).
        "special_tokens": {ENDOFTEXT: 50256},
    }
def cl100k_base():
    """Return the parameter dict describing the cl100k_base encoding.

    Fetches the pre-built rank file from the `az://openaipublic` blob
    store, so the first call requires network access.

    Returns:
        dict with the fields tiktoken expects for an encoding:
        name, pat_str, mergeable_ranks, special_tokens.
        (No explicit_n_vocab here, unlike gpt2.)
    """
    mergeable_ranks = load_tiktoken_bpe("az://openaipublic/encodings/cl100k_base.tiktoken")
    special_tokens = {
        ENDOFTEXT: 100257,
        FIM_PREFIX: 100258,
        FIM_MIDDLE: 100259,
        FIM_SUFFIX: 100260,
        # NOTE(review): ids 100261-100275 are skipped here — presumably
        # reserved; confirm against the upstream encoding definition.
        ENDOFPROMPT: 100276,
    }
    return {
        "name": "cl100k_base",
        # Pre-tokenization split pattern; case-insensitive contraction group
        # plus \p{...} Unicode classes (third-party `regex` module syntax).
        "pat_str": r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""",
        "mergeable_ranks": mergeable_ranks,
        "special_tokens": special_tokens,
    }
# Registry mapping encoding name -> zero-argument constructor that returns
# that encoding's parameter dict (consumed by tiktoken's registry machinery).
ENCODING_CONSTRUCTORS = {"gpt2": gpt2, "cl100k_base": cl100k_base}