diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml index b71f959..d2e8dc2 100644 --- a/.github/workflows/build_wheels.yml +++ b/.github/workflows/build_wheels.yml @@ -16,7 +16,7 @@ jobs: # cibuildwheel builds linux wheels inside a manylinux container # it also takes care of procuring the correct python version for us os: [ubuntu-latest, windows-latest, macos-latest] - python-version: [39, 310, 311] + python-version: [38, 39, 310, 311] steps: - uses: actions/checkout@v3 diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..a606553 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,12 @@ +# Changelog + +This is the changelog for the open source version of tiktoken. + +## [v0.1.2] +- Avoid use of `blobfile` for public files +- Add support for Python 3.8 +- Add py.typed +- Improve the public tests + +## [v0.1.1] +- Initial release diff --git a/MANIFEST.in b/MANIFEST.in index 558a5ec..7f25b27 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,6 +1,8 @@ include *.svg include *.toml +include *.md include Makefile +global-include py.typed recursive-include scripts *.py recursive-include tests *.py recursive-include src *.rs diff --git a/pyproject.toml b/pyproject.toml index 0d4327b..4fcd2ec 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,8 +1,8 @@ [project] name = "tiktoken" -dependencies = ["blobfile>=2", "regex>=2022.1.18"] +dependencies = ["blobfile>=2", "regex>=2022.1.18", "requests>=2.26.0"] dynamic = ["version"] -requires-python = ">=3.9" +requires-python = ">=3.8" [build-system] build-backend = "setuptools.build_meta" diff --git a/scripts/redact.py b/scripts/redact.py index bcf8ef1..d82db32 100644 --- a/scripts/redact.py +++ b/scripts/redact.py @@ -9,6 +9,8 @@ def redact_file(path: Path, dry_run: bool) -> None: return text = path.read_text() + if not text: + return first_line = text.splitlines()[0] if "redact" in first_line: diff --git a/setup.py b/setup.py index d7f9373..179392b 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ from setuptools_rust import Binding, RustExtension public = True if public: - version = "0.1.1" + version = "0.1.2" setup( name="tiktoken", @@ -18,6 +18,7 @@ setup( debug=False, ) ], + package_data={"tiktoken": ["py.typed"]}, packages=["tiktoken", "tiktoken_ext"], zip_safe=False, ) diff --git a/tests/test_simple_public.py b/tests/test_simple_public.py index 5b69162..2987e4e 100644 --- a/tests/test_simple_public.py +++ b/tests/test_simple_public.py @@ -2,10 +2,18 @@ import tiktoken def test_simple(): + # Note that there are more actual tests, they're just not currently public :-) enc = tiktoken.get_encoding("gpt2") assert enc.encode("hello world") == [31373, 995] assert enc.decode([31373, 995]) == "hello world" + assert enc.encode("hello <|endoftext|>", allowed_special="all") == [31373, 220, 50256] enc = tiktoken.get_encoding("cl100k_base") assert enc.encode("hello world") == [15339, 1917] assert enc.decode([15339, 1917]) == "hello world" + assert enc.encode("hello <|endoftext|>", allowed_special="all") == [15339, 220, 100257] + + for enc_name in tiktoken.list_encoding_names(): + enc = tiktoken.get_encoding(enc_name) + for token in range(10_000): + assert enc.encode_single_token(enc.decode_single_token_bytes(token)) == token diff --git a/tiktoken/core.py b/tiktoken/core.py index c566a52..d2367bc 100644 --- a/tiktoken/core.py +++ b/tiktoken/core.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import functools from concurrent.futures import ThreadPoolExecutor from typing import AbstractSet, Collection, Literal, NoReturn, Optional, Union diff --git a/tiktoken/load.py b/tiktoken/load.py index 06e51cc..fefd62a 100644 --- a/tiktoken/load.py +++ b/tiktoken/load.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import base64 import hashlib import json @@ -5,6 +7,15 @@ import os import uuid import blobfile +import requests + + +def read_file(blobpath: str) -> bytes: + if not blobpath.startswith("http://") and not blobpath.startswith("https://"): + with blobfile.BlobFile(blobpath, "rb") as f: + return f.read() + # avoiding blobfile for public files helps avoid auth issues, like MFA prompts + return requests.get(blobpath).content def read_file_cached(blobpath: str) -> bytes: @@ -17,8 +28,7 @@ def read_file_cached(blobpath: str) -> bytes: if cache_dir == "": # disable caching - with blobfile.BlobFile(blobpath, "rb") as f: - return f.read() + return read_file(blobpath) cache_key = hashlib.sha1(blobpath.encode()).hexdigest() @@ -27,8 +37,7 @@ def read_file_cached(blobpath: str) -> bytes: with open(cache_path, "rb") as f: return f.read() - with blobfile.BlobFile(blobpath, "rb") as f: - contents = f.read() + contents = read_file(blobpath) os.makedirs(cache_dir, exist_ok=True) tmp_filename = cache_path + "." + str(uuid.uuid4()) + ".tmp" diff --git a/tiktoken/py.typed b/tiktoken/py.typed new file mode 100644 index 0000000..e69de29 diff --git a/tiktoken/registry.py b/tiktoken/registry.py index 24bb173..52d8ec2 100644 --- a/tiktoken/registry.py +++ b/tiktoken/registry.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import importlib import pkgutil import threading diff --git a/tiktoken_ext/openai_public.py b/tiktoken_ext/openai_public.py index 756be2d..b0ec6ab 100644 --- a/tiktoken_ext/openai_public.py +++ b/tiktoken_ext/openai_public.py @@ -9,8 +9,8 @@ ENDOFPROMPT = "<|endofprompt|>" def gpt2(): mergeable_ranks = data_gym_to_mergeable_bpe_ranks( - vocab_bpe_file="az://openaipublic/gpt-2/encodings/main/vocab.bpe", - encoder_json_file="az://openaipublic/gpt-2/encodings/main/encoder.json", + vocab_bpe_file="https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/vocab.bpe", + encoder_json_file="https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/encoder.json", ) return { "name": "gpt2", @@ -22,7 +22,9 @@ def gpt2(): def r50k_base(): - mergeable_ranks = load_tiktoken_bpe("az://openaipublic/encodings/r50k_base.tiktoken") + mergeable_ranks = load_tiktoken_bpe( + "https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken" + ) return { "name": "r50k_base", "explicit_n_vocab": 50257, @@ -33,7 +35,9 @@ def r50k_base(): def p50k_base(): - mergeable_ranks = load_tiktoken_bpe("az://openaipublic/encodings/p50k_base.tiktoken") + mergeable_ranks = load_tiktoken_bpe( + "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken" + ) return { "name": "p50k_base", "explicit_n_vocab": 50281, @@ -44,7 +48,9 @@ def p50k_base(): def cl100k_base(): - mergeable_ranks = load_tiktoken_bpe("az://openaipublic/encodings/cl100k_base.tiktoken") + mergeable_ranks = load_tiktoken_bpe( + "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken" + ) special_tokens = { ENDOFTEXT: 100257, FIM_PREFIX: 100258,