From 1f098ca4d7d84025e94f2e84795fad713f8e6f3f Mon Sep 17 00:00:00 2001 From: Shantanu Jain Date: Wed, 14 Dec 2022 17:02:16 -0800 Subject: [PATCH] Build wheels; update codebase --- .github/workflows/build_wheels.yml | 53 ++++++++++++++++++++++++++++++ MANIFEST.in | 1 + README.md | 2 +- perf.svg | 1 + pyproject.toml | 25 +++++++++++++- setup.py | 2 +- tests/test_simple_public.py | 11 +++++++ tiktoken/core.py | 2 ++ tiktoken_ext/openai_public.py | 29 +++++++++++++++- 9 files changed, 122 insertions(+), 4 deletions(-) create mode 100644 .github/workflows/build_wheels.yml create mode 100644 tests/test_simple_public.py diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml new file mode 100644 index 0000000..b71f959 --- /dev/null +++ b/.github/workflows/build_wheels.yml @@ -0,0 +1,53 @@ +name: Build wheels + +on: [push, pull_request, workflow_dispatch] + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +jobs: + build_wheels: + name: py${{ matrix.python-version }} on ${{ matrix.os }} + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + # cibuildwheel builds linux wheels inside a manylinux container + # it also takes care of procuring the correct python version for us + os: [ubuntu-latest, windows-latest, macos-latest] + python-version: [39, 310, 311] + + steps: + - uses: actions/checkout@v3 + + - uses: pypa/cibuildwheel@v2.11.3 + env: + CIBW_BUILD: "cp${{ matrix.python-version}}-*" + + - uses: actions/upload-artifact@v3 + with: + name: dist + path: ./wheelhouse/*.whl + + build_sdist: + name: sdist + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 + name: Install Python + with: + python-version: "3.9" + - name: Run check-manifest + run: | + pip install check-manifest + check-manifest -v + - name: Build sdist + run: | + pip install --upgrade build + python -m build --sdist + - uses: actions/upload-artifact@v3 + 
with: + name: dist + path: ./dist/*.tar.gz diff --git a/MANIFEST.in b/MANIFEST.in index cb017cd..558a5ec 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -2,4 +2,5 @@ include *.svg include *.toml include Makefile recursive-include scripts *.py +recursive-include tests *.py recursive-include src *.rs diff --git a/README.md b/README.md index f0ea386..cc1099c 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ The tokeniser API is documented in `tiktoken/core.py`. ## Performance -`tiktoken` is between 3-6x faster than huggingface's tokeniser: +`tiktoken` is between 3-6x faster than a comparable open source tokeniser: ![image](./perf.svg) diff --git a/perf.svg b/perf.svg index 7157ef9..723036c 100644 --- a/perf.svg +++ b/perf.svg @@ -1,6 +1,7 @@ + diff --git a/pyproject.toml b/pyproject.toml index bb9aeeb..0d4327b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,30 @@ name = "tiktoken" dependencies = ["blobfile>=2", "regex>=2022.1.18"] dynamic = ["version"] +requires-python = ">=3.9" [build-system] -requires = ["setuptools", "wheel", "setuptools-rust"] +build-backend = "setuptools.build_meta" +requires = ["setuptools>=61", "wheel", "setuptools-rust>=1.3"] + +[tool.cibuildwheel] +build-frontend = "build" +build-verbosity = 1 + +linux.before-all = "curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y" +linux.environment = { PATH = "$PATH:$HOME/.cargo/bin" } +macos.before-all = "rustup target add aarch64-apple-darwin" + +skip = [ + "*-manylinux_i686", + "*-musllinux_i686", + "*-win32", +] +macos.archs = ["x86_64", "arm64"] +# When cross-compiling on Intel, it is not possible to test arm64 wheels. 
+# Warnings will be silenced with the following CIBW_TEST_SKIP +test-skip = "*-macosx_arm64" + +before-test = "pip install pytest" +test-command = "pytest {project}/tests" diff --git a/setup.py b/setup.py index df18eda..d7f9373 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ from setuptools_rust import Binding, RustExtension public = True if public: - version = "0.1" + version = "0.1.1" setup( name="tiktoken", diff --git a/tests/test_simple_public.py b/tests/test_simple_public.py new file mode 100644 index 0000000..5b69162 --- /dev/null +++ b/tests/test_simple_public.py @@ -0,0 +1,11 @@ +import tiktoken + + +def test_simple(): + enc = tiktoken.get_encoding("gpt2") + assert enc.encode("hello world") == [31373, 995] + assert enc.decode([31373, 995]) == "hello world" + + enc = tiktoken.get_encoding("cl100k_base") + assert enc.encode("hello world") == [15339, 1917] + assert enc.decode([15339, 1917]) == "hello world" diff --git a/tiktoken/core.py b/tiktoken/core.py index e200c29..e04f425 100644 --- a/tiktoken/core.py +++ b/tiktoken/core.py @@ -153,6 +153,8 @@ class Encoding: See `encode` for more details on `allowed_special` and `disallowed_special`. + This API should itself be considered unstable. 
+ ``` >>> enc.encode_with_unstable("hello fanta") ([31373], [(277, 4910), (5113, 265), ..., (8842,)]) diff --git a/tiktoken_ext/openai_public.py b/tiktoken_ext/openai_public.py index cc6ad3c..756be2d 100644 --- a/tiktoken_ext/openai_public.py +++ b/tiktoken_ext/openai_public.py @@ -21,6 +21,28 @@ def gpt2(): } +def r50k_base(): + mergeable_ranks = load_tiktoken_bpe("az://openaipublic/encodings/r50k_base.tiktoken") + return { + "name": "r50k_base", + "explicit_n_vocab": 50257, + "pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""", + "mergeable_ranks": mergeable_ranks, + "special_tokens": {ENDOFTEXT: 50256}, + } + + +def p50k_base(): + mergeable_ranks = load_tiktoken_bpe("az://openaipublic/encodings/p50k_base.tiktoken") + return { + "name": "p50k_base", + "explicit_n_vocab": 50281, + "pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""", + "mergeable_ranks": mergeable_ranks, + "special_tokens": {ENDOFTEXT: 50256}, + } + + def cl100k_base(): mergeable_ranks = load_tiktoken_bpe("az://openaipublic/encodings/cl100k_base.tiktoken") special_tokens = { @@ -38,4 +60,9 @@ def cl100k_base(): } -ENCODING_CONSTRUCTORS = {"gpt2": gpt2, "cl100k_base": cl100k_base} +ENCODING_CONSTRUCTORS = { + "gpt2": gpt2, + "r50k_base": r50k_base, + "p50k_base": p50k_base, + "cl100k_base": cl100k_base, +}