Build wheels; update codebase

This commit is contained in:
Shantanu Jain 2022-12-14 17:02:16 -08:00
parent a1a9f16826
commit 1f098ca4d7
9 changed files with 122 additions and 4 deletions

53
.github/workflows/build_wheels.yml vendored Normal file
View File

@@ -0,0 +1,53 @@
name: Build wheels
on: [push, pull_request, workflow_dispatch]
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
jobs:
build_wheels:
name: py${{ matrix.python-version }} on ${{ matrix.os }}
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
# cibuildwheel builds linux wheels inside a manylinux container
# it also takes care of procuring the correct python version for us
os: [ubuntu-latest, windows-latest, macos-latest]
python-version: [39, 310, 311]
steps:
- uses: actions/checkout@v3
- uses: pypa/cibuildwheel@v2.11.3
env:
CIBW_BUILD: "cp${{ matrix.python-version}}-*"
- uses: actions/upload-artifact@v3
with:
name: dist
path: ./wheelhouse/*.whl
build_sdist:
name: sdist
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
name: Install Python
with:
python-version: "3.9"
- name: Run check-manifest
run: |
pip install check-manifest
check-manifest -v
- name: Build sdist
run: |
pip install --upgrade build
python -m build --sdist
- uses: actions/upload-artifact@v3
with:
name: dist
path: ./dist/*.tar.gz

View File

@@ -2,4 +2,5 @@ include *.svg
include *.toml
include Makefile
recursive-include scripts *.py
recursive-include tests *.py
recursive-include src *.rs

View File

@@ -18,7 +18,7 @@ The tokeniser API is documented in `tiktoken/core.py`.
## Performance
`tiktoken` is between 3-6x faster than huggingface's tokeniser:
`tiktoken` is between 3-6x faster than a comparable open source tokeniser:
![image](./perf.svg)

View File

@@ -1,6 +1,7 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1" width="569.334pt" height="328.0869pt" viewBox="0 0 569.334 328.0869">
<rect width="100%" height="100%" fill="white"/>
<g enable-background="new">
<g>
<clipPath id="cp0">

Before

Width:  |  Height:  |  Size: 15 KiB

After

Width:  |  Height:  |  Size: 16 KiB

View File

@@ -2,7 +2,30 @@
name = "tiktoken"
dependencies = ["blobfile>=2", "regex>=2022.1.18"]
dynamic = ["version"]
requires-python = ">=3.9"
[build-system]
requires = ["setuptools", "wheel", "setuptools-rust"]
build-backend = "setuptools.build_meta"
requires = ["setuptools>=61", "wheel", "setuptools-rust>=1.3"]
[tool.cibuildwheel]
build-frontend = "build"
build-verbosity = 1
linux.before-all = "curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y"
linux.environment = { PATH = "$PATH:$HOME/.cargo/bin" }
macos.before-all = "rustup target add aarch64-apple-darwin"
skip = [
"*-manylinux_i686",
"*-musllinux_i686",
"*-win32",
]
macos.archs = ["x86_64", "arm64"]
# When cross-compiling on Intel, it is not possible to test arm64 wheels.
# Warnings will be silenced with following CIBW_TEST_SKIP
test-skip = "*-macosx_arm64"
before-test = "pip install pytest"
test-command = "pytest {project}/tests"

View File

@@ -4,7 +4,7 @@ from setuptools_rust import Binding, RustExtension
public = True
if public:
version = "0.1"
version = "0.1.1"
setup(
name="tiktoken",

View File

@@ -0,0 +1,11 @@
import tiktoken
def test_simple():
enc = tiktoken.get_encoding("gpt2")
assert enc.encode("hello world") == [31373, 995]
assert enc.decode([31373, 995]) == "hello world"
enc = tiktoken.get_encoding("cl100k_base")
assert enc.encode("hello world") == [15339, 1917]
assert enc.decode([15339, 1917]) == "hello world"

View File

@@ -153,6 +153,8 @@ class Encoding:
See `encode` for more details on `allowed_special` and `disallowed_special`.
This API should itself be considered unstable.
```
>>> enc.encode_with_unstable("hello fanta")
([31373], [(277, 4910), (5113, 265), ..., (8842,)])

View File

@@ -21,6 +21,28 @@ def gpt2():
}
def r50k_base():
mergeable_ranks = load_tiktoken_bpe("az://openaipublic/encodings/r50k_base.tiktoken")
return {
"name": "r50k_base",
"explicit_n_vocab": 50257,
"pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
"mergeable_ranks": mergeable_ranks,
"special_tokens": {ENDOFTEXT: 50256},
}
def p50k_base():
mergeable_ranks = load_tiktoken_bpe("az://openaipublic/encodings/p50k_base.tiktoken")
return {
"name": "p50k_base",
"explicit_n_vocab": 50281,
"pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
"mergeable_ranks": mergeable_ranks,
"special_tokens": {ENDOFTEXT: 50256},
}
def cl100k_base():
mergeable_ranks = load_tiktoken_bpe("az://openaipublic/encodings/cl100k_base.tiktoken")
special_tokens = {
@@ -38,4 +60,9 @@ def cl100k_base():
}
ENCODING_CONSTRUCTORS = {"gpt2": gpt2, "cl100k_base": cl100k_base}
ENCODING_CONSTRUCTORS = {
"gpt2": gpt2,
"r50k_base": r50k_base,
"p50k_base": p50k_base,
"cl100k_base": cl100k_base,
}