Build wheels; update codebase
commit 1f098ca4d7
parent a1a9f16826
.github/workflows/build_wheels.yml (new file, vendored, 53 lines)
@@ -0,0 +1,53 @@
+name: Build wheels
+
+on: [push, pull_request, workflow_dispatch]
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  build_wheels:
+    name: py${{ matrix.python-version }} on ${{ matrix.os }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        # cibuildwheel builds linux wheels inside a manylinux container
+        # it also takes care of procuring the correct python version for us
+        os: [ubuntu-latest, windows-latest, macos-latest]
+        python-version: [39, 310, 311]
+
+    steps:
+      - uses: actions/checkout@v3
+
+      - uses: pypa/cibuildwheel@v2.11.3
+        env:
+          CIBW_BUILD: "cp${{ matrix.python-version}}-*"
+
+      - uses: actions/upload-artifact@v3
+        with:
+          name: dist
+          path: ./wheelhouse/*.whl
+
+  build_sdist:
+    name: sdist
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
+        name: Install Python
+        with:
+          python-version: "3.9"
+      - name: Run check-manifest
+        run: |
+          pip install check-manifest
+          check-manifest -v
+      - name: Build sdist
+        run: |
+          pip install --upgrade build
+          python -m build --sdist
+      - uses: actions/upload-artifact@v3
+        with:
+          name: dist
+          path: ./dist/*.tar.gz
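As a rough aside (not part of the commit): the matrix above fans out to nine jobs, and each job sets CIBW_BUILD so cibuildwheel builds only that job's CPython version on that runner. A minimal Python sketch of the mapping, with values copied from the workflow:

```python
# Illustrative sketch only: expand the build matrix from build_wheels.yml and
# show the CIBW_BUILD selector each job passes to cibuildwheel.
oses = ["ubuntu-latest", "windows-latest", "macos-latest"]
python_versions = [39, 310, 311]

for os_name in oses:
    for version in python_versions:
        # e.g. "cp311-*": build only CPython 3.11 wheels on this runner
        print(f"{os_name}: CIBW_BUILD=cp{version}-*")
```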
@@ -2,4 +2,5 @@ include *.svg
 include *.toml
 include Makefile
 recursive-include scripts *.py
+recursive-include tests *.py
 recursive-include src *.rs
@@ -18,7 +18,7 @@ The tokeniser API is documented in `tiktoken/core.py`.
 
 ## Performance
 
-`tiktoken` is between 3-6x faster than huggingface's tokeniser:
+`tiktoken` is between 3-6x faster than a comparable open source tokeniser:
 
 
 
perf.svg (1 line changed)
@@ -1,6 +1,7 @@
 <?xml version="1.0" encoding="UTF-8" standalone="no"?>
 <!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
 <svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1" width="569.334pt" height="328.0869pt" viewBox="0 0 569.334 328.0869">
+<rect width="100%" height="100%" fill="white"/>
 <g enable-background="new">
 <g>
 <clipPath id="cp0">
(image size: 15 KiB before, 16 KiB after)
@@ -2,7 +2,30 @@
 name = "tiktoken"
 dependencies = ["blobfile>=2", "regex>=2022.1.18"]
 dynamic = ["version"]
+requires-python = ">=3.9"
 
 [build-system]
-requires = ["setuptools", "wheel", "setuptools-rust"]
 build-backend = "setuptools.build_meta"
+requires = ["setuptools>=61", "wheel", "setuptools-rust>=1.3"]
+
+[tool.cibuildwheel]
+build-frontend = "build"
+build-verbosity = 1
+
+linux.before-all = "curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y"
+linux.environment = { PATH = "$PATH:$HOME/.cargo/bin" }
+macos.before-all = "rustup target add aarch64-apple-darwin"
+
+skip = [
+    "*-manylinux_i686",
+    "*-musllinux_i686",
+    "*-win32",
+]
+macos.archs = ["x86_64", "arm64"]
+# When cross-compiling on Intel, it is not possible to test arm64 wheels.
+# Warnings will be silenced with following CIBW_TEST_SKIP
+test-skip = "*-macosx_arm64"
+
+before-test = "pip install pytest"
+test-command = "pytest {project}/tests"
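As an aside, a small sketch (mine, not from the commit) of how the skip and test-skip globs above narrow down what gets built and tested. It assumes cibuildwheel's selectors behave roughly like shell-style patterns, and the build identifiers listed are hypothetical:

```python
# Sketch under the assumption that cibuildwheel's skip/test-skip globs match
# roughly like shell-style patterns; the identifiers below are made up.
from fnmatch import fnmatch

skip = ["*-manylinux_i686", "*-musllinux_i686", "*-win32"]
test_skip = "*-macosx_arm64"

candidates = [
    "cp39-manylinux_x86_64",
    "cp39-manylinux_i686",
    "cp310-win_amd64",
    "cp310-win32",
    "cp311-macosx_x86_64",
    "cp311-macosx_arm64",
]

built = [c for c in candidates if not any(fnmatch(c, pat) for pat in skip)]
tested = [c for c in built if not fnmatch(c, test_skip)]
print(built)   # i686 and win32 variants are skipped outright
print(tested)  # arm64 macOS wheels are built but not tested when cross-compiling
```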
setup.py (2 lines changed)
@@ -4,7 +4,7 @@ from setuptools_rust import Binding, RustExtension
 public = True
 
 if public:
-    version = "0.1"
+    version = "0.1.1"
 
 setup(
     name="tiktoken",
tests/test_simple_public.py (new file, 11 lines)
@@ -0,0 +1,11 @@
+import tiktoken
+
+
+def test_simple():
+    enc = tiktoken.get_encoding("gpt2")
+    assert enc.encode("hello world") == [31373, 995]
+    assert enc.decode([31373, 995]) == "hello world"
+
+    enc = tiktoken.get_encoding("cl100k_base")
+    assert enc.encode("hello world") == [15339, 1917]
+    assert enc.decode([15339, 1917]) == "hello world"
@@ -153,6 +153,8 @@ class Encoding:
 
         See `encode` for more details on `allowed_special` and `disallowed_special`.
 
+        This API should itself be considered unstable.
+
         ```
         >>> enc.encode_with_unstable("hello fanta")
         ([31373], [(277, 4910), (5113, 265), ..., (8842,)])
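For orientation, here is a minimal sketch (my reading of the docstring above, not code from the commit) of how the two-part return value might be consumed; exact token ids depend on the encoding:

```python
import tiktoken

enc = tiktoken.get_encoding("gpt2")

# Per the docstring: the first element is the tokenisation of the stable prefix,
# the second lists candidate token sequences for the unstable trailing fragment.
stable_tokens, completions = enc.encode_with_unstable("hello fanta")
print(stable_tokens)  # e.g. [31373], i.e. "hello"
for candidate in completions:
    print(enc.decode(list(candidate)))
```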
@@ -21,6 +21,28 @@ def gpt2():
     }
 
 
+def r50k_base():
+    mergeable_ranks = load_tiktoken_bpe("az://openaipublic/encodings/r50k_base.tiktoken")
+    return {
+        "name": "r50k_base",
+        "explicit_n_vocab": 50257,
+        "pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
+        "mergeable_ranks": mergeable_ranks,
+        "special_tokens": {ENDOFTEXT: 50256},
+    }
+
+
+def p50k_base():
+    mergeable_ranks = load_tiktoken_bpe("az://openaipublic/encodings/p50k_base.tiktoken")
+    return {
+        "name": "p50k_base",
+        "explicit_n_vocab": 50281,
+        "pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
+        "mergeable_ranks": mergeable_ranks,
+        "special_tokens": {ENDOFTEXT: 50256},
+    }
+
+
 def cl100k_base():
     mergeable_ranks = load_tiktoken_bpe("az://openaipublic/encodings/cl100k_base.tiktoken")
     special_tokens = {
@@ -38,4 +60,9 @@ def cl100k_base():
     }
 
 
-ENCODING_CONSTRUCTORS = {"gpt2": gpt2, "cl100k_base": cl100k_base}
+ENCODING_CONSTRUCTORS = {
+    "gpt2": gpt2,
+    "r50k_base": r50k_base,
+    "p50k_base": p50k_base,
+    "cl100k_base": cl100k_base,
+}
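With these constructors registered, the new encodings are available by name through the same `get_encoding` entry point the test above uses; a short sketch (round-trip only, token ids left unasserted):

```python
import tiktoken

# All four registered encodings can now be requested by name.
for name in ["gpt2", "r50k_base", "p50k_base", "cl100k_base"]:
    enc = tiktoken.get_encoding(name)
    tokens = enc.encode("hello world")
    # decoding should round-trip for any registered encoding
    assert enc.decode(tokens) == "hello world"
    print(name, len(tokens))
```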