Remove python

This commit is contained in:
Gabriel Tofvesson 2023-03-22 00:07:38 +01:00
parent 82facf911f
commit 85a4f9dbb0
No known key found for this signature in database
GPG Key ID: 6F1345DF28EDA13E
18 changed files with 135 additions and 1576 deletions

View File

@ -1,83 +0,0 @@
name: Build wheels
on: [push, pull_request, workflow_dispatch]
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
jobs:
build_wheels:
name: py${{ matrix.python-version }} on ${{ matrix.os }}
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
# cibuildwheel builds linux wheels inside a manylinux container
# it also takes care of procuring the correct python version for us
os: [ubuntu-latest, windows-latest, macos-latest]
python-version: [38, 39, 310, 311]
steps:
- uses: actions/checkout@v3
- uses: pypa/cibuildwheel@v2.11.3
env:
CIBW_BUILD: "cp${{ matrix.python-version}}-*"
- uses: actions/upload-artifact@v3
with:
name: dist
path: ./wheelhouse/*.whl
build_wheels_aarch64:
name: py${{ matrix.python-version }} on ${{ matrix.os }} (aarch64)
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest]
python-version: [38, 39, 310, 311]
steps:
- uses: actions/checkout@v3
- name: Set up QEMU
uses: docker/setup-qemu-action@v2
with:
platforms: arm64
- name: Build wheels
uses: pypa/cibuildwheel@v2.11.3
env:
CIBW_BUILD: "cp${{ matrix.python-version}}-*"
CIBW_ARCHS: aarch64
CIBW_BUILD_VERBOSITY: 3
# https://github.com/rust-lang/cargo/issues/10583
CIBW_ENVIRONMENT_LINUX: PATH="$PATH:$HOME/.cargo/bin" CARGO_NET_GIT_FETCH_WITH_CLI=true
- uses: actions/upload-artifact@v3
with:
name: dist
path: ./wheelhouse/*.whl
build_sdist:
name: sdist
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
name: Install Python
with:
python-version: "3.9"
- name: Run check-manifest
run: |
pip install check-manifest
check-manifest -v
- name: Build sdist
run: |
pip install --upgrade build
python -m build --sdist
- uses: actions/upload-artifact@v3
with:
name: dist
path: ./dist/*.tar.gz

View File

@ -1,35 +0,0 @@
# Changelog
This is the changelog for the open source version of tiktoken.
## [v0.3.2]
- Add encoding for GPT-4
## [v0.3.1]
- Build aarch64 wheels
- Make `blobfile` an optional dependency
Thank you to @messense for the environment variable that makes cargo not OOM under emulation!
## [v0.3.0]
- Improve performance by 5-20%; thank you to @nistath!
- Add `gpt-3.5-turbo` models to `encoding_for_model`
- Add prefix matching to `encoding_for_model` to better support future model versions
- Fix a bug in the README instructions on extending tiktoken
- Update the set of available encodings
- Add packaging metadata
## [v0.2.0]
- Add ``tiktoken.encoding_for_model`` to get the encoding for a specific model
- Improve portability of caching logic
Thank you to @fritzo, @arvid220u, @khanhvu207, @henriktorget for various small corrections
## [v0.1.2]
- Avoid use of `blobfile` for public files
- Add support for Python 3.8
- Add py.typed
- Improve the public tests
## [v0.1.1]
- Initial release

View File

@ -5,17 +5,14 @@ edition = "2021"
rust-version = "1.57.0"
[lib]
name = "_tiktoken"
crate-type = ["cdylib"]
name = "tiktoken"
[dependencies]
pyo3 = { version = "0.17.3", features = ["extension-module"] }
# tiktoken dependencies
fancy-regex = "0.10.0"
fancy-regex = "0.11.0"
regex = "1.7.0"
rustc-hash = "1.1.0"
bstr = "1.0.1"
anyhow = "1.0.70"
[profile.release]
incremental = true

View File

@ -1,8 +0,0 @@
include *.svg
include *.toml
include *.md
include Makefile
global-include py.typed
recursive-include scripts *.py
recursive-include tests *.py
recursive-include src *.rs

374
perf.svg
View File

@ -1,374 +0,0 @@
[perf.svg (16 KiB) deleted in this commit: SVG source omitted. It rendered a bar chart of tokeniser throughput (0–40 MB/s) against thread count (1, 2, 4, 8, 16, 32, 64), with a legend comparing tiktoken and huggingface.]


View File

@ -1,41 +0,0 @@
[project]
name = "tiktoken"
version = "0.3.2"
description = "tiktoken is a fast BPE tokeniser for use with OpenAI's models"
readme = "README.md"
license = {file = "LICENSE"}
authors = [{name = "Shantanu Jain"}, {email = "shantanu@openai.com"}]
dependencies = ["regex>=2022.1.18", "requests>=2.26.0"]
optional-dependencies = {blobfile = ["blobfile>=2"]}
requires-python = ">=3.8"
[project.urls]
homepage = "https://github.com/openai/tiktoken"
repository = "https://github.com/openai/tiktoken"
changelog = "https://github.com/openai/tiktoken/blob/main/CHANGELOG.md"
[build-system]
build-backend = "setuptools.build_meta"
requires = ["setuptools>=62.4", "wheel", "setuptools-rust>=1.5.2"]
[tool.cibuildwheel]
build-frontend = "build"
build-verbosity = 1
linux.before-all = "curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y"
linux.environment = { PATH = "$PATH:$HOME/.cargo/bin" }
macos.before-all = "rustup target add aarch64-apple-darwin"
skip = [
"*-manylinux_i686",
"*-musllinux_i686",
"*-win32",
]
macos.archs = ["x86_64", "arm64"]
# When cross-compiling on Intel, it is not possible to test arm64 wheels.
# Warnings will be silenced with following CIBW_TEST_SKIP
test-skip = "*-macosx_arm64"
before-test = "pip install pytest"
test-command = "pytest {project}/tests"

View File

@ -1,39 +0,0 @@
import base64
import functools
import gzip
import json
import os
import random
import time
from typing import Any, cast
import blobfile
import tiktoken
def benchmark_batch(documents: list[str]) -> None:
num_threads = int(os.environ["RAYON_NUM_THREADS"])
num_bytes = sum(map(len, map(str.encode, documents)))
print(f"num_threads: {num_threads}, num_bytes: {num_bytes}")
enc = tiktoken.get_encoding("gpt2")
enc.encode("warmup")
start = time.perf_counter_ns()
enc.encode_ordinary_batch(documents, num_threads=num_threads)
end = time.perf_counter_ns()
print(f"tiktoken \t{num_bytes / (end - start) * 1e9} bytes / s")
import transformers
hf_enc = cast(Any, transformers).GPT2TokenizerFast.from_pretrained("gpt2")
hf_enc.model_max_length = 1e30 # silence!
hf_enc.encode("warmup")
start = time.perf_counter_ns()
hf_enc(documents)
end = time.perf_counter_ns()
print(f"huggingface \t{num_bytes / (end - start) * 1e9} bytes / s")

View File

@ -1,67 +0,0 @@
import argparse
import re
import subprocess
from pathlib import Path
def redact_file(path: Path, dry_run: bool) -> None:
if not path.exists() or path.is_dir():
return
text = path.read_text()
if not text:
return
first_line = text.splitlines()[0]
if "redact" in first_line:
if not dry_run:
path.unlink()
print(f"Deleted {path}")
return
pattern = "|".join(
re.escape(x)
for x in [
"# ===== redact-beg =====\n",
"# ===== redact-end =====\n",
"<!--- redact-beg -->\n",
"<!--- redact-end -->\n",
]
)
if re.search(pattern, text):
redacted_text = "".join(re.split(pattern, text)[::2])
if not dry_run:
path.write_text(redacted_text)
print(f"Redacted {path}")
return
print(f"Skipped {path}")
def redact(dry_run: bool) -> None:
tiktoken_root = Path(__file__).parent.parent
assert tiktoken_root.name == "tiktoken"
assert (tiktoken_root / "pyproject.toml").exists()
try:
output = subprocess.check_output(["git", "ls-files"], cwd=tiktoken_root, text=True)
paths = [Path(p) for p in output.splitlines()]
except subprocess.CalledProcessError:
paths = list(tiktoken_root.glob("**/*"))
for path in paths:
redact_file(path, dry_run=dry_run)
def main() -> None:
parser = argparse.ArgumentParser()
parser.add_argument("--dry-run", type=lambda x: not x or x[0].lower() != "f", default=True)
args = parser.parse_args()
redact(args.dry_run)
if args.dry_run:
print("Dry run, use --dry-run=false to actually redact files")
if __name__ == "__main__":
main()

View File

@ -1,18 +0,0 @@
from setuptools import setup
from setuptools_rust import Binding, RustExtension
setup(
name="tiktoken",
rust_extensions=[
RustExtension(
"tiktoken._tiktoken",
binding=Binding.PyO3,
# Between our use of editable installs and wanting to use Rust for performance sensitive
# code, it makes sense to just always use --release
debug=False,
)
],
package_data={"tiktoken": ["py.typed"]},
packages=["tiktoken", "tiktoken_ext"],
zip_safe=False,
)

View File

@ -5,10 +5,6 @@ use std::collections::HashSet;
use std::thread;
use fancy_regex::Regex;
use pyo3::exceptions;
use pyo3::prelude::*;
use pyo3::types::{PyBytes, PyList, PyTuple};
use pyo3::PyResult;
use rustc_hash::FxHashMap as HashMap;
fn _byte_pair_merge<T>(
@ -169,7 +165,6 @@ fn hash_current_thread() -> usize {
}
const MAX_NUM_THREADS: usize = 128;
#[pyclass]
struct CoreBPE {
encoder: HashMap<Vec<u8>, usize>,
special_tokens_encoder: HashMap<String, usize>,
@ -192,19 +187,96 @@ impl CoreBPE {
&self.special_regex_tls[hash_current_thread() % MAX_NUM_THREADS]
}
fn _decode_native(&self, tokens: &[usize]) -> Vec<u8> {
let mut ret = Vec::with_capacity(tokens.len() * 2);
for token in tokens {
let token_bytes = self
.decoder
.get(token)
.unwrap_or_else(|| &self.special_tokens_decoder[token]);
ret.extend(token_bytes);
fn _increase_last_piece_token_len(
&self,
tokens: Vec<usize>,
mut last_piece_token_len: usize,
) -> (Vec<usize>, usize) {
// Unfortunately, the locations where our regex splits can be unstable.
// For the purposes of determining unstable tokens, unstable regex splitting
// is only a problem if a split that was present disappears, since this can
// lead to merging of tokens otherwise thought to be stable.
// cl100k_base makes our life hard by including the \s*[\r\n]+
// pattern. This can e.g. cause "\n" + " " to become "\n \n".
// Here is a quick and dirty fix:
{
let token_is_all_space = |token| {
self.decoder
.get(token)
.map(|token_bytes| {
token_bytes
.iter()
.rev()
.all(|&b| [b' ', b'\n', b'\t'].contains(&b))
})
.unwrap_or(false)
};
if last_piece_token_len > 0
&& token_is_all_space(&tokens[tokens.len() - last_piece_token_len])
{
while (last_piece_token_len < tokens.len())
&& token_is_all_space(&tokens[tokens.len() - last_piece_token_len - 1])
{
last_piece_token_len += 1;
}
}
}
ret
debug_assert!(last_piece_token_len <= tokens.len());
(tokens, last_piece_token_len)
}
}
impl CoreBPE {
pub fn new(
encoder: HashMap<Vec<u8>, usize>,
special_tokens_encoder: HashMap<String, usize>,
pattern: &str,
) -> anyhow::Result<Self> {
let regex = Regex::new(pattern)
.map_err(|e| anyhow::anyhow!("Invalid regex: {}", e.to_string()))?;
let special_regex = {
let _parts = special_tokens_encoder
.keys()
.map(|s| fancy_regex::escape(s))
.collect::<Vec<_>>();
Regex::new(&_parts.join("|"))
.map_err(|e| anyhow::anyhow!("Invalid regex: {}", e.to_string()))?
};
let decoder: HashMap<usize, Vec<u8>> =
encoder.iter().map(|(k, v)| (*v, k.clone())).collect();
assert!(encoder.len() == decoder.len());
let special_tokens_decoder: HashMap<usize, Vec<u8>> = special_tokens_encoder
.iter()
.map(|(k, v)| (*v, k.as_bytes().to_vec()))
.collect();
// Clone because I don't know how to tell Rust I'm not going to change the map
let mut sorted_token_bytes: Vec<Vec<u8>> = encoder.keys().cloned().collect();
sorted_token_bytes.sort();
Ok(CoreBPE {
encoder,
special_tokens_encoder,
decoder,
special_tokens_decoder,
regex_tls: (0..MAX_NUM_THREADS).map(|_| regex.clone()).collect(),
special_regex_tls: (0..MAX_NUM_THREADS)
.map(|_| special_regex.clone())
.collect(),
sorted_token_bytes,
})
}
fn _encode_ordinary_native(&self, text: &str) -> Vec<usize> {
// ====================
// Encoding
// ====================
pub fn encode_ordinary(&self, text: &str) -> Vec<usize> {
// This is the core of the encoding logic; the other functions in here
// just make things complicated :-)
let regex = self._get_tl_regex();
@ -220,7 +292,7 @@ impl CoreBPE {
ret
}
fn _encode_native(&self, text: &str, allowed_special: &HashSet<&str>) -> (Vec<usize>, usize) {
pub fn encode(&self, text: &str, allowed_special: HashSet<&str>) -> (Vec<usize>, usize) {
let special_regex = self._get_tl_special_regex();
let regex = self._get_tl_regex();
let mut ret = vec![];
@ -276,51 +348,37 @@ impl CoreBPE {
(ret, last_piece_token_len)
}
fn _increase_last_piece_token_len(
&self,
tokens: Vec<usize>,
mut last_piece_token_len: usize,
) -> (Vec<usize>, usize) {
// Unfortunately, the locations where our regex splits can be unstable.
// For the purposes of determining unstable tokens, unstable regex splitting
// is only a problem if a split that was present disappears, since this can
// lead to merging of tokens otherwise thought to be stable.
// cl100k_base makes our life hard by including the \s*[\r\n]+
// pattern. This can e.g. cause "\n" + " " to become "\n \n".
// Here is a quick and dirty fix:
{
let token_is_all_space = |token| {
self.decoder
.get(token)
.map(|token_bytes| {
token_bytes
.iter()
.rev()
.all(|&b| [b' ', b'\n', b'\t'].contains(&b))
})
.unwrap_or(false)
};
if last_piece_token_len > 0
&& token_is_all_space(&tokens[tokens.len() - last_piece_token_len])
{
while (last_piece_token_len < tokens.len())
&& token_is_all_space(&tokens[tokens.len() - last_piece_token_len - 1])
{
last_piece_token_len += 1;
fn _encode_bytes(&self, bytes: &[u8]) -> Vec<usize> {
match std::str::from_utf8(bytes) {
Ok(text) => self.encode_ordinary(text),
Err(e) => {
let text = unsafe { std::str::from_utf8_unchecked(&bytes[..e.valid_up_to()]) };
let (tokens, last_piece_token_len) = self.encode(text, HashSet::new());
let (mut tokens, last_piece_token_len) =
self._increase_last_piece_token_len(tokens, last_piece_token_len);
if !tokens.is_empty() && last_piece_token_len > 0 {
// Lop off the tokens from the last piece and run BPE on the remaining bytes
// Somewhat niche, but this may not be correct if we'd have had a regex
// split between the valid UTF-8 and the invalid bytes, which is why this
// method is private
let mut unstable_bytes =
self.decode_bytes(&tokens[tokens.len() - last_piece_token_len..]);
unstable_bytes.extend_from_slice(&bytes[e.valid_up_to()..]);
tokens.truncate(tokens.len() - last_piece_token_len);
tokens.extend(byte_pair_encode(&unstable_bytes, &self.encoder));
}
tokens
}
}
debug_assert!(last_piece_token_len <= tokens.len());
(tokens, last_piece_token_len)
}
fn _encode_unstable_native(
pub fn encode_with_unstable(
&self,
text: &str,
allowed_special: &HashSet<&str>,
allowed_special: HashSet<&str>,
) -> (Vec<usize>, HashSet<Vec<usize>>) {
let (tokens, last_piece_token_len) = self._encode_native(text, allowed_special);
let (tokens, last_piece_token_len) = self.encode(text, allowed_special);
if last_piece_token_len == 0 {
// If last_piece_token_len is zero, the last token was a special token and we have
// no unstable bytes
@ -329,7 +387,7 @@ impl CoreBPE {
let (mut tokens, last_piece_token_len) =
self._increase_last_piece_token_len(tokens, last_piece_token_len);
let unstable_bytes = self._decode_native(&tokens[tokens.len() - last_piece_token_len..]);
let unstable_bytes = self.decode_bytes(&tokens[tokens.len() - last_piece_token_len..]);
tokens.truncate(tokens.len() - last_piece_token_len);
// TODO: we should try harder to find additional stable tokens
@ -377,7 +435,7 @@ impl CoreBPE {
// So convert to UTF-8 and do regex splitting.
// E.g. with cl100k_base " !" gets split to " " + " !",
// but byte_pair_encode(" !") != byte_pair_encode(" ")
Ok(s) => self._encode_ordinary_native(s),
Ok(s) => self.encode_ordinary(s),
// Technically, whether or not this arm is correct depends on whether there
// would be a regex split before the UTF-8 truncation point.
@ -430,108 +488,8 @@ impl CoreBPE {
(tokens, completions)
}
}
#[pymethods]
impl CoreBPE {
#[new]
fn new(
encoder: HashMap<Vec<u8>, usize>,
special_tokens_encoder: HashMap<String, usize>,
pattern: &str,
) -> PyResult<Self> {
let regex = Regex::new(pattern)
.map_err(|e| PyErr::new::<exceptions::PyValueError, _>(e.to_string()))?;
let special_regex = {
let _parts = special_tokens_encoder
.keys()
.map(|s| fancy_regex::escape(s))
.collect::<Vec<_>>();
Regex::new(&_parts.join("|"))
.map_err(|e| PyErr::new::<exceptions::PyValueError, _>(e.to_string()))?
};
let decoder: HashMap<usize, Vec<u8>> =
encoder.iter().map(|(k, v)| (*v, k.clone())).collect();
assert!(encoder.len() == decoder.len());
let special_tokens_decoder: HashMap<usize, Vec<u8>> = special_tokens_encoder
.iter()
.map(|(k, v)| (*v, k.as_bytes().to_vec()))
.collect();
// Clone because I don't know how to tell Rust I'm not going to change the map
let mut sorted_token_bytes: Vec<Vec<u8>> = encoder.keys().cloned().collect();
sorted_token_bytes.sort();
Ok(CoreBPE {
encoder,
special_tokens_encoder,
decoder,
special_tokens_decoder,
regex_tls: (0..MAX_NUM_THREADS).map(|_| regex.clone()).collect(),
special_regex_tls: (0..MAX_NUM_THREADS)
.map(|_| special_regex.clone())
.collect(),
sorted_token_bytes,
})
}
// ====================
// Encoding
// ====================
fn encode_ordinary(&self, py: Python, text: &str) -> Vec<usize> {
py.allow_threads(|| self._encode_ordinary_native(text))
}
fn encode(&self, py: Python, text: &str, allowed_special: HashSet<&str>) -> Vec<usize> {
py.allow_threads(|| self._encode_native(text, &allowed_special).0)
}
fn _encode_bytes(&self, py: Python, bytes: &[u8]) -> Vec<usize> {
py.allow_threads(|| {
match std::str::from_utf8(bytes) {
Ok(text) => self._encode_ordinary_native(text),
Err(e) => {
let text = unsafe { std::str::from_utf8_unchecked(&bytes[..e.valid_up_to()]) };
let (tokens, last_piece_token_len) = self._encode_native(text, &HashSet::new());
let (mut tokens, last_piece_token_len) =
self._increase_last_piece_token_len(tokens, last_piece_token_len);
if !tokens.is_empty() && last_piece_token_len > 0 {
// Lop off the tokens from the last piece and run BPE on the remaining bytes
// Somewhat niche, but this may not be correct if we'd have had a regex
// split between the valid UTF-8 and the invalid bytes, which is why this
// method is private
let mut unstable_bytes =
self._decode_native(&tokens[tokens.len() - last_piece_token_len..]);
unstable_bytes.extend_from_slice(&bytes[e.valid_up_to()..]);
tokens.truncate(tokens.len() - last_piece_token_len);
tokens.extend(byte_pair_encode(&unstable_bytes, &self.encoder));
}
tokens
}
}
})
}
fn encode_with_unstable(
&self,
py: Python,
text: &str,
allowed_special: HashSet<&str>,
) -> Py<PyTuple> {
let (tokens, completions) =
py.allow_threads(|| self._encode_unstable_native(text, &allowed_special));
let py_completions =
PyList::new(py, completions.iter().map(|seq| PyList::new(py, &seq[..])));
(tokens, py_completions).into_py(py)
}
fn encode_single_token(&self, piece: &[u8]) -> PyResult<usize> {
pub fn encode_single_token(&self, piece: &[u8]) -> anyhow::Result<usize> {
if let Some(token) = self.encoder.get(piece).copied() {
return Ok(token);
}
@ -540,10 +498,10 @@ impl CoreBPE {
return Ok(token);
}
}
Err(PyErr::new::<exceptions::PyKeyError, _>(piece.to_owned()))
Err(anyhow::anyhow!("Piece {:?} not found", piece))
}
fn encode_single_piece(&self, piece: &[u8]) -> Vec<usize> {
pub fn encode_single_piece(&self, piece: &[u8]) -> Vec<usize> {
if let Some(token) = self.encoder.get(piece) {
return vec![*token];
}
@ -554,39 +512,37 @@ impl CoreBPE {
// Decoding
// ====================
fn decode_bytes(&self, py: Python, tokens: Vec<usize>) -> Py<PyBytes> {
let bytes = py.allow_threads(|| self._decode_native(&tokens));
PyBytes::new(py, &bytes).into()
pub fn decode_bytes(&self, tokens: &[usize]) -> Vec<u8> {
let mut ret = Vec::with_capacity(tokens.len() * 2);
for token in tokens {
let token_bytes = self
.decoder
.get(token)
.unwrap_or_else(|| &self.special_tokens_decoder[token]);
ret.extend(token_bytes);
}
ret
}
fn decode_single_token_bytes(&self, py: Python, token: usize) -> PyResult<Py<PyBytes>> {
pub fn decode_single_token_bytes(&self, token: usize) -> anyhow::Result<&Vec<u8>> {
if let Some(bytes) = self.decoder.get(&token) {
return Ok(PyBytes::new(py, bytes).into());
return Ok(bytes);
}
if let Some(bytes) = self.special_tokens_decoder.get(&token) {
return Ok(PyBytes::new(py, bytes).into());
return Ok(bytes);
}
Err(PyErr::new::<exceptions::PyKeyError, _>(token.to_string()))
Err(anyhow::anyhow!("Token {} not found", token))
}
// ====================
// Miscellaneous
// ====================
fn token_byte_values(&self, py: Python) -> Vec<Py<PyBytes>> {
self.sorted_token_bytes
.iter()
.map(|x| PyBytes::new(py, x).into())
.collect()
pub fn token_byte_values(&self) -> &Vec<Vec<u8>> {
&self.sorted_token_bytes
}
}
#[pymodule]
fn _tiktoken(_py: Python, m: &PyModule) -> PyResult<()> {
m.add_class::<CoreBPE>()?;
Ok(())
}
#[cfg(test)]
mod tests {
use rustc_hash::FxHashMap as HashMap;
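With the PyO3 layer removed, `CoreBPE` is a plain Rust type whose constructor and encode/decode methods (shown above) return `anyhow::Result` instead of `PyResult`. The sketch below is not part of this commit; it only illustrates how the reworked API could be exercised from a test inside the crate, using a toy vocabulary of all 256 single-byte tokens (so no merges ever apply) and an illustrative `\S+|\s+` split pattern in place of a real BPE rank file.

```rust
#[cfg(test)]
mod usage_sketch {
    // Hypothetical usage example, not part of this commit.
    use std::collections::HashSet;

    use rustc_hash::FxHashMap as HashMap;

    use crate::CoreBPE;

    #[test]
    fn round_trip_with_toy_vocab() {
        // Toy vocabulary: every single byte is its own token, rank == byte value.
        let encoder: HashMap<Vec<u8>, usize> =
            (0..=255u8).map(|b| (vec![b], b as usize)).collect();
        let special_tokens: HashMap<String, usize> =
            [("<|endoftext|>".to_string(), 256)].into_iter().collect();

        // `new` now surfaces regex errors through anyhow rather than PyValueError.
        let bpe = CoreBPE::new(encoder, special_tokens, r"\S+|\s+").unwrap();

        // encode_ordinary ignores special tokens; decode_bytes reverses it.
        let tokens = bpe.encode_ordinary("hello world");
        assert_eq!(bpe.decode_bytes(&tokens), b"hello world".to_vec());

        // encode only emits special tokens that are explicitly allowed.
        let allowed: HashSet<&str> = ["<|endoftext|>"].into_iter().collect();
        let (tokens, _last_piece_token_len) = bpe.encode("hi<|endoftext|>", allowed);
        assert_eq!(tokens.last(), Some(&256));
    }
}
```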

View File

@ -1,42 +0,0 @@
import subprocess
import sys
import tiktoken
def test_simple():
# Note that there are more actual tests, they're just not currently public :-)
enc = tiktoken.get_encoding("gpt2")
assert enc.encode("hello world") == [31373, 995]
assert enc.decode([31373, 995]) == "hello world"
assert enc.encode("hello <|endoftext|>", allowed_special="all") == [31373, 220, 50256]
enc = tiktoken.get_encoding("cl100k_base")
assert enc.encode("hello world") == [15339, 1917]
assert enc.decode([15339, 1917]) == "hello world"
assert enc.encode("hello <|endoftext|>", allowed_special="all") == [15339, 220, 100257]
for enc_name in tiktoken.list_encoding_names():
enc = tiktoken.get_encoding(enc_name)
for token in range(10_000):
assert enc.encode_single_token(enc.decode_single_token_bytes(token)) == token
def test_encoding_for_model():
enc = tiktoken.encoding_for_model("gpt2")
assert enc.name == "gpt2"
enc = tiktoken.encoding_for_model("text-davinci-003")
assert enc.name == "p50k_base"
enc = tiktoken.encoding_for_model("text-davinci-edit-001")
assert enc.name == "p50k_edit"
enc = tiktoken.encoding_for_model("gpt-3.5-turbo-0301")
assert enc.name == "cl100k_base"
def test_optional_blobfile_dependency():
prog = """
import tiktoken
import sys
assert "blobfile" not in sys.modules
"""
subprocess.check_call([sys.executable, "-c", prog])

View File

@ -1,4 +0,0 @@
from .core import Encoding as Encoding
from .model import encoding_for_model as encoding_for_model
from .registry import get_encoding as get_encoding
from .registry import list_encoding_names as list_encoding_names

View File

@ -1,329 +0,0 @@
from __future__ import annotations
import functools
from concurrent.futures import ThreadPoolExecutor
from typing import AbstractSet, Collection, Literal, NoReturn, Optional, Union
import regex
from tiktoken import _tiktoken
class Encoding:
def __init__(
self,
name: str,
*,
pat_str: str,
mergeable_ranks: dict[bytes, int],
special_tokens: dict[str, int],
explicit_n_vocab: Optional[int] = None,
):
"""Creates an Encoding object.
See openai_public.py for examples of how to construct an Encoding object.
Args:
name: The name of the encoding. It should be clear from the name of the encoding
what behaviour to expect, in particular, encodings with different special tokens
should have different names.
pat_str: A regex pattern string that is used to split the input text.
mergeable_ranks: A dictionary mapping mergeable token bytes to their ranks. The ranks
must correspond to merge priority.
special_tokens: A dictionary mapping special token strings to their token values.
explicit_n_vocab: The number of tokens in the vocabulary. If provided, it is checked
that the number of mergeable tokens and special tokens is equal to this number.
"""
self.name = name
self._pat_str = pat_str
self._mergeable_ranks = mergeable_ranks
self._special_tokens = special_tokens
self.max_token_value = max(
max(mergeable_ranks.values()), max(special_tokens.values(), default=0)
)
if explicit_n_vocab:
assert len(mergeable_ranks) + len(special_tokens) == explicit_n_vocab
assert self.max_token_value == explicit_n_vocab - 1
self._core_bpe = _tiktoken.CoreBPE(mergeable_ranks, special_tokens, pat_str)
def __repr__(self) -> str:
return f"<Encoding {self.name!r}>"
# ====================
# Encoding
# ====================
def encode_ordinary(self, text: str) -> list[int]:
"""Encodes a string into tokens, ignoring special tokens.
This is equivalent to `encode(text, disallowed_special=())` (but slightly faster).
```
>>> enc.encode_ordinary("hello world")
[31373, 995]
"""
return self._core_bpe.encode_ordinary(text)
def encode(
self,
text: str,
*,
allowed_special: Union[Literal["all"], AbstractSet[str]] = set(), # noqa: B006
disallowed_special: Union[Literal["all"], Collection[str]] = "all",
) -> list[int]:
"""Encodes a string into tokens.
Special tokens are artificial tokens used to unlock capabilities from a model,
such as fill-in-the-middle. So we want to be careful about accidentally encoding special
tokens, since they can be used to trick a model into doing something we don't want it to do.
Hence, by default, encode will raise an error if it encounters text that corresponds
to a special token. This can be controlled on a per-token level using the `allowed_special`
and `disallowed_special` parameters. In particular:
- Setting `disallowed_special` to () will prevent this function from raising errors and
cause all text corresponding to special tokens to be encoded as natural text.
- Setting `allowed_special` to "all" will cause this function to treat all text
corresponding to special tokens to be encoded as special tokens.
```
>>> enc.encode("hello world")
[31373, 995]
>>> enc.encode("<|endoftext|>", allowed_special={"<|endoftext|>"})
[50256]
>>> enc.encode("<|endoftext|>", allowed_special="all")
[50256]
>>> enc.encode("<|endoftext|>")
# Raises ValueError
>>> enc.encode("<|endoftext|>", disallowed_special=())
[27, 91, 437, 1659, 5239, 91, 29]
```
"""
if allowed_special == "all":
allowed_special = self.special_tokens_set
if disallowed_special == "all":
disallowed_special = self.special_tokens_set - allowed_special
if disallowed_special:
if not isinstance(disallowed_special, frozenset):
disallowed_special = frozenset(disallowed_special)
if match := _special_token_regex(disallowed_special).search(text):
raise_disallowed_special_token(match.group())
return self._core_bpe.encode(text, allowed_special)
def encode_ordinary_batch(self, text: list[str], *, num_threads: int = 8) -> list[list[int]]:
"""Encodes a list of strings into tokens, in parallel, ignoring special tokens.
This is equivalent to `encode_batch(text, disallowed_special=())` (but slightly faster).
```
>>> enc.encode_ordinary_batch(["hello world", "goodbye world"])
[[31373, 995], [11274, 16390, 995]]
```
"""
encoder = functools.partial(self.encode_ordinary)
with ThreadPoolExecutor(num_threads) as e:
return list(e.map(encoder, text))
def encode_batch(
self,
text: list[str],
*,
num_threads: int = 8,
allowed_special: Union[Literal["all"], AbstractSet[str]] = set(), # noqa: B006
disallowed_special: Union[Literal["all"], Collection[str]] = "all",
) -> list[list[int]]:
"""Encodes a list of strings into tokens, in parallel.
See `encode` for more details on `allowed_special` and `disallowed_special`.
```
>>> enc.encode_batch(["hello world", "goodbye world"])
[[31373, 995], [11274, 16390, 995]]
```
"""
if allowed_special == "all":
allowed_special = self.special_tokens_set
if disallowed_special == "all":
disallowed_special = self.special_tokens_set - allowed_special
if not isinstance(disallowed_special, frozenset):
disallowed_special = frozenset(disallowed_special)
encoder = functools.partial(
self.encode, allowed_special=allowed_special, disallowed_special=disallowed_special
)
with ThreadPoolExecutor(num_threads) as e:
return list(e.map(encoder, text))
def encode_with_unstable(
self,
text: str,
*,
allowed_special: Union[Literal["all"], AbstractSet[str]] = set(), # noqa: B006
disallowed_special: Union[Literal["all"], Collection[str]] = "all",
) -> tuple[list[int], list[list[int]]]:
"""Encodes a string into stable tokens and possible completion sequences.
Note that the stable tokens will only represent a substring of `text`.
See `encode` for more details on `allowed_special` and `disallowed_special`.
This API should itself be considered unstable.
```
>>> enc.encode_with_unstable("hello fanta")
([31373], [(277, 4910), (5113, 265), ..., (8842,)])
>>> text = "..."
>>> stable_tokens, completions = enc.encode_with_unstable(text)
>>> assert text.encode().startswith(enc.decode_bytes(stable_tokens))
>>> assert all(enc.decode_bytes(stable_tokens + seq).startswith(text.encode()) for seq in completions)
```
"""
if allowed_special == "all":
allowed_special = self.special_tokens_set
if disallowed_special == "all":
disallowed_special = self.special_tokens_set - allowed_special
if disallowed_special:
if not isinstance(disallowed_special, frozenset):
disallowed_special = frozenset(disallowed_special)
if match := _special_token_regex(disallowed_special).search(text):
raise_disallowed_special_token(match.group())
return self._core_bpe.encode_with_unstable(text, allowed_special)
def encode_single_token(self, text_or_bytes: Union[str, bytes]) -> int:
"""Encodes text corresponding to a single token to its token value.
NOTE: this will encode all special tokens.
Raises `KeyError` if the token is not in the vocabulary.
```
>>> enc.encode_single_token("hello")
31373
```
"""
if isinstance(text_or_bytes, str):
text_or_bytes = text_or_bytes.encode("utf-8")
return self._core_bpe.encode_single_token(text_or_bytes)
# ====================
# Decoding
# ====================
def decode_bytes(self, tokens: list[int]) -> bytes:
"""Decodes a list of tokens into bytes.
```
>>> enc.decode_bytes([31373, 995])
b'hello world'
```
"""
return self._core_bpe.decode_bytes(tokens)
def decode(self, tokens: list[int], errors: str = "replace") -> str:
"""Decodes a list of tokens into a string.
WARNING: the default behaviour of this function is lossy, since decoded bytes are not
guaranteed to be valid UTF-8. You can control this behaviour using the `errors` parameter,
for instance, setting `errors="strict"`.
```
>>> enc.decode([31373, 995])
'hello world'
```
"""
return self._core_bpe.decode_bytes(tokens).decode("utf-8", errors=errors)
def decode_single_token_bytes(self, token: int) -> bytes:
"""Decodes a token into bytes.
NOTE: this will decode all special tokens.
Raises `KeyError` if the token is not in the vocabulary.
```
>>> enc.decode_single_token_bytes(31373)
b'hello'
```
"""
return self._core_bpe.decode_single_token_bytes(token)
def decode_tokens_bytes(self, tokens: list[int]) -> list[bytes]:
"""Decodes a list of tokens into a list of bytes.
Useful for visualising tokenisation.
>>> enc.decode_tokens_bytes([31373, 995])
[b'hello', b' world']
"""
return [self.decode_single_token_bytes(token) for token in tokens]
# ====================
# Miscellaneous
# ====================
def token_byte_values(self) -> list[bytes]:
"""Returns the list of all token byte values."""
return self._core_bpe.token_byte_values()
@property
def eot_token(self) -> int:
return self._special_tokens["<|endoftext|>"]
@functools.cached_property
def special_tokens_set(self) -> set[str]:
return set(self._special_tokens.keys())
@property
def n_vocab(self) -> int:
"""For backwards compatibility. Prefer to use `enc.max_token_value + 1`."""
return self.max_token_value + 1
# ====================
# Private
# ====================
def _encode_single_piece(self, text_or_bytes: Union[str, bytes]) -> list[int]:
"""Encodes text corresponding to bytes without a regex split.
NOTE: this will not encode any special tokens.
```
>>> enc.encode_single_piece("helloqqqq")
[31373, 38227, 38227]
```
"""
if isinstance(text_or_bytes, str):
text_or_bytes = text_or_bytes.encode("utf-8")
return self._core_bpe.encode_single_piece(text_or_bytes)
def _encode_only_native_bpe(self, text: str) -> list[int]:
"""Encodes a string into tokens, but do regex splitting in Python."""
_unused_pat = regex.compile(self._pat_str)
ret = []
for piece in regex.findall(_unused_pat, text):
ret.extend(self._core_bpe.encode_single_piece(piece))
return ret
def _encode_bytes(self, text: bytes) -> list[int]:
return self._core_bpe._encode_bytes(text)
@functools.lru_cache(maxsize=128)
def _special_token_regex(tokens: frozenset[str]) -> "regex.Pattern[str]":
inner = "|".join(regex.escape(token) for token in tokens)
return regex.compile(f"({inner})")
def raise_disallowed_special_token(token: str) -> NoReturn:
raise ValueError(
f"Encountered text corresponding to disallowed special token {token!r}.\n"
"If you want this text to be encoded as a special token, "
f"pass it to `allowed_special`, e.g. `allowed_special={{{token!r}, ...}}`.\n"
f"If you want this text to be encoded as normal text, disable the check for this token "
f"by passing `disallowed_special=(enc.special_tokens_set - {{{token!r}}})`.\n"
"To disable this check for all special tokens, pass `disallowed_special=()`.\n"
)

View File

@ -1,118 +0,0 @@
from __future__ import annotations
import base64
import hashlib
import json
import os
import tempfile
import uuid
import requests
def read_file(blobpath: str) -> bytes:
if not blobpath.startswith("http://") and not blobpath.startswith("https://"):
try:
import blobfile
except ImportError:
raise ImportError(
"blobfile is not installed. Please install it by running `pip install blobfile`."
)
with blobfile.BlobFile(blobpath, "rb") as f:
return f.read()
# avoiding blobfile for public files helps avoid auth issues, like MFA prompts
return requests.get(blobpath).content
def read_file_cached(blobpath: str) -> bytes:
if "TIKTOKEN_CACHE_DIR" in os.environ:
cache_dir = os.environ["TIKTOKEN_CACHE_DIR"]
elif "DATA_GYM_CACHE_DIR" in os.environ:
cache_dir = os.environ["DATA_GYM_CACHE_DIR"]
else:
cache_dir = os.path.join(tempfile.gettempdir(), "data-gym-cache")
if cache_dir == "":
# disable caching
return read_file(blobpath)
cache_key = hashlib.sha1(blobpath.encode()).hexdigest()
cache_path = os.path.join(cache_dir, cache_key)
if os.path.exists(cache_path):
with open(cache_path, "rb") as f:
return f.read()
contents = read_file(blobpath)
os.makedirs(cache_dir, exist_ok=True)
tmp_filename = cache_path + "." + str(uuid.uuid4()) + ".tmp"
with open(tmp_filename, "wb") as f:
f.write(contents)
os.rename(tmp_filename, cache_path)
return contents
def data_gym_to_mergeable_bpe_ranks(
vocab_bpe_file: str, encoder_json_file: str
) -> dict[bytes, int]:
# NB: do not add caching to this function
rank_to_intbyte = [b for b in range(2**8) if chr(b).isprintable() and chr(b) != " "]
data_gym_byte_to_byte = {chr(b): b for b in rank_to_intbyte}
n = 0
for b in range(2**8):
if b not in rank_to_intbyte:
rank_to_intbyte.append(b)
data_gym_byte_to_byte[chr(2**8 + n)] = b
n += 1
assert len(rank_to_intbyte) == 2**8
# vocab_bpe contains the merges along with associated ranks
vocab_bpe_contents = read_file_cached(vocab_bpe_file).decode()
bpe_merges = [tuple(merge_str.split()) for merge_str in vocab_bpe_contents.split("\n")[1:-1]]
def decode_data_gym(value: str) -> bytes:
return bytes(data_gym_byte_to_byte[b] for b in value)
# add the single byte tokens
bpe_ranks = {bytes([b]): i for i, b in enumerate(rank_to_intbyte)}
# add the merged tokens
n = len(bpe_ranks)
for first, second in bpe_merges:
bpe_ranks[decode_data_gym(first) + decode_data_gym(second)] = n
n += 1
# check that the encoder file matches the merges file
# this sanity check is important since tiktoken assumes that ranks are ordered the same
# as merge priority
encoder_json = json.loads(read_file_cached(encoder_json_file))
encoder_json_loaded = {decode_data_gym(k): v for k, v in encoder_json.items()}
# drop these two special tokens if present, since they're not mergeable bpe tokens
encoder_json_loaded.pop(b"<|endoftext|>", None)
encoder_json_loaded.pop(b"<|startoftext|>", None)
assert bpe_ranks == encoder_json_loaded
return bpe_ranks
def dump_tiktoken_bpe(bpe_ranks: dict[bytes, int], tiktoken_bpe_file: str) -> None:
try:
import blobfile
except ImportError:
raise ImportError(
"blobfile is not installed. Please install it by running `pip install blobfile`."
)
with blobfile.BlobFile(tiktoken_bpe_file, "wb") as f:
for token, rank in sorted(bpe_ranks.items(), key=lambda x: x[1]):
f.write(base64.b64encode(token) + b" " + str(rank).encode() + b"\n")
def load_tiktoken_bpe(tiktoken_bpe_file: str) -> dict[bytes, int]:
# NB: do not add caching to this function
contents = read_file_cached(tiktoken_bpe_file)
return {
base64.b64decode(token): int(rank)
for token, rank in (line.split() for line in contents.splitlines() if line)
}
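`dump_tiktoken_bpe` and `load_tiktoken_bpe` above define the on-disk rank format: one base64-encoded token, a space, and its decimal rank per line. With the Python loader removed, a Rust consumer of `CoreBPE` would need an equivalent; the sketch below is hypothetical (it is not added by this commit and assumes the `base64` crate, which is not currently a dependency of the Rust crate).

```rust
// Hypothetical Rust counterpart to load_tiktoken_bpe (not part of this commit).
// Assumes the `base64` crate is added as a dependency.
use anyhow::Context;
use base64::{engine::general_purpose::STANDARD, Engine as _};
use rustc_hash::FxHashMap as HashMap;

/// Parse "<base64 token> <rank>" lines into the map expected by CoreBPE::new.
fn parse_tiktoken_bpe(contents: &str) -> anyhow::Result<HashMap<Vec<u8>, usize>> {
    let mut ranks = HashMap::default();
    for line in contents.lines().filter(|l| !l.is_empty()) {
        let (token_b64, rank) = line
            .split_once(' ')
            .context("expected '<base64 token> <rank>' per line")?;
        ranks.insert(STANDARD.decode(token_b64)?, rank.parse::<usize>()?);
    }
    Ok(ranks)
}
```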

View File

@ -1,75 +0,0 @@
from __future__ import annotations
from .core import Encoding
from .registry import get_encoding
# TODO: these will likely be replaced by an API endpoint
MODEL_PREFIX_TO_ENCODING: dict[str, str] = {
# chat
"gpt-4-": "cl100k_base", # e.g., gpt-4-0314, etc., plus gpt-4-32k
"gpt-3.5-turbo-": "cl100k_base", # e.g, gpt-3.5-turbo-0301, -0401, etc.
}
MODEL_TO_ENCODING: dict[str, str] = {
# chat
"gpt-4": "cl100k_base",
"gpt-3.5-turbo": "cl100k_base",
# text
"text-davinci-003": "p50k_base",
"text-davinci-002": "p50k_base",
"text-davinci-001": "r50k_base",
"text-curie-001": "r50k_base",
"text-babbage-001": "r50k_base",
"text-ada-001": "r50k_base",
"davinci": "r50k_base",
"curie": "r50k_base",
"babbage": "r50k_base",
"ada": "r50k_base",
# code
"code-davinci-002": "p50k_base",
"code-davinci-001": "p50k_base",
"code-cushman-002": "p50k_base",
"code-cushman-001": "p50k_base",
"davinci-codex": "p50k_base",
"cushman-codex": "p50k_base",
# edit
"text-davinci-edit-001": "p50k_edit",
"code-davinci-edit-001": "p50k_edit",
# embeddings
"text-embedding-ada-002": "cl100k_base",
# old embeddings
"text-similarity-davinci-001": "r50k_base",
"text-similarity-curie-001": "r50k_base",
"text-similarity-babbage-001": "r50k_base",
"text-similarity-ada-001": "r50k_base",
"text-search-davinci-doc-001": "r50k_base",
"text-search-curie-doc-001": "r50k_base",
"text-search-babbage-doc-001": "r50k_base",
"text-search-ada-doc-001": "r50k_base",
"code-search-babbage-code-001": "r50k_base",
"code-search-ada-code-001": "r50k_base",
# open source
"gpt2": "gpt2",
}
def encoding_for_model(model_name: str) -> Encoding:
"""Returns the encoding used by a model."""
encoding_name = None
if model_name in MODEL_TO_ENCODING:
encoding_name = MODEL_TO_ENCODING[model_name]
else:
# Check if the model matches a known prefix
# Prefix matching avoids needing library updates for every model version release
# Note that this can match on non-existent models (e.g., gpt-3.5-turbo-FAKE)
for model_prefix, model_encoding_name in MODEL_PREFIX_TO_ENCODING.items():
if model_name.startswith(model_prefix):
return get_encoding(model_encoding_name)
if encoding_name is None:
raise KeyError(
f"Could not automatically map {model_name} to a tokeniser. "
"Please use `tiktok.get_encoding` to explicitly get the tokeniser you expect."
) from None
return get_encoding(encoding_name)

View File

View File

@ -1,73 +0,0 @@
from __future__ import annotations
import importlib
import pkgutil
import threading
from typing import Any, Callable, Optional
import tiktoken_ext
from tiktoken.core import Encoding
_lock = threading.RLock()
ENCODINGS: dict[str, Encoding] = {}
ENCODING_CONSTRUCTORS: Optional[dict[str, Callable[[], dict[str, Any]]]] = None
def _find_constructors() -> None:
global ENCODING_CONSTRUCTORS
with _lock:
if ENCODING_CONSTRUCTORS is not None:
return
ENCODING_CONSTRUCTORS = {}
# tiktoken_ext is a namespace package
# submodules inside tiktoken_ext will be inspected for ENCODING_CONSTRUCTORS attributes
# - we use namespace package pattern so `pkgutil.iter_modules` is fast
# - it's a separate top-level package because namespace subpackages of non-namespace
# packages don't quite do what you want with editable installs
plugin_mods = pkgutil.iter_modules(tiktoken_ext.__path__, tiktoken_ext.__name__ + ".")
for _, mod_name, _ in plugin_mods:
mod = importlib.import_module(mod_name)
try:
constructors = mod.ENCODING_CONSTRUCTORS
except AttributeError as e:
raise ValueError(
f"tiktoken plugin {mod_name} does not define ENCODING_CONSTRUCTORS"
) from e
for enc_name, constructor in constructors.items():
if enc_name in ENCODING_CONSTRUCTORS:
raise ValueError(
f"Duplicate encoding name {enc_name} in tiktoken plugin {mod_name}"
)
ENCODING_CONSTRUCTORS[enc_name] = constructor
def get_encoding(encoding_name: str) -> Encoding:
if encoding_name in ENCODINGS:
return ENCODINGS[encoding_name]
with _lock:
if encoding_name in ENCODINGS:
return ENCODINGS[encoding_name]
if ENCODING_CONSTRUCTORS is None:
_find_constructors()
assert ENCODING_CONSTRUCTORS is not None
if encoding_name not in ENCODING_CONSTRUCTORS:
raise ValueError(f"Unknown encoding {encoding_name}")
constructor = ENCODING_CONSTRUCTORS[encoding_name]
enc = Encoding(**constructor())
ENCODINGS[encoding_name] = enc
return enc
def list_encoding_names() -> list[str]:
with _lock:
if ENCODING_CONSTRUCTORS is None:
_find_constructors()
assert ENCODING_CONSTRUCTORS is not None
return list(ENCODING_CONSTRUCTORS)

View File

@ -1,88 +0,0 @@
from tiktoken.load import data_gym_to_mergeable_bpe_ranks, load_tiktoken_bpe
ENDOFTEXT = "<|endoftext|>"
FIM_PREFIX = "<|fim_prefix|>"
FIM_MIDDLE = "<|fim_middle|>"
FIM_SUFFIX = "<|fim_suffix|>"
ENDOFPROMPT = "<|endofprompt|>"
def gpt2():
mergeable_ranks = data_gym_to_mergeable_bpe_ranks(
vocab_bpe_file="https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/vocab.bpe",
encoder_json_file="https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/encoder.json",
)
return {
"name": "gpt2",
"explicit_n_vocab": 50257,
"pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
"mergeable_ranks": mergeable_ranks,
"special_tokens": {"<|endoftext|>": 50256},
}
def r50k_base():
mergeable_ranks = load_tiktoken_bpe(
"https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken"
)
return {
"name": "r50k_base",
"explicit_n_vocab": 50257,
"pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
"mergeable_ranks": mergeable_ranks,
"special_tokens": {ENDOFTEXT: 50256},
}
def p50k_base():
mergeable_ranks = load_tiktoken_bpe(
"https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken"
)
return {
"name": "p50k_base",
"explicit_n_vocab": 50281,
"pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
"mergeable_ranks": mergeable_ranks,
"special_tokens": {ENDOFTEXT: 50256},
}
def p50k_edit():
mergeable_ranks = load_tiktoken_bpe(
"https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken"
)
special_tokens = {ENDOFTEXT: 50256, FIM_PREFIX: 50281, FIM_MIDDLE: 50282, FIM_SUFFIX: 50283}
return {
"name": "p50k_edit",
"pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
"mergeable_ranks": mergeable_ranks,
"special_tokens": special_tokens,
}
def cl100k_base():
mergeable_ranks = load_tiktoken_bpe(
"https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"
)
special_tokens = {
ENDOFTEXT: 100257,
FIM_PREFIX: 100258,
FIM_MIDDLE: 100259,
FIM_SUFFIX: 100260,
ENDOFPROMPT: 100276,
}
return {
"name": "cl100k_base",
"pat_str": r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""",
"mergeable_ranks": mergeable_ranks,
"special_tokens": special_tokens,
}
ENCODING_CONSTRUCTORS = {
"gpt2": gpt2,
"r50k_base": r50k_base,
"p50k_base": p50k_base,
"p50k_edit": p50k_edit,
"cl100k_base": cl100k_base,
}