Remove python

parent 82facf911f
commit 85a4f9dbb0
83  .github/workflows/build_wheels.yml  (vendored)
@@ -1,83 +0,0 @@
name: Build wheels

on: [push, pull_request, workflow_dispatch]

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

jobs:
  build_wheels:
    name: py${{ matrix.python-version }} on ${{ matrix.os }}
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        # cibuildwheel builds linux wheels inside a manylinux container
        # it also takes care of procuring the correct python version for us
        os: [ubuntu-latest, windows-latest, macos-latest]
        python-version: [38, 39, 310, 311]

    steps:
      - uses: actions/checkout@v3

      - uses: pypa/cibuildwheel@v2.11.3
        env:
          CIBW_BUILD: "cp${{ matrix.python-version}}-*"

      - uses: actions/upload-artifact@v3
        with:
          name: dist
          path: ./wheelhouse/*.whl

  build_wheels_aarch64:
    name: py${{ matrix.python-version }} on ${{ matrix.os }} (aarch64)
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-latest]
        python-version: [38, 39, 310, 311]

    steps:
      - uses: actions/checkout@v3

      - name: Setup up QEMU
        uses: docker/setup-qemu-action@v2
        with:
          platforms: arm64

      - name: Build wheels
        uses: pypa/cibuildwheel@v2.11.3
        env:
          CIBW_BUILD: "cp${{ matrix.python-version}}-*"
          CIBW_ARCHS: aarch64
          CIBW_BUILD_VERBOSITY: 3
          # https://github.com/rust-lang/cargo/issues/10583
          CIBW_ENVIRONMENT_LINUX: PATH="$PATH:$HOME/.cargo/bin" CARGO_NET_GIT_FETCH_WITH_CLI=true
      - uses: actions/upload-artifact@v3
        with:
          name: dist
          path: ./wheelhouse/*.whl

  build_sdist:
    name: sdist
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - uses: actions/setup-python@v4
        name: Install Python
        with:
          python-version: "3.9"
      - name: Run check-manifest
        run: |
          pip install check-manifest
          check-manifest -v
      - name: Build sdist
        run: |
          pip install --upgrade build
          python -m build --sdist
      - uses: actions/upload-artifact@v3
        with:
          name: dist
          path: ./dist/*.tar.gz
35  CHANGELOG.md
@@ -1,35 +0,0 @@
# Changelog

This is the changelog for the open source version of tiktoken.

## [v0.3.2]
- Add encoding for GPT-4

## [v0.3.1]
- Build aarch64 wheels
- Make `blobfile` an optional dependency

Thank you to @messense for the environment variable that makes cargo not OOM under emulation!

## [v0.3.0]
- Improve performance by 5-20%; thank you to @nistath!
- Add `gpt-3.5-turbo` models to `encoding_for_model`
- Add prefix matching to `encoding_for_model` to better support future model versions
- Fix a bug in the README instructions on extending tiktoken
- Update the set of available encodings
- Add packaging metadata

## [v0.2.0]
- Add ``tiktoken.encoding_for_model`` to get the encoding for a specific model
- Improve portability of caching logic

Thank you to @fritzo, @arvid220u, @khanhvu207, @henriktorget for various small corrections

## [v0.1.2]
- Avoid use of `blobfile` for public files
- Add support for Python 3.8
- Add py.typed
- Improve the public tests

## [v0.1.1]
- Initial release
Cargo.toml
@@ -5,17 +5,14 @@ edition = "2021"
rust-version = "1.57.0"

[lib]
name = "_tiktoken"
crate-type = ["cdylib"]
name = "tiktoken"

[dependencies]
pyo3 = { version = "0.17.3", features = ["extension-module"] }

# tiktoken dependencies
fancy-regex = "0.10.0"
fancy-regex = "0.11.0"
regex = "1.7.0"
rustc-hash = "1.1.0"
bstr = "1.0.1"
anyhow = "1.0.70"

[profile.release]
incremental = true
MANIFEST.in
@@ -1,8 +0,0 @@
include *.svg
include *.toml
include *.md
include Makefile
global-include py.typed
recursive-include scripts *.py
recursive-include tests *.py
recursive-include src *.rs
374  perf.svg
@@ -1,374 +0,0 @@
[deleted SVG image, 16 KiB: bar chart of throughput (0-40 MB/s) vs. thread count (1-64), comparing tiktoken and huggingface]
pyproject.toml
@@ -1,41 +0,0 @@
[project]
name = "tiktoken"
version = "0.3.2"
description = "tiktoken is a fast BPE tokeniser for use with OpenAI's models"
readme = "README.md"
license = {file = "LICENSE"}
authors = [{name = "Shantanu Jain"}, {email = "shantanu@openai.com"}]
dependencies = ["regex>=2022.1.18", "requests>=2.26.0"]
optional-dependencies = {blobfile = ["blobfile>=2"]}
requires-python = ">=3.8"

[project.urls]
homepage = "https://github.com/openai/tiktoken"
repository = "https://github.com/openai/tiktoken"
changelog = "https://github.com/openai/tiktoken/blob/main/CHANGELOG.md"

[build-system]
build-backend = "setuptools.build_meta"
requires = ["setuptools>=62.4", "wheel", "setuptools-rust>=1.5.2"]

[tool.cibuildwheel]
build-frontend = "build"
build-verbosity = 1

linux.before-all = "curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y"
linux.environment = { PATH = "$PATH:$HOME/.cargo/bin" }
macos.before-all = "rustup target add aarch64-apple-darwin"

skip = [
    "*-manylinux_i686",
    "*-musllinux_i686",
    "*-win32",
]
macos.archs = ["x86_64", "arm64"]
# When cross-compiling on Intel, it is not possible to test arm64 wheels.
# Warnings will be silenced with following CIBW_TEST_SKIP
test-skip = "*-macosx_arm64"

before-test = "pip install pytest"
test-command = "pytest {project}/tests"
scripts/benchmark.py
@@ -1,39 +0,0 @@
import base64
import functools
import gzip
import json
import os
import random
import time
from typing import Any, cast

import blobfile

import tiktoken


def benchmark_batch(documents: list[str]) -> None:
    num_threads = int(os.environ["RAYON_NUM_THREADS"])
    num_bytes = sum(map(len, map(str.encode, documents)))
    print(f"num_threads: {num_threads}, num_bytes: {num_bytes}")

    enc = tiktoken.get_encoding("gpt2")
    enc.encode("warmup")

    start = time.perf_counter_ns()
    enc.encode_ordinary_batch(documents, num_threads=num_threads)
    end = time.perf_counter_ns()
    print(f"tiktoken \t{num_bytes / (end - start) * 1e9} bytes / s")

    import transformers

    hf_enc = cast(Any, transformers).GPT2TokenizerFast.from_pretrained("gpt2")
    hf_enc.model_max_length = 1e30  # silence!
    hf_enc.encode("warmup")

    start = time.perf_counter_ns()
    hf_enc(documents)
    end = time.perf_counter_ns()
    print(f"huggingface \t{num_bytes / (end - start) * 1e9} bytes / s")
scripts/redact.py
@@ -1,67 +0,0 @@
import argparse
import re
import subprocess
from pathlib import Path


def redact_file(path: Path, dry_run: bool) -> None:
    if not path.exists() or path.is_dir():
        return

    text = path.read_text()
    if not text:
        return

    first_line = text.splitlines()[0]
    if "redact" in first_line:
        if not dry_run:
            path.unlink()
        print(f"Deleted {path}")
        return

    pattern = "|".join(
        re.escape(x)
        for x in [
            "# ===== redact-beg =====\n",
            "# ===== redact-end =====\n",
            "<!--- redact-beg -->\n",
            "<!--- redact-end -->\n",
        ]
    )

    if re.search(pattern, text):
        redacted_text = "".join(re.split(pattern, text)[::2])
        if not dry_run:
            path.write_text(redacted_text)
        print(f"Redacted {path}")
        return

    print(f"Skipped {path}")


def redact(dry_run: bool) -> None:
    tiktoken_root = Path(__file__).parent.parent
    assert tiktoken_root.name == "tiktoken"
    assert (tiktoken_root / "pyproject.toml").exists()

    try:
        output = subprocess.check_output(["git", "ls-files"], cwd=tiktoken_root, text=True)
        paths = [Path(p) for p in output.splitlines()]
    except subprocess.CalledProcessError:
        paths = list(tiktoken_root.glob("**/*"))

    for path in paths:
        redact_file(path, dry_run=dry_run)


def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", type=lambda x: not x or x[0].lower() != "f", default=True)
    args = parser.parse_args()
    redact(args.dry_run)
    if args.dry_run:
        print("Dry run, use --dry-run=false to actually redact files")


if __name__ == "__main__":
    main()
18  setup.py
@@ -1,18 +0,0 @@
from setuptools import setup
from setuptools_rust import Binding, RustExtension

setup(
    name="tiktoken",
    rust_extensions=[
        RustExtension(
            "tiktoken._tiktoken",
            binding=Binding.PyO3,
            # Between our use of editable installs and wanting to use Rust for performance sensitive
            # code, it makes sense to just always use --release
            debug=False,
        )
    ],
    package_data={"tiktoken": ["py.typed"]},
    packages=["tiktoken", "tiktoken_ext"],
    zip_safe=False,
)
308  src/lib.rs
@@ -5,10 +5,6 @@ use std::collections::HashSet;
|
||||
use std::thread;
|
||||
|
||||
use fancy_regex::Regex;
|
||||
use pyo3::exceptions;
|
||||
use pyo3::prelude::*;
|
||||
use pyo3::types::{PyBytes, PyList, PyTuple};
|
||||
use pyo3::PyResult;
|
||||
use rustc_hash::FxHashMap as HashMap;
|
||||
|
||||
fn _byte_pair_merge<T>(
|
||||
@@ -169,7 +165,6 @@ fn hash_current_thread() -> usize {
|
||||
}
|
||||
|
||||
const MAX_NUM_THREADS: usize = 128;
|
||||
#[pyclass]
|
||||
struct CoreBPE {
|
||||
encoder: HashMap<Vec<u8>, usize>,
|
||||
special_tokens_encoder: HashMap<String, usize>,
|
||||
@@ -192,19 +187,96 @@ impl CoreBPE {
|
||||
&self.special_regex_tls[hash_current_thread() % MAX_NUM_THREADS]
|
||||
}
|
||||
|
||||
fn _decode_native(&self, tokens: &[usize]) -> Vec<u8> {
|
||||
let mut ret = Vec::with_capacity(tokens.len() * 2);
|
||||
for token in tokens {
|
||||
let token_bytes = self
|
||||
.decoder
|
||||
.get(token)
|
||||
.unwrap_or_else(|| &self.special_tokens_decoder[token]);
|
||||
ret.extend(token_bytes);
|
||||
fn _increase_last_piece_token_len(
|
||||
&self,
|
||||
tokens: Vec<usize>,
|
||||
mut last_piece_token_len: usize,
|
||||
) -> (Vec<usize>, usize) {
|
||||
// Unfortunately, the locations where our regex splits can be unstable.
|
||||
// For the purposes of determining unstable tokens, unstable regex splitting
|
||||
// is only a problem if a split that was present disappears, since this can
|
||||
// lead to merging of tokens otherwise thought to be stable.
|
||||
// cl100k_base makes our life hard by including the \s*[\r\n]+
|
||||
// pattern. This can e.g. cause "\n" + " " to become "\n \n".
|
||||
// Here is a quick and dirty fix:
|
||||
{
|
||||
let token_is_all_space = |token| {
|
||||
self.decoder
|
||||
.get(token)
|
||||
.map(|token_bytes| {
|
||||
token_bytes
|
||||
.iter()
|
||||
.rev()
|
||||
.all(|&b| [b' ', b'\n', b'\t'].contains(&b))
|
||||
})
|
||||
.unwrap_or(false)
|
||||
};
|
||||
if last_piece_token_len > 0
|
||||
&& token_is_all_space(&tokens[tokens.len() - last_piece_token_len])
|
||||
{
|
||||
while (last_piece_token_len < tokens.len())
|
||||
&& token_is_all_space(&tokens[tokens.len() - last_piece_token_len - 1])
|
||||
{
|
||||
last_piece_token_len += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
ret
|
||||
debug_assert!(last_piece_token_len <= tokens.len());
|
||||
|
||||
(tokens, last_piece_token_len)
|
||||
}
|
||||
}
|
||||
|
||||
impl CoreBPE {
|
||||
pub fn new(
|
||||
encoder: HashMap<Vec<u8>, usize>,
|
||||
special_tokens_encoder: HashMap<String, usize>,
|
||||
pattern: &str,
|
||||
) -> anyhow::Result<Self> {
|
||||
let regex = Regex::new(pattern)
|
||||
.map_err(|e| anyhow::anyhow!("Invalid regex: {}", e.to_string()))?;
|
||||
|
||||
let special_regex = {
|
||||
let _parts = special_tokens_encoder
|
||||
.keys()
|
||||
.map(|s| fancy_regex::escape(s))
|
||||
.collect::<Vec<_>>();
|
||||
Regex::new(&_parts.join("|"))
|
||||
.map_err(|e| anyhow::anyhow!("Invalid regex: {}", e.to_string()))?
|
||||
};
|
||||
|
||||
let decoder: HashMap<usize, Vec<u8>> =
|
||||
encoder.iter().map(|(k, v)| (*v, k.clone())).collect();
|
||||
|
||||
assert!(encoder.len() == decoder.len());
|
||||
|
||||
let special_tokens_decoder: HashMap<usize, Vec<u8>> = special_tokens_encoder
|
||||
.iter()
|
||||
.map(|(k, v)| (*v, k.as_bytes().to_vec()))
|
||||
.collect();
|
||||
|
||||
// Clone because I don't know how to tell Rust I'm not going to change the map
|
||||
let mut sorted_token_bytes: Vec<Vec<u8>> = encoder.keys().cloned().collect();
|
||||
sorted_token_bytes.sort();
|
||||
|
||||
Ok(CoreBPE {
|
||||
encoder,
|
||||
special_tokens_encoder,
|
||||
decoder,
|
||||
special_tokens_decoder,
|
||||
regex_tls: (0..MAX_NUM_THREADS).map(|_| regex.clone()).collect(),
|
||||
special_regex_tls: (0..MAX_NUM_THREADS)
|
||||
.map(|_| special_regex.clone())
|
||||
.collect(),
|
||||
sorted_token_bytes,
|
||||
})
|
||||
}
|
||||
|
||||
fn _encode_ordinary_native(&self, text: &str) -> Vec<usize> {
|
||||
// ====================
|
||||
// Encoding
|
||||
// ====================
|
||||
|
||||
pub fn encode_ordinary(&self, text: &str) -> Vec<usize> {
|
||||
// This is the core of the encoding logic; the other functions in here
|
||||
// just make things complicated :-)
|
||||
let regex = self._get_tl_regex();
|
||||
@@ -220,7 +292,7 @@ impl CoreBPE {
|
||||
ret
|
||||
}
|
||||
|
||||
fn _encode_native(&self, text: &str, allowed_special: &HashSet<&str>) -> (Vec<usize>, usize) {
|
||||
pub fn encode(&self, text: &str, allowed_special: HashSet<&str>) -> (Vec<usize>, usize) {
|
||||
let special_regex = self._get_tl_special_regex();
|
||||
let regex = self._get_tl_regex();
|
||||
let mut ret = vec![];
|
||||
@@ -276,51 +348,37 @@ impl CoreBPE {
|
||||
(ret, last_piece_token_len)
|
||||
}
|
||||
|
||||
fn _increase_last_piece_token_len(
|
||||
&self,
|
||||
tokens: Vec<usize>,
|
||||
mut last_piece_token_len: usize,
|
||||
) -> (Vec<usize>, usize) {
|
||||
// Unfortunately, the locations where our regex splits can be unstable.
|
||||
// For the purposes of determining unstable tokens, unstable regex splitting
|
||||
// is only a problem if a split that was present disappears, since this can
|
||||
// lead to merging of tokens otherwise thought to be stable.
|
||||
// cl100k_base makes our life hard by including the \s*[\r\n]+
|
||||
// pattern. This can e.g. cause "\n" + " " to become "\n \n".
|
||||
// Here is a quick and dirty fix:
|
||||
{
|
||||
let token_is_all_space = |token| {
|
||||
self.decoder
|
||||
.get(token)
|
||||
.map(|token_bytes| {
|
||||
token_bytes
|
||||
.iter()
|
||||
.rev()
|
||||
.all(|&b| [b' ', b'\n', b'\t'].contains(&b))
|
||||
})
|
||||
.unwrap_or(false)
|
||||
};
|
||||
if last_piece_token_len > 0
|
||||
&& token_is_all_space(&tokens[tokens.len() - last_piece_token_len])
|
||||
{
|
||||
while (last_piece_token_len < tokens.len())
|
||||
&& token_is_all_space(&tokens[tokens.len() - last_piece_token_len - 1])
|
||||
{
|
||||
last_piece_token_len += 1;
|
||||
fn _encode_bytes(&self, bytes: &[u8]) -> Vec<usize> {
|
||||
match std::str::from_utf8(bytes) {
|
||||
Ok(text) => self.encode_ordinary(text),
|
||||
Err(e) => {
|
||||
let text = unsafe { std::str::from_utf8_unchecked(&bytes[..e.valid_up_to()]) };
|
||||
let (tokens, last_piece_token_len) = self.encode(text, HashSet::new());
|
||||
let (mut tokens, last_piece_token_len) =
|
||||
self._increase_last_piece_token_len(tokens, last_piece_token_len);
|
||||
if !tokens.is_empty() && last_piece_token_len > 0 {
|
||||
// Lop off the tokens from the last piece and run BPE on the remaining bytes
|
||||
// Somewhat niche, but this may not be correct if we'd have had a regex
|
||||
// split between the valid UTF-8 and the invalid bytes, which is why this
|
||||
// method is private
|
||||
let mut unstable_bytes =
|
||||
self.decode_bytes(&tokens[tokens.len() - last_piece_token_len..]);
|
||||
unstable_bytes.extend_from_slice(&bytes[e.valid_up_to()..]);
|
||||
|
||||
tokens.truncate(tokens.len() - last_piece_token_len);
|
||||
tokens.extend(byte_pair_encode(&unstable_bytes, &self.encoder));
|
||||
}
|
||||
tokens
|
||||
}
|
||||
}
|
||||
debug_assert!(last_piece_token_len <= tokens.len());
|
||||
|
||||
(tokens, last_piece_token_len)
|
||||
}
|
||||
|
||||
fn _encode_unstable_native(
|
||||
pub fn encode_with_unstable(
|
||||
&self,
|
||||
text: &str,
|
||||
allowed_special: &HashSet<&str>,
|
||||
allowed_special: HashSet<&str>,
|
||||
) -> (Vec<usize>, HashSet<Vec<usize>>) {
|
||||
let (tokens, last_piece_token_len) = self._encode_native(text, allowed_special);
|
||||
let (tokens, last_piece_token_len) = self.encode(text, allowed_special);
|
||||
if last_piece_token_len == 0 {
|
||||
// If last_piece_token_len is zero, the last token was a special token and we have
|
||||
// no unstable bytes
|
||||
@@ -329,7 +387,7 @@ impl CoreBPE {
|
||||
let (mut tokens, last_piece_token_len) =
|
||||
self._increase_last_piece_token_len(tokens, last_piece_token_len);
|
||||
|
||||
let unstable_bytes = self._decode_native(&tokens[tokens.len() - last_piece_token_len..]);
|
||||
let unstable_bytes = self.decode_bytes(&tokens[tokens.len() - last_piece_token_len..]);
|
||||
tokens.truncate(tokens.len() - last_piece_token_len);
|
||||
|
||||
// TODO: we should try harder to find additional stable tokens
|
||||
@@ -377,7 +435,7 @@ impl CoreBPE {
|
||||
// So convert to UTF-8 and do regex splitting.
|
||||
// E.g. with cl100k_base " !" gets split to " " + " !",
|
||||
// but byte_pair_encode(" !") != byte_pair_encode(" ")
|
||||
Ok(s) => self._encode_ordinary_native(s),
|
||||
Ok(s) => self.encode_ordinary(s),
|
||||
|
||||
// Technically, whether or not this arm is correct depends on whether there
|
||||
// would be a regex split before the UTF-8 truncation point.
|
||||
@@ -430,108 +488,8 @@ impl CoreBPE {
|
||||
|
||||
(tokens, completions)
|
||||
}
|
||||
}
|
||||
|
||||
#[pymethods]
|
||||
impl CoreBPE {
|
||||
#[new]
|
||||
fn new(
|
||||
encoder: HashMap<Vec<u8>, usize>,
|
||||
special_tokens_encoder: HashMap<String, usize>,
|
||||
pattern: &str,
|
||||
) -> PyResult<Self> {
|
||||
let regex = Regex::new(pattern)
|
||||
.map_err(|e| PyErr::new::<exceptions::PyValueError, _>(e.to_string()))?;
|
||||
|
||||
let special_regex = {
|
||||
let _parts = special_tokens_encoder
|
||||
.keys()
|
||||
.map(|s| fancy_regex::escape(s))
|
||||
.collect::<Vec<_>>();
|
||||
Regex::new(&_parts.join("|"))
|
||||
.map_err(|e| PyErr::new::<exceptions::PyValueError, _>(e.to_string()))?
|
||||
};
|
||||
|
||||
let decoder: HashMap<usize, Vec<u8>> =
|
||||
encoder.iter().map(|(k, v)| (*v, k.clone())).collect();
|
||||
|
||||
assert!(encoder.len() == decoder.len());
|
||||
|
||||
let special_tokens_decoder: HashMap<usize, Vec<u8>> = special_tokens_encoder
|
||||
.iter()
|
||||
.map(|(k, v)| (*v, k.as_bytes().to_vec()))
|
||||
.collect();
|
||||
|
||||
// Clone because I don't know how to tell Rust I'm not going to change the map
|
||||
let mut sorted_token_bytes: Vec<Vec<u8>> = encoder.keys().cloned().collect();
|
||||
sorted_token_bytes.sort();
|
||||
|
||||
Ok(CoreBPE {
|
||||
encoder,
|
||||
special_tokens_encoder,
|
||||
decoder,
|
||||
special_tokens_decoder,
|
||||
regex_tls: (0..MAX_NUM_THREADS).map(|_| regex.clone()).collect(),
|
||||
special_regex_tls: (0..MAX_NUM_THREADS)
|
||||
.map(|_| special_regex.clone())
|
||||
.collect(),
|
||||
sorted_token_bytes,
|
||||
})
|
||||
}
|
||||
|
||||
// ====================
|
||||
// Encoding
|
||||
// ====================
|
||||
|
||||
fn encode_ordinary(&self, py: Python, text: &str) -> Vec<usize> {
|
||||
py.allow_threads(|| self._encode_ordinary_native(text))
|
||||
}
|
||||
|
||||
fn encode(&self, py: Python, text: &str, allowed_special: HashSet<&str>) -> Vec<usize> {
|
||||
py.allow_threads(|| self._encode_native(text, &allowed_special).0)
|
||||
}
|
||||
|
||||
fn _encode_bytes(&self, py: Python, bytes: &[u8]) -> Vec<usize> {
|
||||
py.allow_threads(|| {
|
||||
match std::str::from_utf8(bytes) {
|
||||
Ok(text) => self._encode_ordinary_native(text),
|
||||
Err(e) => {
|
||||
let text = unsafe { std::str::from_utf8_unchecked(&bytes[..e.valid_up_to()]) };
|
||||
let (tokens, last_piece_token_len) = self._encode_native(text, &HashSet::new());
|
||||
let (mut tokens, last_piece_token_len) =
|
||||
self._increase_last_piece_token_len(tokens, last_piece_token_len);
|
||||
if !tokens.is_empty() && last_piece_token_len > 0 {
|
||||
// Lop off the tokens from the last piece and run BPE on the remaining bytes
|
||||
// Somewhat niche, but this may not be correct if we'd have had a regex
|
||||
// split between the valid UTF-8 and the invalid bytes, which is why this
|
||||
// method is private
|
||||
let mut unstable_bytes =
|
||||
self._decode_native(&tokens[tokens.len() - last_piece_token_len..]);
|
||||
unstable_bytes.extend_from_slice(&bytes[e.valid_up_to()..]);
|
||||
|
||||
tokens.truncate(tokens.len() - last_piece_token_len);
|
||||
tokens.extend(byte_pair_encode(&unstable_bytes, &self.encoder));
|
||||
}
|
||||
tokens
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
fn encode_with_unstable(
|
||||
&self,
|
||||
py: Python,
|
||||
text: &str,
|
||||
allowed_special: HashSet<&str>,
|
||||
) -> Py<PyTuple> {
|
||||
let (tokens, completions) =
|
||||
py.allow_threads(|| self._encode_unstable_native(text, &allowed_special));
|
||||
let py_completions =
|
||||
PyList::new(py, completions.iter().map(|seq| PyList::new(py, &seq[..])));
|
||||
(tokens, py_completions).into_py(py)
|
||||
}
|
||||
|
||||
fn encode_single_token(&self, piece: &[u8]) -> PyResult<usize> {
|
||||
pub fn encode_single_token(&self, piece: &[u8]) -> anyhow::Result<usize> {
|
||||
if let Some(token) = self.encoder.get(piece).copied() {
|
||||
return Ok(token);
|
||||
}
|
||||
@@ -540,10 +498,10 @@ impl CoreBPE {
|
||||
return Ok(token);
|
||||
}
|
||||
}
|
||||
Err(PyErr::new::<exceptions::PyKeyError, _>(piece.to_owned()))
|
||||
Err(anyhow::anyhow!("Piece {:?} not found", piece))
|
||||
}
|
||||
|
||||
fn encode_single_piece(&self, piece: &[u8]) -> Vec<usize> {
|
||||
pub fn encode_single_piece(&self, piece: &[u8]) -> Vec<usize> {
|
||||
if let Some(token) = self.encoder.get(piece) {
|
||||
return vec![*token];
|
||||
}
|
||||
@@ -554,39 +512,37 @@ impl CoreBPE {
|
||||
// Decoding
|
||||
// ====================
|
||||
|
||||
fn decode_bytes(&self, py: Python, tokens: Vec<usize>) -> Py<PyBytes> {
|
||||
let bytes = py.allow_threads(|| self._decode_native(&tokens));
|
||||
PyBytes::new(py, &bytes).into()
|
||||
pub fn decode_bytes(&self, tokens: &[usize]) -> Vec<u8> {
|
||||
let mut ret = Vec::with_capacity(tokens.len() * 2);
|
||||
for token in tokens {
|
||||
let token_bytes = self
|
||||
.decoder
|
||||
.get(token)
|
||||
.unwrap_or_else(|| &self.special_tokens_decoder[token]);
|
||||
ret.extend(token_bytes);
|
||||
}
|
||||
ret
|
||||
}
|
||||
|
||||
fn decode_single_token_bytes(&self, py: Python, token: usize) -> PyResult<Py<PyBytes>> {
|
||||
pub fn decode_single_token_bytes(&self, token: usize) -> anyhow::Result<&Vec<u8>> {
|
||||
if let Some(bytes) = self.decoder.get(&token) {
|
||||
return Ok(PyBytes::new(py, bytes).into());
|
||||
return Ok(bytes);
|
||||
}
|
||||
if let Some(bytes) = self.special_tokens_decoder.get(&token) {
|
||||
return Ok(PyBytes::new(py, bytes).into());
|
||||
return Ok(bytes);
|
||||
}
|
||||
Err(PyErr::new::<exceptions::PyKeyError, _>(token.to_string()))
|
||||
Err(anyhow::anyhow!("Token {} not found", token))
|
||||
}
|
||||
|
||||
// ====================
|
||||
// Miscellaneous
|
||||
// ====================
|
||||
|
||||
fn token_byte_values(&self, py: Python) -> Vec<Py<PyBytes>> {
|
||||
self.sorted_token_bytes
|
||||
.iter()
|
||||
.map(|x| PyBytes::new(py, x).into())
|
||||
.collect()
|
||||
pub fn token_byte_values(&self) -> &Vec<Vec<u8>> {
|
||||
&self.sorted_token_bytes
|
||||
}
|
||||
}
|
||||
|
||||
#[pymodule]
|
||||
fn _tiktoken(_py: Python, m: &PyModule) -> PyResult<()> {
|
||||
m.add_class::<CoreBPE>()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use rustc_hash::FxHashMap as HashMap;
|
||||
|
tests/test_simple_public.py
@@ -1,42 +0,0 @@
import subprocess
import sys

import tiktoken


def test_simple():
    # Note that there are more actual tests, they're just not currently public :-)
    enc = tiktoken.get_encoding("gpt2")
    assert enc.encode("hello world") == [31373, 995]
    assert enc.decode([31373, 995]) == "hello world"
    assert enc.encode("hello <|endoftext|>", allowed_special="all") == [31373, 220, 50256]

    enc = tiktoken.get_encoding("cl100k_base")
    assert enc.encode("hello world") == [15339, 1917]
    assert enc.decode([15339, 1917]) == "hello world"
    assert enc.encode("hello <|endoftext|>", allowed_special="all") == [15339, 220, 100257]

    for enc_name in tiktoken.list_encoding_names():
        enc = tiktoken.get_encoding(enc_name)
        for token in range(10_000):
            assert enc.encode_single_token(enc.decode_single_token_bytes(token)) == token


def test_encoding_for_model():
    enc = tiktoken.encoding_for_model("gpt2")
    assert enc.name == "gpt2"
    enc = tiktoken.encoding_for_model("text-davinci-003")
    assert enc.name == "p50k_base"
    enc = tiktoken.encoding_for_model("text-davinci-edit-001")
    assert enc.name == "p50k_edit"
    enc = tiktoken.encoding_for_model("gpt-3.5-turbo-0301")
    assert enc.name == "cl100k_base"


def test_optional_blobfile_dependency():
    prog = """
import tiktoken
import sys
assert "blobfile" not in sys.modules
"""
    subprocess.check_call([sys.executable, "-c", prog])
tiktoken/__init__.py
@@ -1,4 +0,0 @@
from .core import Encoding as Encoding
from .model import encoding_for_model as encoding_for_model
from .registry import get_encoding as get_encoding
from .registry import list_encoding_names as list_encoding_names
329  tiktoken/core.py
@@ -1,329 +0,0 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import functools
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from typing import AbstractSet, Collection, Literal, NoReturn, Optional, Union
|
||||
|
||||
import regex
|
||||
|
||||
from tiktoken import _tiktoken
|
||||
|
||||
|
||||
class Encoding:
|
||||
def __init__(
|
||||
self,
|
||||
name: str,
|
||||
*,
|
||||
pat_str: str,
|
||||
mergeable_ranks: dict[bytes, int],
|
||||
special_tokens: dict[str, int],
|
||||
explicit_n_vocab: Optional[int] = None,
|
||||
):
|
||||
"""Creates an Encoding object.
|
||||
|
||||
See openai_public.py for examples of how to construct an Encoding object.
|
||||
|
||||
Args:
|
||||
name: The name of the encoding. It should be clear from the name of the encoding
|
||||
what behaviour to expect, in particular, encodings with different special tokens
|
||||
should have different names.
|
||||
pat_str: A regex pattern string that is used to split the input text.
|
||||
mergeable_ranks: A dictionary mapping mergeable token bytes to their ranks. The ranks
|
||||
must correspond to merge priority.
|
||||
special_tokens: A dictionary mapping special token strings to their token values.
|
||||
explicit_n_vocab: The number of tokens in the vocabulary. If provided, it is checked
|
||||
that the number of mergeable tokens and special tokens is equal to this number.
|
||||
"""
|
||||
self.name = name
|
||||
|
||||
self._pat_str = pat_str
|
||||
self._mergeable_ranks = mergeable_ranks
|
||||
self._special_tokens = special_tokens
|
||||
|
||||
self.max_token_value = max(
|
||||
max(mergeable_ranks.values()), max(special_tokens.values(), default=0)
|
||||
)
|
||||
if explicit_n_vocab:
|
||||
assert len(mergeable_ranks) + len(special_tokens) == explicit_n_vocab
|
||||
assert self.max_token_value == explicit_n_vocab - 1
|
||||
|
||||
self._core_bpe = _tiktoken.CoreBPE(mergeable_ranks, special_tokens, pat_str)
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f"<Encoding {self.name!r}>"
|
||||
|
||||
# ====================
|
||||
# Encoding
|
||||
# ====================
|
||||
|
||||
def encode_ordinary(self, text: str) -> list[int]:
|
||||
"""Encodes a string into tokens, ignoring special tokens.
|
||||
|
||||
This is equivalent to `encode(text, disallowed_special=())` (but slightly faster).
|
||||
|
||||
```
|
||||
>>> enc.encode_ordinary("hello world")
|
||||
[31373, 995]
|
||||
"""
|
||||
return self._core_bpe.encode_ordinary(text)
|
||||
|
||||
def encode(
|
||||
self,
|
||||
text: str,
|
||||
*,
|
||||
allowed_special: Union[Literal["all"], AbstractSet[str]] = set(), # noqa: B006
|
||||
disallowed_special: Union[Literal["all"], Collection[str]] = "all",
|
||||
) -> list[int]:
|
||||
"""Encodes a string into tokens.
|
||||
|
||||
Special tokens are artificial tokens used to unlock capabilities from a model,
|
||||
such as fill-in-the-middle. So we want to be careful about accidentally encoding special
|
||||
tokens, since they can be used to trick a model into doing something we don't want it to do.
|
||||
|
||||
Hence, by default, encode will raise an error if it encounters text that corresponds
|
||||
to a special token. This can be controlled on a per-token level using the `allowed_special`
|
||||
and `disallowed_special` parameters. In particular:
|
||||
- Setting `disallowed_special` to () will prevent this function from raising errors and
|
||||
cause all text corresponding to special tokens to be encoded as natural text.
|
||||
- Setting `allowed_special` to "all" will cause this function to treat all text
|
||||
corresponding to special tokens to be encoded as special tokens.
|
||||
|
||||
```
|
||||
>>> enc.encode("hello world")
|
||||
[31373, 995]
|
||||
>>> enc.encode("<|endoftext|>", allowed_special={"<|endoftext|>"})
|
||||
[50256]
|
||||
>>> enc.encode("<|endoftext|>", allowed_special="all")
|
||||
[50256]
|
||||
>>> enc.encode("<|endoftext|>")
|
||||
# Raises ValueError
|
||||
>>> enc.encode("<|endoftext|>", disallowed_special=())
|
||||
[27, 91, 437, 1659, 5239, 91, 29]
|
||||
```
|
||||
"""
|
||||
if allowed_special == "all":
|
||||
allowed_special = self.special_tokens_set
|
||||
if disallowed_special == "all":
|
||||
disallowed_special = self.special_tokens_set - allowed_special
|
||||
if disallowed_special:
|
||||
if not isinstance(disallowed_special, frozenset):
|
||||
disallowed_special = frozenset(disallowed_special)
|
||||
if match := _special_token_regex(disallowed_special).search(text):
|
||||
raise_disallowed_special_token(match.group())
|
||||
|
||||
return self._core_bpe.encode(text, allowed_special)
|
||||
|
||||
def encode_ordinary_batch(self, text: list[str], *, num_threads: int = 8) -> list[list[int]]:
|
||||
"""Encodes a list of strings into tokens, in parallel, ignoring special tokens.
|
||||
|
||||
This is equivalent to `encode_batch(text, disallowed_special=())` (but slightly faster).
|
||||
|
||||
```
|
||||
>>> enc.encode_ordinary_batch(["hello world", "goodbye world"])
|
||||
[[31373, 995], [11274, 16390, 995]]
|
||||
```
|
||||
"""
|
||||
encoder = functools.partial(self.encode_ordinary)
|
||||
with ThreadPoolExecutor(num_threads) as e:
|
||||
return list(e.map(encoder, text))
|
||||
|
||||
def encode_batch(
|
||||
self,
|
||||
text: list[str],
|
||||
*,
|
||||
num_threads: int = 8,
|
||||
allowed_special: Union[Literal["all"], AbstractSet[str]] = set(), # noqa: B006
|
||||
disallowed_special: Union[Literal["all"], Collection[str]] = "all",
|
||||
) -> list[list[int]]:
|
||||
"""Encodes a list of strings into tokens, in parallel.
|
||||
|
||||
See `encode` for more details on `allowed_special` and `disallowed_special`.
|
||||
|
||||
```
|
||||
>>> enc.encode_batch(["hello world", "goodbye world"])
|
||||
[[31373, 995], [11274, 16390, 995]]
|
||||
```
|
||||
"""
|
||||
if allowed_special == "all":
|
||||
allowed_special = self.special_tokens_set
|
||||
if disallowed_special == "all":
|
||||
disallowed_special = self.special_tokens_set - allowed_special
|
||||
if not isinstance(disallowed_special, frozenset):
|
||||
disallowed_special = frozenset(disallowed_special)
|
||||
|
||||
encoder = functools.partial(
|
||||
self.encode, allowed_special=allowed_special, disallowed_special=disallowed_special
|
||||
)
|
||||
with ThreadPoolExecutor(num_threads) as e:
|
||||
return list(e.map(encoder, text))
|
||||
|
||||
def encode_with_unstable(
|
||||
self,
|
||||
text: str,
|
||||
*,
|
||||
allowed_special: Union[Literal["all"], AbstractSet[str]] = set(), # noqa: B006
|
||||
disallowed_special: Union[Literal["all"], Collection[str]] = "all",
|
||||
) -> tuple[list[int], list[list[int]]]:
|
||||
"""Encodes a string into stable tokens and possible completion sequences.
|
||||
|
||||
Note that the stable tokens will only represent a substring of `text`.
|
||||
|
||||
See `encode` for more details on `allowed_special` and `disallowed_special`.
|
||||
|
||||
This API should itself be considered unstable.
|
||||
|
||||
```
|
||||
>>> enc.encode_with_unstable("hello fanta")
|
||||
([31373], [(277, 4910), (5113, 265), ..., (8842,)])
|
||||
|
||||
>>> text = "..."
|
||||
>>> stable_tokens, completions = enc.encode_with_unstable(text)
|
||||
>>> assert text.encode().startswith(enc.decode_bytes(stable_tokens))
|
||||
>>> assert all(enc.decode_bytes(stable_tokens + seq).startswith(text.encode()) for seq in completions)
|
||||
```
|
||||
"""
|
||||
if allowed_special == "all":
|
||||
allowed_special = self.special_tokens_set
|
||||
if disallowed_special == "all":
|
||||
disallowed_special = self.special_tokens_set - allowed_special
|
||||
if disallowed_special:
|
||||
if not isinstance(disallowed_special, frozenset):
|
||||
disallowed_special = frozenset(disallowed_special)
|
||||
if match := _special_token_regex(disallowed_special).search(text):
|
||||
raise_disallowed_special_token(match.group())
|
||||
|
||||
return self._core_bpe.encode_with_unstable(text, allowed_special)
|
||||
|
||||
def encode_single_token(self, text_or_bytes: Union[str, bytes]) -> int:
|
||||
"""Encodes text corresponding to a single token to its token value.
|
||||
|
||||
NOTE: this will encode all special tokens.
|
||||
|
||||
Raises `KeyError` if the token is not in the vocabulary.
|
||||
|
||||
```
|
||||
>>> enc.encode_single_token("hello")
|
||||
31373
|
||||
```
|
||||
"""
|
||||
if isinstance(text_or_bytes, str):
|
||||
text_or_bytes = text_or_bytes.encode("utf-8")
|
||||
return self._core_bpe.encode_single_token(text_or_bytes)
|
||||
|
||||
# ====================
|
||||
# Decoding
|
||||
# ====================
|
||||
|
||||
def decode_bytes(self, tokens: list[int]) -> bytes:
|
||||
"""Decodes a list of tokens into bytes.
|
||||
|
||||
```
|
||||
>>> enc.decode_bytes([31373, 995])
|
||||
b'hello world'
|
||||
```
|
||||
"""
|
||||
return self._core_bpe.decode_bytes(tokens)
|
||||
|
||||
def decode(self, tokens: list[int], errors: str = "replace") -> str:
|
||||
"""Decodes a list of tokens into a string.
|
||||
|
||||
WARNING: the default behaviour of this function is lossy, since decoded bytes are not
|
||||
guaranteed to be valid UTF-8. You can control this behaviour using the `errors` parameter,
|
||||
for instance, setting `errors=strict`.

        ```
        >>> enc.decode([31373, 995])
        'hello world'
        ```
        """
        return self._core_bpe.decode_bytes(tokens).decode("utf-8", errors=errors)

    def decode_single_token_bytes(self, token: int) -> bytes:
        """Decodes a token into bytes.

        NOTE: this will decode all special tokens.

        Raises `KeyError` if the token is not in the vocabulary.

        ```
        >>> enc.decode_single_token_bytes(31373)
        b'hello'
        ```
        """
        return self._core_bpe.decode_single_token_bytes(token)

    def decode_tokens_bytes(self, tokens: list[int]) -> list[bytes]:
        """Decodes a list of tokens into a list of bytes.

        Useful for visualising tokenisation.
        >>> enc.decode_tokens_bytes([31373, 995])
        [b'hello', b' world']
        """
        return [self.decode_single_token_bytes(token) for token in tokens]

    # ====================
    # Miscellaneous
    # ====================

    def token_byte_values(self) -> list[bytes]:
        """Returns the list of all token byte values."""
        return self._core_bpe.token_byte_values()

    @property
    def eot_token(self) -> int:
        return self._special_tokens["<|endoftext|>"]

    @functools.cached_property
    def special_tokens_set(self) -> set[str]:
        return set(self._special_tokens.keys())

    @property
    def n_vocab(self) -> int:
        """For backwards compatibility. Prefer to use `enc.max_token_value + 1`."""
        return self.max_token_value + 1

    # ====================
    # Private
    # ====================

    def _encode_single_piece(self, text_or_bytes: Union[str, bytes]) -> list[int]:
        """Encodes text corresponding to bytes without a regex split.

        NOTE: this will not encode any special tokens.

        ```
        >>> enc.encode_single_piece("helloqqqq")
        [31373, 38227, 38227]
        ```
        """
        if isinstance(text_or_bytes, str):
            text_or_bytes = text_or_bytes.encode("utf-8")
        return self._core_bpe.encode_single_piece(text_or_bytes)

    def _encode_only_native_bpe(self, text: str) -> list[int]:
"""Encodes a string into tokens, but do regex splitting in Python."""
        _unused_pat = regex.compile(self._pat_str)
        ret = []
        for piece in regex.findall(_unused_pat, text):
            ret.extend(self._core_bpe.encode_single_piece(piece))
        return ret

    def _encode_bytes(self, text: bytes) -> list[int]:
        return self._core_bpe._encode_bytes(text)


@functools.lru_cache(maxsize=128)
def _special_token_regex(tokens: frozenset[str]) -> "regex.Pattern[str]":
    inner = "|".join(regex.escape(token) for token in tokens)
    return regex.compile(f"({inner})")


def raise_disallowed_special_token(token: str) -> NoReturn:
    raise ValueError(
        f"Encountered text corresponding to disallowed special token {token!r}.\n"
        "If you want this text to be encoded as a special token, "
        f"pass it to `allowed_special`, e.g. `allowed_special={{{token!r}, ...}}`.\n"
        f"If you want this text to be encoded as normal text, disable the check for this token "
        f"by passing `disallowed_special=(enc.special_tokens_set - {{{token!r}}})`.\n"
        "To disable this check for all special tokens, pass `disallowed_special=()`.\n"
    )
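For reference, a minimal usage sketch of the encode/decode helpers and the special-token checks above, assuming tiktoken is installed and `enc = tiktoken.get_encoding("gpt2")` (the encoding whose token ids appear in the docstrings); illustrative only:

```
import tiktoken

enc = tiktoken.get_encoding("gpt2")

# Round trip through the public helpers shown above.
tokens = enc.encode("hello world")  # [31373, 995], per the docstring examples
assert enc.decode(tokens) == "hello world"
assert enc.decode_bytes(tokens) == b"hello world"
assert enc.decode_tokens_bytes(tokens) == [b"hello", b" world"]

# Special tokens are disallowed by default; this raises the ValueError built by
# raise_disallowed_special_token.
try:
    enc.encode("<|endoftext|>")
except ValueError:
    pass

# Opting in: encode it as a special token, or treat it as plain text.
assert enc.encode("<|endoftext|>", allowed_special={"<|endoftext|>"}) == [enc.eot_token]
plain = enc.encode("<|endoftext|>", disallowed_special=())
```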
118
tiktoken/load.py
118
tiktoken/load.py
@ -1,118 +0,0 @@
from __future__ import annotations

import base64
import hashlib
import json
import os
import tempfile
import uuid

import requests


def read_file(blobpath: str) -> bytes:
    if not blobpath.startswith("http://") and not blobpath.startswith("https://"):
        try:
            import blobfile
        except ImportError:
            raise ImportError(
                "blobfile is not installed. Please install it by running `pip install blobfile`."
            )
        with blobfile.BlobFile(blobpath, "rb") as f:
            return f.read()
    # avoiding blobfile for public files helps avoid auth issues, like MFA prompts
    return requests.get(blobpath).content


def read_file_cached(blobpath: str) -> bytes:
    if "TIKTOKEN_CACHE_DIR" in os.environ:
        cache_dir = os.environ["TIKTOKEN_CACHE_DIR"]
    elif "DATA_GYM_CACHE_DIR" in os.environ:
        cache_dir = os.environ["DATA_GYM_CACHE_DIR"]
    else:
        cache_dir = os.path.join(tempfile.gettempdir(), "data-gym-cache")

    if cache_dir == "":
        # disable caching
        return read_file(blobpath)

    cache_key = hashlib.sha1(blobpath.encode()).hexdigest()

    cache_path = os.path.join(cache_dir, cache_key)
    if os.path.exists(cache_path):
        with open(cache_path, "rb") as f:
            return f.read()

    contents = read_file(blobpath)

    os.makedirs(cache_dir, exist_ok=True)
    tmp_filename = cache_path + "." + str(uuid.uuid4()) + ".tmp"
    with open(tmp_filename, "wb") as f:
        f.write(contents)
    os.rename(tmp_filename, cache_path)

    return contents
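The cache location used by `read_file_cached` is controlled from the environment; a small sketch (the directory path is just an example):

```
import os

# Cache downloaded encoding files in a directory of your choosing;
# set this before the encoding files are first fetched.
os.environ["TIKTOKEN_CACHE_DIR"] = "/tmp/tiktoken-cache"  # example path

# Setting it to the empty string disables caching entirely,
# per the `if cache_dir == "":` branch above.
# os.environ["TIKTOKEN_CACHE_DIR"] = ""
```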


def data_gym_to_mergeable_bpe_ranks(
    vocab_bpe_file: str, encoder_json_file: str
) -> dict[bytes, int]:
    # NB: do not add caching to this function
    rank_to_intbyte = [b for b in range(2**8) if chr(b).isprintable() and chr(b) != " "]

    data_gym_byte_to_byte = {chr(b): b for b in rank_to_intbyte}
    n = 0
    for b in range(2**8):
        if b not in rank_to_intbyte:
            rank_to_intbyte.append(b)
            data_gym_byte_to_byte[chr(2**8 + n)] = b
            n += 1
    assert len(rank_to_intbyte) == 2**8

    # vocab_bpe contains the merges along with associated ranks
    vocab_bpe_contents = read_file_cached(vocab_bpe_file).decode()
    bpe_merges = [tuple(merge_str.split()) for merge_str in vocab_bpe_contents.split("\n")[1:-1]]

    def decode_data_gym(value: str) -> bytes:
        return bytes(data_gym_byte_to_byte[b] for b in value)

    # add the single byte tokens
    bpe_ranks = {bytes([b]): i for i, b in enumerate(rank_to_intbyte)}
    # add the merged tokens
    n = len(bpe_ranks)
    for first, second in bpe_merges:
        bpe_ranks[decode_data_gym(first) + decode_data_gym(second)] = n
        n += 1

    # check that the encoder file matches the merges file
    # this sanity check is important since tiktoken assumes that ranks are ordered the same
    # as merge priority
    encoder_json = json.loads(read_file_cached(encoder_json_file))
    encoder_json_loaded = {decode_data_gym(k): v for k, v in encoder_json.items()}
    # drop these two special tokens if present, since they're not mergeable bpe tokens
    encoder_json_loaded.pop(b"<|endoftext|>", None)
    encoder_json_loaded.pop(b"<|startoftext|>", None)
    assert bpe_ranks == encoder_json_loaded

    return bpe_ranks


def dump_tiktoken_bpe(bpe_ranks: dict[bytes, int], tiktoken_bpe_file: str) -> None:
    try:
        import blobfile
    except ImportError:
        raise ImportError(
            "blobfile is not installed. Please install it by running `pip install blobfile`."
        )
    with blobfile.BlobFile(tiktoken_bpe_file, "wb") as f:
        for token, rank in sorted(bpe_ranks.items(), key=lambda x: x[1]):
            f.write(base64.b64encode(token) + b" " + str(rank).encode() + b"\n")


def load_tiktoken_bpe(tiktoken_bpe_file: str) -> dict[bytes, int]:
    # NB: do not add caching to this function
    contents = read_file_cached(tiktoken_bpe_file)
    return {
        base64.b64decode(token): int(rank)
        for token, rank in (line.split() for line in contents.splitlines() if line)
    }
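As `dump_tiktoken_bpe` / `load_tiktoken_bpe` imply, a `.tiktoken` file is simply one `base64(token) rank` pair per line. A self-contained sketch of that round trip using plain local file I/O (so it runs without `blobfile`; the file name is illustrative):

```
import base64

ranks = {b"h": 0, b"e": 1, b"he": 2}  # toy mergeable ranks

# Write in the same format dump_tiktoken_bpe produces.
with open("toy.tiktoken", "wb") as f:
    for token, rank in sorted(ranks.items(), key=lambda x: x[1]):
        f.write(base64.b64encode(token) + b" " + str(rank).encode() + b"\n")

# Parse it back the way load_tiktoken_bpe does.
with open("toy.tiktoken", "rb") as f:
    contents = f.read()
loaded = {
    base64.b64decode(token): int(rank)
    for token, rank in (line.split() for line in contents.splitlines() if line)
}
assert loaded == ranks
```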
75
tiktoken/model.py
75
tiktoken/model.py
@ -1,75 +0,0 @@
from __future__ import annotations

from .core import Encoding
from .registry import get_encoding

# TODO: these will likely be replaced by an API endpoint
MODEL_PREFIX_TO_ENCODING: dict[str, str] = {
    # chat
    "gpt-4-": "cl100k_base",  # e.g., gpt-4-0314, etc., plus gpt-4-32k
"gpt-3.5-turbo-": "cl100k_base", # e.g, gpt-3.5-turbo-0301, -0401, etc.
}

MODEL_TO_ENCODING: dict[str, str] = {
    # chat
    "gpt-4": "cl100k_base",
    "gpt-3.5-turbo": "cl100k_base",
    # text
    "text-davinci-003": "p50k_base",
    "text-davinci-002": "p50k_base",
    "text-davinci-001": "r50k_base",
    "text-curie-001": "r50k_base",
    "text-babbage-001": "r50k_base",
    "text-ada-001": "r50k_base",
    "davinci": "r50k_base",
    "curie": "r50k_base",
    "babbage": "r50k_base",
    "ada": "r50k_base",
    # code
    "code-davinci-002": "p50k_base",
    "code-davinci-001": "p50k_base",
    "code-cushman-002": "p50k_base",
    "code-cushman-001": "p50k_base",
    "davinci-codex": "p50k_base",
    "cushman-codex": "p50k_base",
    # edit
    "text-davinci-edit-001": "p50k_edit",
    "code-davinci-edit-001": "p50k_edit",
    # embeddings
    "text-embedding-ada-002": "cl100k_base",
    # old embeddings
    "text-similarity-davinci-001": "r50k_base",
    "text-similarity-curie-001": "r50k_base",
    "text-similarity-babbage-001": "r50k_base",
    "text-similarity-ada-001": "r50k_base",
    "text-search-davinci-doc-001": "r50k_base",
    "text-search-curie-doc-001": "r50k_base",
    "text-search-babbage-doc-001": "r50k_base",
    "text-search-ada-doc-001": "r50k_base",
    "code-search-babbage-code-001": "r50k_base",
    "code-search-ada-code-001": "r50k_base",
    # open source
    "gpt2": "gpt2",
}


def encoding_for_model(model_name: str) -> Encoding:
    """Returns the encoding used by a model."""
    encoding_name = None
    if model_name in MODEL_TO_ENCODING:
        encoding_name = MODEL_TO_ENCODING[model_name]
    else:
        # Check if the model matches a known prefix
        # Prefix matching avoids needing library updates for every model version release
        # Note that this can match on non-existent models (e.g., gpt-3.5-turbo-FAKE)
        for model_prefix, model_encoding_name in MODEL_PREFIX_TO_ENCODING.items():
            if model_name.startswith(model_prefix):
                return get_encoding(model_encoding_name)

    if encoding_name is None:
        raise KeyError(
            f"Could not automatically map {model_name} to a tokeniser. "
"Please use `tiktok.get_encoding` to explicitly get the tokeniser you expect."
        ) from None

    return get_encoding(encoding_name)
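A brief usage sketch of `encoding_for_model`, including the prefix-matching path (model names here are examples):

```
import tiktoken

enc = tiktoken.encoding_for_model("gpt-3.5-turbo")  # exact match in MODEL_TO_ENCODING
assert enc.name == "cl100k_base"

# Versioned names fall through to MODEL_PREFIX_TO_ENCODING (the "gpt-4-" prefix).
assert tiktoken.encoding_for_model("gpt-4-0314").name == "cl100k_base"

# Unknown models raise KeyError with a pointer to tiktoken.get_encoding.
try:
    tiktoken.encoding_for_model("my-unknown-model")
except KeyError:
    pass
```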
73
tiktoken/registry.py
73
tiktoken/registry.py
@ -1,73 +0,0 @@
from __future__ import annotations

import importlib
import pkgutil
import threading
from typing import Any, Callable, Optional

import tiktoken_ext

from tiktoken.core import Encoding

_lock = threading.RLock()
ENCODINGS: dict[str, Encoding] = {}
ENCODING_CONSTRUCTORS: Optional[dict[str, Callable[[], dict[str, Any]]]] = None


def _find_constructors() -> None:
    global ENCODING_CONSTRUCTORS
    with _lock:
        if ENCODING_CONSTRUCTORS is not None:
            return
        ENCODING_CONSTRUCTORS = {}

        # tiktoken_ext is a namespace package
        # submodules inside tiktoken_ext will be inspected for ENCODING_CONSTRUCTORS attributes
        # - we use namespace package pattern so `pkgutil.iter_modules` is fast
        # - it's a separate top-level package because namespace subpackages of non-namespace
        #   packages don't quite do what you want with editable installs
        plugin_mods = pkgutil.iter_modules(tiktoken_ext.__path__, tiktoken_ext.__name__ + ".")

        for _, mod_name, _ in plugin_mods:
            mod = importlib.import_module(mod_name)
            try:
                constructors = mod.ENCODING_CONSTRUCTORS
            except AttributeError as e:
                raise ValueError(
                    f"tiktoken plugin {mod_name} does not define ENCODING_CONSTRUCTORS"
                ) from e
            for enc_name, constructor in constructors.items():
                if enc_name in ENCODING_CONSTRUCTORS:
                    raise ValueError(
                        f"Duplicate encoding name {enc_name} in tiktoken plugin {mod_name}"
                    )
                ENCODING_CONSTRUCTORS[enc_name] = constructor


def get_encoding(encoding_name: str) -> Encoding:
    if encoding_name in ENCODINGS:
        return ENCODINGS[encoding_name]

    with _lock:
        if encoding_name in ENCODINGS:
            return ENCODINGS[encoding_name]

        if ENCODING_CONSTRUCTORS is None:
            _find_constructors()
            assert ENCODING_CONSTRUCTORS is not None

        if encoding_name not in ENCODING_CONSTRUCTORS:
            raise ValueError(f"Unknown encoding {encoding_name}")

        constructor = ENCODING_CONSTRUCTORS[encoding_name]
        enc = Encoding(**constructor())
        ENCODINGS[encoding_name] = enc
        return enc


def list_encoding_names() -> list[str]:
    with _lock:
        if ENCODING_CONSTRUCTORS is None:
            _find_constructors()
            assert ENCODING_CONSTRUCTORS is not None
        return list(ENCODING_CONSTRUCTORS)
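As the comments in `_find_constructors` describe, new encodings are contributed by placing a submodule inside the `tiktoken_ext` namespace package that exposes an `ENCODING_CONSTRUCTORS` dict. A hypothetical plugin module might look like this (the module name, encoding name, and URL are made up for illustration):

```
# tiktoken_ext/my_encodings.py  (hypothetical plugin module)
from tiktoken.load import load_tiktoken_bpe


def my_encoding():
    return {
        "name": "my_encoding",
        "pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
        "mergeable_ranks": load_tiktoken_bpe("https://example.com/my_encoding.tiktoken"),
        "special_tokens": {"<|endoftext|>": 50256},
    }


# Discovered by tiktoken.registry._find_constructors via pkgutil.iter_modules.
ENCODING_CONSTRUCTORS = {"my_encoding": my_encoding}
```

Once such a module is installed, `tiktoken.get_encoding("my_encoding")` would construct the encoding on first use and cache it in `ENCODINGS`.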
88
tiktoken_ext/openai_public.py
88
tiktoken_ext/openai_public.py
@ -1,88 +0,0 @@
from tiktoken.load import data_gym_to_mergeable_bpe_ranks, load_tiktoken_bpe

ENDOFTEXT = "<|endoftext|>"
FIM_PREFIX = "<|fim_prefix|>"
FIM_MIDDLE = "<|fim_middle|>"
FIM_SUFFIX = "<|fim_suffix|>"
ENDOFPROMPT = "<|endofprompt|>"


def gpt2():
    mergeable_ranks = data_gym_to_mergeable_bpe_ranks(
        vocab_bpe_file="https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/vocab.bpe",
        encoder_json_file="https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/encoder.json",
    )
    return {
        "name": "gpt2",
        "explicit_n_vocab": 50257,
        "pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
        "mergeable_ranks": mergeable_ranks,
        "special_tokens": {"<|endoftext|>": 50256},
    }


def r50k_base():
    mergeable_ranks = load_tiktoken_bpe(
        "https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken"
    )
    return {
        "name": "r50k_base",
        "explicit_n_vocab": 50257,
        "pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
        "mergeable_ranks": mergeable_ranks,
        "special_tokens": {ENDOFTEXT: 50256},
    }


def p50k_base():
    mergeable_ranks = load_tiktoken_bpe(
        "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken"
    )
    return {
        "name": "p50k_base",
        "explicit_n_vocab": 50281,
        "pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
        "mergeable_ranks": mergeable_ranks,
        "special_tokens": {ENDOFTEXT: 50256},
    }


def p50k_edit():
    mergeable_ranks = load_tiktoken_bpe(
        "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken"
    )
    special_tokens = {ENDOFTEXT: 50256, FIM_PREFIX: 50281, FIM_MIDDLE: 50282, FIM_SUFFIX: 50283}
    return {
        "name": "p50k_edit",
        "pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
        "mergeable_ranks": mergeable_ranks,
        "special_tokens": special_tokens,
    }


def cl100k_base():
    mergeable_ranks = load_tiktoken_bpe(
        "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"
    )
    special_tokens = {
        ENDOFTEXT: 100257,
        FIM_PREFIX: 100258,
        FIM_MIDDLE: 100259,
        FIM_SUFFIX: 100260,
        ENDOFPROMPT: 100276,
    }
    return {
        "name": "cl100k_base",
        "pat_str": r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""",
        "mergeable_ranks": mergeable_ranks,
        "special_tokens": special_tokens,
    }


ENCODING_CONSTRUCTORS = {
    "gpt2": gpt2,
    "r50k_base": r50k_base,
    "p50k_base": p50k_base,
    "p50k_edit": p50k_edit,
    "cl100k_base": cl100k_base,
}
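Each constructor above only returns keyword arguments; `tiktoken.registry.get_encoding` turns them into an `Encoding`. The `pat_str` is the pre-tokenisation regex applied before BPE merging; a quick sketch of the r50k/gpt2 pattern in isolation (using the `regex` module, which supports the `\p{...}` classes):

```
import regex

pat_str = r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
pieces = regex.findall(pat_str, "hello world!!")
print(pieces)  # ['hello', ' world', '!!']; each piece is then BPE-merged against mergeable_ranks
```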