Bump version, sync codebase

parent 156eff92d2
commit 7830ed537b
CHANGELOG.md
@@ -2,6 +2,10 @@
 This is the changelog for the open source version of tiktoken.
 
+## [v0.2.0]
+- Add ``tiktoken.encoding_for_model`` to get the encoding for a specific model
+- Improve portability of caching logic
+
 ## [v0.1.2]
 - Avoid use of `blobfile` for public files
 - Add support for Python 3.8

Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tiktoken"
-version = "0.1.0"
+version = "0.2.0"
 edition = "2021"
 rust-version = "1.57.0"

Makefile (deleted)
@@ -1,49 +0,0 @@
-PROJECT := tiktoken
-
-.PHONY: default
-default: editable_install
-
-.PHONY: install_rust
-install_rust:
-	which cargo >/dev/null || curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain 1.62
-
-.PHONY: clean
-clean:
-	cargo clean
-	pip uninstall -y $(PROJECT)
-	find . | grep -E '__pycache__|\.pyc' | xargs rm -rf
-	find . | grep -E '\.so' | xargs rm -rf
-	rm -rf dist/ build/
-	rm -rf $(PROJECT).egg-info/
-
-.PHONY: format
-format:
-	@ which black >/dev/null || python3 -m pip install black
-	@ which isort >/dev/null || python3 -m pip install isort
-	cargo fmt -- --config group_imports=StdExternalCrate
-	black --line-length 100 --skip-magic-trailing-comma --quiet .
-	isort --line-length 100 --profile black --quiet .
-
-
-.PHONY: format_check
-format_check:
-	@ which black >/dev/null || python3 -m pip install black
-	@ which isort >/dev/null || python3 -m pip install isort
-	cargo fmt --check -- --config group_imports=StdExternalCrate
-	black --check --line-length 100 --skip-magic-trailing-comma --quiet .
-	isort --check --line-length 100 --profile black --quiet .
-
-.PHONY: lint
-lint:
-	cargo clippy --all -- -D warnings
-	@ which flake8 >/dev/null || python3 -m pip install flake8==5 flake8-bugbear==22.9.11
-	flake8 --ignore=E203,E501,W503,E731 --per-file-ignores="$(PROJECT)/__init__.py:F401 setup.py:E402" --exclude=build .
-
-.PHONY: editable_install
-editable_install:
-	@ if [ -f $(PROJECT).egg-info ]; then \
-		pip install --disable-pip-version-check --progress-bar=off setuptools wheel setuptools-rust ; \
-		pip install --disable-pip-version-check --no-build-isolation -e . ; \
-	else \
-		pip install --disable-pip-version-check --no-deps --no-build-isolation --ignore-installed -e . ; \
-	fi

README.md
@@ -7,6 +7,9 @@ OpenAI's models.
 import tiktoken
 enc = tiktoken.get_encoding("gpt2")
 assert enc.decode(enc.encode("hello world")) == "hello world"
+
+# To get the tokeniser corresponding to a specific model in the OpenAI API:
+enc = tiktoken.encoding_for_model("text-davinci-003")
 ```
 
 The open source version of `tiktoken` can be installed from PyPI:
@@ -16,7 +19,9 @@ pip install tiktoken
 
 The tokeniser API is documented in `tiktoken/core.py`.
 
-Example code using `tiktoken` can be found in the [OpenAI Cookbook](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb).
+Example code using `tiktoken` can be found in the
+[OpenAI Cookbook](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb).
 
 
 ## Performance
@@ -28,3 +33,72 @@ Performance measured on 1GB of text using the GPT-2 tokeniser, using `GPT2Tokeni
 `tokenizers==0.13.2` and `transformers==4.24.0`.
 
 
+## Getting help
+
+Please post questions in the [issue tracker](https://github.com/openai/tiktoken/issues).
+
+If you work at OpenAI, make sure to check the internal documentation or feel free to contact
+@shantanu.
+
+
+## Extending tiktoken
+
+You may wish to extend `tiktoken` to support new encodings. There are two ways to do this.
+
+
+**Create your `Encoding` object exactly the way you want and simply pass it around.**
+
+```python
+cl100k_base = tiktoken.get_encoding("cl100k_base")
+
+# In production, load the arguments directly instead of accessing private attributes
+# See openai_public.py for examples of arguments for specific encodings
+enc = tiktoken.Encoding(
+    # If you're changing the set of special tokens, make sure to use a different name
+    # It should be clear from the name what behaviour to expect.
+    name="cl100k_im",
+    pat_str=cl100k_base._pat_str,
+    mergeable_ranks=cl100k_base._mergeable_ranks,
+    special_tokens={
+        **cl100k_base._special_tokens,
+        "<|im_start|>": 100264,
+        "<|im_end|>": 100265,
+    }
+)
+```
+
+**Use the `tiktoken_ext` plugin mechanism to register your `Encoding` objects with `tiktoken`.**
+
+This is only useful if you need `tiktoken.get_encoding` to find your encoding, otherwise prefer
+option 1.
+
+To do this, you'll need to create a namespace package under `tiktoken_ext`.
+
+Layout your project like this, making sure to omit the `tiktoken_ext/__init__.py` file:
+```
+my_tiktoken_extension
+├── tiktoken_ext
+│   └── my_encodings.py
+└── setup.py
+```
+
+`my_encodings.py` should be a module that contains a variable named `ENCODING_CONSTRUCTORS`.
+This is a dictionary from an encoding name to a function that takes no arguments and returns
+arguments that can be passed to `tiktoken.Encoding` to construct that encoding. For an example, see
+`tiktoken_ext/openai_public.py`. For precise details, see `tiktoken/registry.py`.
+
+Your `setup.py` should look something like this:
+```python
+from setuptools import setup, find_namespace_packages
+
+setup(
+    name="my_tiktoken_extension",
+    packages=find_namespace_packages(include=['tiktoken_ext*']),
+    install_requires=["tiktoken"],
+    ...
+)
+```
+
+Then simply `pip install my_tiktoken_extension` and you should be able to use your custom encodings!
+Make sure **not** to use an editable install.
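For concreteness, the `ENCODING_CONSTRUCTORS` contract described in the README section above can be sketched as a hypothetical `my_encodings.py`, modelled on the shape of `tiktoken_ext/openai_public.py`. The encoding name, BPE file URL, pattern string, and special-token choice below are illustrative placeholders, not part of this commit:

```python
# my_encodings.py -- hypothetical plugin module for tiktoken_ext.
from tiktoken.load import load_tiktoken_bpe


def my_encoding():
    # Constructor functions return the *arguments* for tiktoken.Encoding,
    # not an Encoding instance; the registry builds the Encoding on demand.
    mergeable_ranks = load_tiktoken_bpe("https://example.com/my_encoding.tiktoken")  # placeholder URL
    return {
        "name": "my_encoding",
        "pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
        "mergeable_ranks": mergeable_ranks,
        "special_tokens": {"<|endoftext|>": len(mergeable_ranks)},
    }


# tiktoken.registry scans tiktoken_ext submodules for this variable.
ENCODING_CONSTRUCTORS = {"my_encoding": my_encoding}
```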

pyproject.toml
@@ -1,12 +1,12 @@
 [project]
 name = "tiktoken"
+version = "0.2.0"
 dependencies = ["blobfile>=2", "regex>=2022.1.18", "requests>=2.26.0"]
-dynamic = ["version"]
 requires-python = ">=3.8"
 
 [build-system]
 build-backend = "setuptools.build_meta"
-requires = ["setuptools>=61", "wheel", "setuptools-rust>=1.3"]
+requires = ["setuptools>=62.4", "wheel", "setuptools-rust>=1.5.2"]
 
 [tool.cibuildwheel]
 build-frontend = "build"

setup.py
@@ -1,14 +1,8 @@
 from setuptools import setup
 from setuptools_rust import Binding, RustExtension
 
-public = True
-
-if public:
-    version = "0.1.2"
-
 setup(
     name="tiktoken",
-    version=version,
     rust_extensions=[
         RustExtension(
             "tiktoken._tiktoken",

tests/test_simple_public.py
@@ -17,3 +17,10 @@ def test_simple():
         enc = tiktoken.get_encoding(enc_name)
         for token in range(10_000):
             assert enc.encode_single_token(enc.decode_single_token_bytes(token)) == token
+
+
+def test_encoding_for_model():
+    enc = tiktoken.encoding_for_model("gpt2")
+    assert enc.name == "gpt2"
+    enc = tiktoken.encoding_for_model("text-davinci-003")
+    assert enc.name == "p50k_base"

tiktoken/__init__.py
@@ -1,3 +1,4 @@
 from .core import Encoding as Encoding
+from .model import encoding_for_model as encoding_for_model
 from .registry import get_encoding as get_encoding
 from .registry import list_encoding_names as list_encoding_names

tiktoken/core.py
@@ -19,6 +19,21 @@ class Encoding:
         special_tokens: dict[str, int],
         explicit_n_vocab: Optional[int] = None,
     ):
+        """Creates an Encoding object.
+
+        See openai_public.py for examples of how to construct an Encoding object.
+
+        Args:
+            name: The name of the encoding. It should be clear from the name of the encoding
+                what behaviour to expect, in particular, encodings with different special tokens
+                should have different names.
+            pat_str: A regex pattern string that is used to split the input text.
+            mergeable_ranks: A dictionary mapping mergeable token bytes to their ranks. The ranks
+                must correspond to merge priority.
+            special_tokens: A dictionary mapping special token strings to their token values.
+            explicit_n_vocab: The number of tokens in the vocabulary. If provided, it is checked
+                that the number of mergeable tokens and special tokens is equal to this number.
+        """
         self.name = name
 
         self._pat_str = pat_str
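As a sketch of the documented arguments in use (not part of this commit): rebuild the gpt2 encoding under a new name and let `explicit_n_vocab` sanity-check the vocabulary size, 50256 mergeable tokens plus one special token. The name `gpt2_copy` is hypothetical.

```python
import tiktoken

gpt2 = tiktoken.get_encoding("gpt2")
enc = tiktoken.Encoding(
    name="gpt2_copy",  # hypothetical name, for illustration only
    pat_str=gpt2._pat_str,
    mergeable_ranks=gpt2._mergeable_ranks,
    special_tokens=gpt2._special_tokens,
    explicit_n_vocab=50257,  # 50256 mergeable tokens + "<|endoftext|>"
)
assert enc.encode("hello world") == gpt2.encode("hello world")
```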

tiktoken/load.py
@@ -4,6 +4,7 @@ import base64
 import hashlib
 import json
 import os
+import tempfile
 import uuid
 
 import blobfile
@@ -24,7 +25,7 @@ def read_file_cached(blobpath: str) -> bytes:
     elif "DATA_GYM_CACHE_DIR" in os.environ:
         cache_dir = os.environ["DATA_GYM_CACHE_DIR"]
     else:
-        cache_dir = "/tmp/data-gym-cache"
+        cache_dir = os.path.join(tempfile.gettempdir(), "data-gym-cache")
 
     if cache_dir == "":
         # disable caching
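The portability gain (the "caching logic" changelog entry) comes entirely from `tempfile.gettempdir()`, which honours `TMPDIR`/`TEMP`/`TMP` and otherwise falls back to a platform-appropriate default rather than assuming a Unix-style `/tmp`. A minimal illustration, not from the diff:

```python
import os
import tempfile

# Resolves to e.g. /tmp/data-gym-cache on Linux, but to something like
# C:\Users\<user>\AppData\Local\Temp\data-gym-cache on Windows.
cache_dir = os.path.join(tempfile.gettempdir(), "data-gym-cache")
print(cache_dir)
```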

tiktoken/model.py (new file)
@@ -0,0 +1,55 @@
+from __future__ import annotations
+
+from .core import Encoding
+from .registry import get_encoding
+
+# TODO: this will likely be replaced by an API endpoint
+MODEL_TO_ENCODING: dict[str, str] = {
+    # text
+    "text-davinci-003": "p50k_base",
+    "text-davinci-002": "p50k_base",
+    "text-davinci-001": "r50k_base",
+    "text-curie-001": "r50k_base",
+    "text-babbage-001": "r50k_base",
+    "text-ada-001": "r50k_base",
+    "davinci": "r50k_base",
+    "curie": "r50k_base",
+    "babbage": "r50k_base",
+    "ada": "r50k_base",
+    # code
+    "code-davinci-002": "p50k_base",
+    "code-davinci-001": "p50k_base",
+    "code-cushman-002": "p50k_base",
+    "code-cushman-001": "p50k_base",
+    "davinci-codex": "p50k_base",
+    "cushman-codex": "p50k_base",
+    # edit
+    "text-davinci-edit-001": "p50k_edit",
+    "code-davinci-edit-001": "p50k_edit",
+    # embeddings
+    "text-embedding-ada-002": "cl100k_base",
+    # old embeddings
+    "text-similarity-davinci-001": "r50k_base",
+    "text-similarity-curie-001": "r50k_base",
+    "text-similarity-babbage-001": "r50k_base",
+    "text-similarity-ada-001": "r50k_base",
+    "text-search-davinci-doc-001": "r50k_base",
+    "text-search-curie-doc-001": "r50k_base",
+    "text-search-babbage-doc-001": "r50k_base",
+    "text-search-ada-doc-001": "r50k_base",
+    "code-search-babbage-code-001": "r50k_base",
+    "code-search-ada-code-001": "r50k_base",
+    # open source
+    "gpt2": "gpt2",
+}
+
+
+def encoding_for_model(model_name: str) -> Encoding:
+    try:
+        encoding_name = MODEL_TO_ENCODING[model_name]
+    except KeyError:
+        raise KeyError(
+            f"Could not automatically map {model_name} to a tokeniser. "
+            "Please use `tiktoken.get_encoding` to explicitly get the tokeniser you expect."
+        ) from None
+    return get_encoding(encoding_name)
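Usage mirrors the new `test_encoding_for_model` test above. A short sketch; the model name `my-custom-model` and the fallback to `gpt2` are illustrative caller-side choices, not library behaviour:

```python
import tiktoken

enc = tiktoken.encoding_for_model("text-davinci-003")
assert enc.name == "p50k_base"

# Unknown model names raise KeyError (see the except branch above);
# choosing a fallback encoding is up to the caller.
try:
    enc = tiktoken.encoding_for_model("my-custom-model")  # hypothetical model name
except KeyError:
    enc = tiktoken.get_encoding("gpt2")
```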

tiktoken_ext/openai_public.py
@@ -47,6 +47,19 @@ def p50k_base():
     }
 
 
+def p50k_edit():
+    mergeable_ranks = load_tiktoken_bpe(
+        "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken"
+    )
+    special_tokens = {ENDOFTEXT: 50256, FIM_PREFIX: 50281, FIM_MIDDLE: 50282, FIM_SUFFIX: 50283}
+    return {
+        "name": "p50k_edit",
+        "pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
+        "mergeable_ranks": mergeable_ranks,
+        "special_tokens": special_tokens,
+    }
+
+
 def cl100k_base():
     mergeable_ranks = load_tiktoken_bpe(
         "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"
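The new `p50k_edit` encoding reuses the p50k ranks and registers the fill-in-the-middle special tokens. A hedged usage sketch, assuming the constants above carry their usual string values (`FIM_PREFIX = "<|fim_prefix|>"`, `FIM_SUFFIX = "<|fim_suffix|>"`) and that special tokens must be explicitly allowed when encoding:

```python
import tiktoken

enc = tiktoken.get_encoding("p50k_edit")

# Special tokens are rejected by default and must be opted into.
text = "<|fim_prefix|>def add(a, b):<|fim_suffix|>    return a + b"
tokens = enc.encode(text, allowed_special={"<|fim_prefix|>", "<|fim_suffix|>"})
assert enc.decode(tokens) == text
```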