Bump version, sync codebase

Shantanu Jain 2023-02-03 11:46:09 -08:00
parent 156eff92d2
commit 7830ed537b
12 changed files with 175 additions and 60 deletions

View File

@ -2,6 +2,10 @@
This is the changelog for the open source version of tiktoken.
## [v0.2.0]
- Add ``tiktoken.encoding_for_model`` to get the encoding for a specific model
- Improve portability of caching logic
## [v0.1.2]
- Avoid use of `blobfile` for public files
- Add support for Python 3.8

View File

@ -1,6 +1,6 @@
[package]
name = "tiktoken"
version = "0.1.0"
version = "0.2.0"
edition = "2021"
rust-version = "1.57.0"

View File

@ -1,49 +0,0 @@
PROJECT := tiktoken

.PHONY: default
default: editable_install

.PHONY: install_rust
install_rust:
	which cargo >/dev/null || curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain 1.62

.PHONY: clean
clean:
	cargo clean
	pip uninstall -y $(PROJECT)
	find . | grep -E '__pycache__|\.pyc' | xargs rm -rf
	find . | grep -E '\.so' | xargs rm -rf
	rm -rf dist/ build/
	rm -rf $(PROJECT).egg-info/

.PHONY: format
format:
	@ which black >/dev/null || python3 -m pip install black
	@ which isort >/dev/null || python3 -m pip install isort
	cargo fmt -- --config group_imports=StdExternalCrate
	black --line-length 100 --skip-magic-trailing-comma --quiet .
	isort --line-length 100 --profile black --quiet .

.PHONY: format_check
format_check:
	@ which black >/dev/null || python3 -m pip install black
	@ which isort >/dev/null || python3 -m pip install isort
	cargo fmt --check -- --config group_imports=StdExternalCrate
	black --check --line-length 100 --skip-magic-trailing-comma --quiet .
	isort --check --line-length 100 --profile black --quiet .

.PHONY: lint
lint:
	cargo clippy --all -- -D warnings
	@ which flake8 >/dev/null || python3 -m pip install flake8==5 flake8-bugbear==22.9.11
	flake8 --ignore=E203,E501,W503,E731 --per-file-ignores="$(PROJECT)/__init__.py:F401 setup.py:E402" --exclude=build .

.PHONY: editable_install
editable_install:
	@ if [ -f $(PROJECT).egg-info ]; then \
		pip install --disable-pip-version-check --progress-bar=off setuptools wheel setuptools-rust ; \
		pip install --disable-pip-version-check --no-build-isolation -e . ; \
	else \
		pip install --disable-pip-version-check --no-deps --no-build-isolation --ignore-installed -e . ; \
	fi

View File

@ -7,6 +7,9 @@ OpenAI's models.
import tiktoken
enc = tiktoken.get_encoding("gpt2")
assert enc.decode(enc.encode("hello world")) == "hello world"
# To get the tokeniser corresponding to a specific model in the OpenAI API:
enc = tiktoken.encoding_for_model("text-davinci-003")
```
The open source version of `tiktoken` can be installed from PyPI:
@ -16,7 +19,9 @@ pip install tiktoken
The tokeniser API is documented in `tiktoken/core.py`.
Example code using `tiktoken` can be found in the [OpenAI Cookbook](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb).
Example code using `tiktoken` can be found in the
[OpenAI Cookbook](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb).
## Performance
@ -28,3 +33,72 @@ Performance measured on 1GB of text using the GPT-2 tokeniser, using `GPT2TokenizerFast` from
`tokenizers==0.13.2` and `transformers==4.24.0`.
## Getting help
Please post questions in the [issue tracker](https://github.com/openai/tiktoken/issues).
If you work at OpenAI, make sure to check the internal documentation or feel free to contact
@shantanu.
## Extending tiktoken
You may wish to extend `tiktoken` to support new encodings. There are two ways to do this.
**Create your `Encoding` object exactly the way you want and simply pass it around.**
```python
cl100k_base = tiktoken.get_encoding("cl100k_base")
# In production, load the arguments directly instead of accessing private attributes
# See openai_public.py for examples of arguments for specific encodings
enc = tiktoken.Encoding(
    # If you're changing the set of special tokens, make sure to use a different name
    # It should be clear from the name what behaviour to expect.
    name="cl100k_im",
    pat_str=cl100k_base._pat_str,
    mergeable_ranks=cl100k_base._mergeable_ranks,
    special_tokens={
        **cl100k_base._special_tokens,
        "<|im_start|>": 100264,
        "<|im_end|>": 100265,
    },
)
```
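For example, the extended encoding can then encode text containing the new special tokens.
Special tokens are disallowed during encoding by default, so you must opt in (this sketch
assumes the `allowed_special` argument to `Encoding.encode` in this version):

```python
tokens = enc.encode("<|im_start|>hello<|im_end|>", allowed_special="all")
assert tokens[0] == 100264 and tokens[-1] == 100265
```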
**Use the `tiktoken_ext` plugin mechanism to register your `Encoding` objects with `tiktoken`.**
This is only useful if you need `tiktoken.get_encoding` to find your encoding, otherwise prefer
option 1.
To do this, you'll need to create a namespace package under `tiktoken_ext`.
Lay out your project like this, making sure to omit the `tiktoken_ext/__init__.py` file:
```
my_tiktoken_extension
├── tiktoken_ext
│   └── my_encodings.py
└── setup.py
```
`my_encodings.py` should be a module that contains a variable named `ENCODING_CONSTRUCTORS`.
This is a dictionary from an encoding name to a function that takes no arguments and returns
arguments that can be passed to `tiktoken.Encoding` to construct that encoding. For an example, see
`tiktoken_ext/openai_public.py`. For precise details, see `tiktoken/registry.py`.
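For illustration, a minimal `my_encodings.py` might look like the following sketch; the
encoding name and the extra special token are made up, and it reuses private attributes in
the same way as option 1 above:

```python
# tiktoken_ext/my_encodings.py
import tiktoken


def my_encoding():
    base = tiktoken.get_encoding("gpt2")
    return {
        "name": "my_encoding",
        "pat_str": base._pat_str,
        "mergeable_ranks": base._mergeable_ranks,
        # gpt2 uses token values up to 50256, so 50257 is free for a new special token
        "special_tokens": {**base._special_tokens, "<|my_special|>": 50257},
    }


ENCODING_CONSTRUCTORS = {"my_encoding": my_encoding}
```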
Your `setup.py` should look something like this:
```python
from setuptools import setup, find_namespace_packages

setup(
    name="my_tiktoken_extension",
    packages=find_namespace_packages(include=['tiktoken_ext*']),
    install_requires=["tiktoken"],
    ...
)
```
Then simply `pip install my_tiktoken_extension` and you should be able to use your custom encodings!
Make sure **not** to use an editable install.
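After installing, a quick sanity check that the plugin is discoverable (using the
hypothetical names from the example above):

```python
import tiktoken

# get_encoding imports tiktoken_ext namespace packages and looks up ENCODING_CONSTRUCTORS
enc = tiktoken.get_encoding("my_encoding")
assert "my_encoding" in tiktoken.list_encoding_names()
```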

View File

@ -1,12 +1,12 @@
[project]
name = "tiktoken"
version = "0.2.0"
dependencies = ["blobfile>=2", "regex>=2022.1.18", "requests>=2.26.0"]
dynamic = ["version"]
requires-python = ">=3.8"
[build-system]
build-backend = "setuptools.build_meta"
requires = ["setuptools>=61", "wheel", "setuptools-rust>=1.3"]
requires = ["setuptools>=62.4", "wheel", "setuptools-rust>=1.5.2"]
[tool.cibuildwheel]
build-frontend = "build"

View File

@ -1,14 +1,8 @@
from setuptools import setup
from setuptools_rust import Binding, RustExtension
public = True

if public:
    version = "0.1.2"

setup(
    name="tiktoken",
    version=version,
    rust_extensions=[
        RustExtension(
            "tiktoken._tiktoken",

View File

@ -17,3 +17,10 @@ def test_simple():
        enc = tiktoken.get_encoding(enc_name)
        for token in range(10_000):
            assert enc.encode_single_token(enc.decode_single_token_bytes(token)) == token


def test_encoding_for_model():
    enc = tiktoken.encoding_for_model("gpt2")
    assert enc.name == "gpt2"
    enc = tiktoken.encoding_for_model("text-davinci-003")
    assert enc.name == "p50k_base"

View File

@ -1,3 +1,4 @@
from .core import Encoding as Encoding
from .model import encoding_for_model as encoding_for_model
from .registry import get_encoding as get_encoding
from .registry import list_encoding_names as list_encoding_names

View File

@ -19,6 +19,21 @@ class Encoding:
        special_tokens: dict[str, int],
        explicit_n_vocab: Optional[int] = None,
    ):
        """Creates an Encoding object.

        See openai_public.py for examples of how to construct an Encoding object.

        Args:
            name: The name of the encoding. It should be clear from the name of the encoding
                what behaviour to expect, in particular, encodings with different special tokens
                should have different names.
            pat_str: A regex pattern string that is used to split the input text.
            mergeable_ranks: A dictionary mapping mergeable token bytes to their ranks. The ranks
                must correspond to merge priority.
            special_tokens: A dictionary mapping special token strings to their token values.
            explicit_n_vocab: The number of tokens in the vocabulary. If provided, it is checked
                that the number of mergeable tokens and special tokens is equal to this number.
        """
        self.name = name

        self._pat_str = pat_str
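
As a concrete illustration of these arguments, here is a toy encoding (not from the repo;
the pattern, ranks and special token are made up):

```python
import tiktoken

toy = tiktoken.Encoding(
    name="toy",
    pat_str=r"\S+|\s+",  # split into runs of non-whitespace / whitespace
    mergeable_ranks={b"a": 0, b"b": 1, b"ab": 2},  # lower rank = higher merge priority
    special_tokens={"<|endoftext|>": 3},
    explicit_n_vocab=4,  # 3 mergeable tokens + 1 special token
)
assert toy.encode("ab") == [2]
```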

View File

@ -4,6 +4,7 @@ import base64
import hashlib
import json
import os
import tempfile
import uuid
import blobfile
@ -24,7 +25,7 @@ def read_file_cached(blobpath: str) -> bytes:
elif "DATA_GYM_CACHE_DIR" in os.environ:
cache_dir = os.environ["DATA_GYM_CACHE_DIR"]
else:
cache_dir = "/tmp/data-gym-cache"
cache_dir = os.path.join(tempfile.gettempdir(), "data-gym-cache")
if cache_dir == "":
# disable caching
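
The portability fix stops hardcoding `/tmp`; a minimal sketch of the fallback visible in
this hunk (the first branch of the conditional sits outside the diff context):

```python
import os
import tempfile

# Honour an explicit DATA_GYM_CACHE_DIR, otherwise use a "data-gym-cache"
# folder under the platform's temp dir (e.g. /tmp on Linux, %TEMP% on Windows).
cache_dir = os.environ.get(
    "DATA_GYM_CACHE_DIR", os.path.join(tempfile.gettempdir(), "data-gym-cache")
)
```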

tiktoken/model.py Normal file
View File

@ -0,0 +1,55 @@
from __future__ import annotations

from .core import Encoding
from .registry import get_encoding

# TODO: this will likely be replaced by an API endpoint
MODEL_TO_ENCODING: dict[str, str] = {
    # text
    "text-davinci-003": "p50k_base",
    "text-davinci-002": "p50k_base",
    "text-davinci-001": "r50k_base",
    "text-curie-001": "r50k_base",
    "text-babbage-001": "r50k_base",
    "text-ada-001": "r50k_base",
    "davinci": "r50k_base",
    "curie": "r50k_base",
    "babbage": "r50k_base",
    "ada": "r50k_base",
    # code
    "code-davinci-002": "p50k_base",
    "code-davinci-001": "p50k_base",
    "code-cushman-002": "p50k_base",
    "code-cushman-001": "p50k_base",
    "davinci-codex": "p50k_base",
    "cushman-codex": "p50k_base",
    # edit
    "text-davinci-edit-001": "p50k_edit",
    "code-davinci-edit-001": "p50k_edit",
    # embeddings
    "text-embedding-ada-002": "cl100k_base",
    # old embeddings
    "text-similarity-davinci-001": "r50k_base",
    "text-similarity-curie-001": "r50k_base",
    "text-similarity-babbage-001": "r50k_base",
    "text-similarity-ada-001": "r50k_base",
    "text-search-davinci-doc-001": "r50k_base",
    "text-search-curie-doc-001": "r50k_base",
    "text-search-babbage-doc-001": "r50k_base",
    "text-search-ada-doc-001": "r50k_base",
    "code-search-babbage-code-001": "r50k_base",
    "code-search-ada-code-001": "r50k_base",
    # open source
    "gpt2": "gpt2",
}


def encoding_for_model(model_name: str) -> Encoding:
    try:
        encoding_name = MODEL_TO_ENCODING[model_name]
    except KeyError:
        raise KeyError(
            f"Could not automatically map {model_name} to a tokeniser. "
            "Please use `tiktoken.get_encoding` to explicitly get the tokeniser you expect."
        ) from None
    return get_encoding(encoding_name)
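
Usage follows the mapping table above, for example:

```python
import tiktoken

enc = tiktoken.encoding_for_model("code-davinci-002")
assert enc.name == "p50k_base"
```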

View File

@ -47,6 +47,19 @@ def p50k_base():
    }


def p50k_edit():
    mergeable_ranks = load_tiktoken_bpe(
        "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken"
    )
    special_tokens = {ENDOFTEXT: 50256, FIM_PREFIX: 50281, FIM_MIDDLE: 50282, FIM_SUFFIX: 50283}
    return {
        "name": "p50k_edit",
        "pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
        "mergeable_ranks": mergeable_ranks,
        "special_tokens": special_tokens,
    }


def cl100k_base():
    mergeable_ranks = load_tiktoken_bpe(
        "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken