Bump version, sync codebase
This commit is contained in:
parent
156eff92d2
commit
7830ed537b
@ -2,6 +2,10 @@
|
||||
|
||||
This is the changelog for the open source version of tiktoken.
|
||||
|
||||
## [v0.2.0]
|
||||
- Add ``tiktoken.encoding_for_model`` to get the encoding for a specific model
|
||||
- Improve portability of caching logic
|
||||
|
||||
## [v0.1.2]
|
||||
- Avoid use of `blobfile` for public files
|
||||
- Add support for Python 3.8
|
||||
|
@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "tiktoken"
|
||||
version = "0.1.0"
|
||||
version = "0.2.0"
|
||||
edition = "2021"
|
||||
rust-version = "1.57.0"
|
||||
|
||||
|
49
Makefile
49
Makefile
@ -1,49 +0,0 @@
|
||||
PROJECT := tiktoken
|
||||
|
||||
.PHONY: default
|
||||
default: editable_install
|
||||
|
||||
.PHONY: install_rust
|
||||
install_rust:
|
||||
which cargo >/dev/null || curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain 1.62
|
||||
|
||||
.PHONY: clean
|
||||
clean:
|
||||
cargo clean
|
||||
pip uninstall -y $(PROJECT)
|
||||
find . | grep -E '__pycache__|\.pyc' | xargs rm -rf
|
||||
find . | grep -E '\.so' | xargs rm -rf
|
||||
rm -rf dist/ build/
|
||||
rm -rf $(PROJECT).egg-info/
|
||||
|
||||
.PHONY: format
|
||||
format:
|
||||
@ which black >/dev/null || python3 -m pip install black
|
||||
@ which isort >/dev/null || python3 -m pip install isort
|
||||
cargo fmt -- --config group_imports=StdExternalCrate
|
||||
black --line-length 100 --skip-magic-trailing-comma --quiet .
|
||||
isort --line-length 100 --profile black --quiet .
|
||||
|
||||
|
||||
.PHONY: format_check
|
||||
format_check:
|
||||
@ which black >/dev/null || python3 -m pip install black
|
||||
@ which isort >/dev/null || python3 -m pip install isort
|
||||
cargo fmt --check -- --config group_imports=StdExternalCrate
|
||||
black --check --line-length 100 --skip-magic-trailing-comma --quiet .
|
||||
isort --check --line-length 100 --profile black --quiet .
|
||||
|
||||
.PHONY: lint
|
||||
lint:
|
||||
cargo clippy --all -- -D warnings
|
||||
@ which flake8 >/dev/null || python3 -m pip install flake8==5 flake8-bugbear==22.9.11
|
||||
flake8 --ignore=E203,E501,W503,E731 --per-file-ignores="$(PROJECT)/__init__.py:F401 setup.py:E402" --exclude=build .
|
||||
|
||||
.PHONY: editable_install
|
||||
editable_install:
|
||||
@ if [ -f $(PROJECT).egg-info ]; then \
|
||||
pip install --disable-pip-version-check --progress-bar=off setuptools wheel setuptools-rust ; \
|
||||
pip install --disable-pip-version-check --no-build-isolation -e . ; \
|
||||
else \
|
||||
pip install --disable-pip-version-check --no-deps --no-build-isolation --ignore-installed -e . ; \
|
||||
fi
|
76
README.md
76
README.md
@ -7,6 +7,9 @@ OpenAI's models.
|
||||
import tiktoken
|
||||
enc = tiktoken.get_encoding("gpt2")
|
||||
assert enc.decode(enc.encode("hello world")) == "hello world"
|
||||
|
||||
# To get the tokeniser corresponding to a specific model in the OpenAI API:
|
||||
enc = tiktoken.encoding_for_model("text-davinci-003")
|
||||
```
|
||||
|
||||
The open source version of `tiktoken` can be installed from PyPI:
|
||||
@ -16,7 +19,9 @@ pip install tiktoken
|
||||
|
||||
The tokeniser API is documented in `tiktoken/core.py`.
|
||||
|
||||
Example code using `tiktoken` can be found in the [OpenAI Cookbook](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb).
|
||||
Example code using `tiktoken` can be found in the
|
||||
[OpenAI Cookbook](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb).
|
||||
|
||||
|
||||
## Performance
|
||||
|
||||
@ -28,3 +33,72 @@ Performance measured on 1GB of text using the GPT-2 tokeniser, using `GPT2Tokeni
|
||||
`tokenizers==0.13.2` and `transformers==4.24.0`.
|
||||
|
||||
|
||||
## Getting help
|
||||
|
||||
Please post questions in the [issue tracker](https://github.com/openai/tiktoken/issues).
|
||||
|
||||
If you work at OpenAI, make sure to check the internal documentation or feel free to contact
|
||||
@shantanu.
|
||||
|
||||
|
||||
## Extending tiktoken
|
||||
|
||||
You may wish to extend `tiktoken` to support new encodings. There are two ways to do this.
|
||||
|
||||
|
||||
**Create your `Encoding` object exactly the way you want and simply pass it around.**
|
||||
|
||||
```python
|
||||
cl100k_base = tiktoken.get_encoding("cl100k_base")
|
||||
|
||||
# In production, load the arguments directly instead of accessing private attributes
|
||||
# See openai_public.py for examples of arguments for specific encodings
|
||||
enc = tiktoken.Encoding(
|
||||
# If you're changing the set of special tokens, make sure to use a different name
|
||||
# It should be clear from the name what behaviour to expect.
|
||||
name="cl100k_im",
|
||||
pat_str=cl100k_base._pat_str,
|
||||
mergeable_ranks=cl100k_base._mergeable_ranks,
|
||||
special_tokens={
|
||||
**cl100k_base._special_tokens,
|
||||
"<|im_start|>": 100264,
|
||||
"<|im_end|>": 100265,
|
||||
}
|
||||
)
|
||||
```
|
||||
|
||||
**Use the `tiktoken_ext` plugin mechanism to register your `Encoding` objects with `tiktoken`.**
|
||||
|
||||
This is only useful if you need `tiktoken.get_encoding` to find your encoding, otherwise prefer
|
||||
option 1.
|
||||
|
||||
To do this, you'll need to create a namespace package under `tiktoken_ext`.
|
||||
|
||||
Lay out your project like this, making sure to omit the `tiktoken_ext/__init__.py` file:
|
||||
```
|
||||
my_tiktoken_extension
|
||||
├── tiktoken_ext
|
||||
│ └── my_encodings.py
|
||||
└── setup.py
|
||||
```
|
||||
|
||||
`my_encodings.py` should be a module that contains a variable named `ENCODING_CONSTRUCTORS`.
|
||||
This is a dictionary from an encoding name to a function that takes no arguments and returns
|
||||
arguments that can be passed to `tiktoken.Encoding` to construct that encoding. For an example, see
|
||||
`tiktoken_ext/openai_public.py`. For precise details, see `tiktoken/registry.py`.
|
||||
|
||||
Your `setup.py` should look something like this:
|
||||
```python
|
||||
from setuptools import setup, find_namespace_packages
|
||||
|
||||
setup(
|
||||
name="my_tiktoken_extension",
|
||||
packages=find_namespace_packages(include=['tiktoken_ext.*']),
|
||||
install_requires=["tiktoken"],
|
||||
...
|
||||
)
|
||||
```
|
||||
|
||||
Then simply `pip install my_tiktoken_extension` and you should be able to use your custom encodings!
|
||||
Make sure **not** to use an editable install.
|
||||
|
||||
|
@ -1,12 +1,12 @@
|
||||
[project]
|
||||
name = "tiktoken"
|
||||
version = "0.2.0"
|
||||
dependencies = ["blobfile>=2", "regex>=2022.1.18", "requests>=2.26.0"]
|
||||
dynamic = ["version"]
|
||||
requires-python = ">=3.8"
|
||||
|
||||
[build-system]
|
||||
build-backend = "setuptools.build_meta"
|
||||
requires = ["setuptools>=61", "wheel", "setuptools-rust>=1.3"]
|
||||
requires = ["setuptools>=62.4", "wheel", "setuptools-rust>=1.5.2"]
|
||||
|
||||
[tool.cibuildwheel]
|
||||
build-frontend = "build"
|
||||
|
6
setup.py
6
setup.py
@ -1,14 +1,8 @@
|
||||
from setuptools import setup
|
||||
from setuptools_rust import Binding, RustExtension
|
||||
|
||||
public = True
|
||||
|
||||
if public:
|
||||
version = "0.1.2"
|
||||
|
||||
setup(
|
||||
name="tiktoken",
|
||||
version=version,
|
||||
rust_extensions=[
|
||||
RustExtension(
|
||||
"tiktoken._tiktoken",
|
||||
|
@ -17,3 +17,10 @@ def test_simple():
|
||||
enc = tiktoken.get_encoding(enc_name)
|
||||
for token in range(10_000):
|
||||
assert enc.encode_single_token(enc.decode_single_token_bytes(token)) == token
|
||||
|
||||
|
||||
def test_encoding_for_model():
    """Check that known model names resolve to the expected encoding name."""
    expected_by_model = (
        ("gpt2", "gpt2"),
        ("text-davinci-003", "p50k_base"),
    )
    for model_name, encoding_name in expected_by_model:
        assert tiktoken.encoding_for_model(model_name).name == encoding_name
|
||||
|
@ -1,3 +1,4 @@
|
||||
from .core import Encoding as Encoding
|
||||
from .model import encoding_for_model as encoding_for_model
|
||||
from .registry import get_encoding as get_encoding
|
||||
from .registry import list_encoding_names as list_encoding_names
|
||||
|
@ -19,6 +19,21 @@ class Encoding:
|
||||
special_tokens: dict[str, int],
|
||||
explicit_n_vocab: Optional[int] = None,
|
||||
):
|
||||
"""Creates an Encoding object.
|
||||
|
||||
See openai_public.py for examples of how to construct an Encoding object.
|
||||
|
||||
Args:
|
||||
name: The name of the encoding. It should be clear from the name of the encoding
|
||||
what behaviour to expect, in particular, encodings with different special tokens
|
||||
should have different names.
|
||||
pat_str: A regex pattern string that is used to split the input text.
|
||||
mergeable_ranks: A dictionary mapping mergeable token bytes to their ranks. The ranks
|
||||
must correspond to merge priority.
|
||||
special_tokens: A dictionary mapping special token strings to their token values.
|
||||
explicit_n_vocab: The number of tokens in the vocabulary. If provided, it is checked
|
||||
that the number of mergeable tokens and special tokens is equal to this number.
|
||||
"""
|
||||
self.name = name
|
||||
|
||||
self._pat_str = pat_str
|
||||
|
@ -4,6 +4,7 @@ import base64
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import tempfile
|
||||
import uuid
|
||||
|
||||
import blobfile
|
||||
@ -24,7 +25,7 @@ def read_file_cached(blobpath: str) -> bytes:
|
||||
elif "DATA_GYM_CACHE_DIR" in os.environ:
|
||||
cache_dir = os.environ["DATA_GYM_CACHE_DIR"]
|
||||
else:
|
||||
cache_dir = "/tmp/data-gym-cache"
|
||||
cache_dir = os.path.join(tempfile.gettempdir(), "data-gym-cache")
|
||||
|
||||
if cache_dir == "":
|
||||
# disable caching
|
||||
|
55
tiktoken/model.py
Normal file
55
tiktoken/model.py
Normal file
@ -0,0 +1,55 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from .core import Encoding
|
||||
from .registry import get_encoding
|
||||
|
||||
# TODO: this will likely be replaced by an API endpoint
MODEL_TO_ENCODING: dict[str, str] = {
    # text
    "text-davinci-003": "p50k_base",
    "text-davinci-002": "p50k_base",
    "text-davinci-001": "r50k_base",
    "text-curie-001": "r50k_base",
    "text-babbage-001": "r50k_base",
    "text-ada-001": "r50k_base",
    "davinci": "r50k_base",
    "curie": "r50k_base",
    "babbage": "r50k_base",
    "ada": "r50k_base",
    # code
    "code-davinci-002": "p50k_base",
    "code-davinci-001": "p50k_base",
    "code-cushman-002": "p50k_base",
    "code-cushman-001": "p50k_base",
    "davinci-codex": "p50k_base",
    "cushman-codex": "p50k_base",
    # edit
    "text-davinci-edit-001": "p50k_edit",
    "code-davinci-edit-001": "p50k_edit",
    # embeddings
    "text-embedding-ada-002": "cl100k_base",
    # old embeddings
    "text-similarity-davinci-001": "r50k_base",
    "text-similarity-curie-001": "r50k_base",
    "text-similarity-babbage-001": "r50k_base",
    "text-similarity-ada-001": "r50k_base",
    "text-search-davinci-doc-001": "r50k_base",
    "text-search-curie-doc-001": "r50k_base",
    "text-search-babbage-doc-001": "r50k_base",
    "text-search-ada-doc-001": "r50k_base",
    "code-search-babbage-code-001": "r50k_base",
    "code-search-ada-code-001": "r50k_base",
    # open source
    "gpt2": "gpt2",
}


def encoding_for_model(model_name: str) -> Encoding:
    """Return the encoding used by the model called ``model_name``.

    The model name is looked up in ``MODEL_TO_ENCODING`` and the resulting
    encoding name is passed to ``get_encoding``.

    Args:
        model_name: Exact model name, e.g. ``"text-davinci-003"``.

    Raises:
        KeyError: If ``model_name`` has no entry in ``MODEL_TO_ENCODING``.
    """
    try:
        encoding_name = MODEL_TO_ENCODING[model_name]
    except KeyError:
        # `from None` hides the bare dict-lookup KeyError so that callers only
        # see the actionable message below.
        raise KeyError(
            f"Could not automatically map {model_name} to a tokeniser. "
            "Please use `tiktoken.get_encoding` to explicitly get the tokeniser you expect."
        ) from None
    return get_encoding(encoding_name)
|
@ -47,6 +47,19 @@ def p50k_base():
|
||||
}
|
||||
|
||||
|
||||
def p50k_edit():
    """Return the keyword arguments describing the ``p50k_edit`` encoding.

    The merge ranks are loaded from the public ``p50k_base`` BPE file; the
    special tokens are ``ENDOFTEXT`` plus ``FIM_PREFIX``, ``FIM_MIDDLE`` and
    ``FIM_SUFFIX``.
    """
    return {
        "name": "p50k_edit",
        "pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
        "mergeable_ranks": load_tiktoken_bpe(
            "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken"
        ),
        "special_tokens": {
            ENDOFTEXT: 50256,
            FIM_PREFIX: 50281,
            FIM_MIDDLE: 50282,
            FIM_SUFFIX: 50283,
        },
    }
|
||||
|
||||
|
||||
def cl100k_base():
|
||||
mergeable_ranks = load_tiktoken_bpe(
|
||||
"https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"
|
||||
|
Loading…
x
Reference in New Issue
Block a user