From 7830ed537badecefb5a357448be722bfd58f9fca Mon Sep 17 00:00:00 2001
From: Shantanu Jain
Date: Fri, 3 Feb 2023 11:46:09 -0800
Subject: [PATCH] Bump version, sync codebase

---
 CHANGELOG.md                  |  4 ++
 Cargo.toml                    |  2 +-
 Makefile                      | 49 ----------------------
 README.md                     | 76 ++++++++++++++++++++++++++++++++++-
 pyproject.toml                |  4 +-
 setup.py                      |  6 ---
 tests/test_simple_public.py   |  7 ++++
 tiktoken/__init__.py          |  1 +
 tiktoken/core.py              | 15 +++++++
 tiktoken/load.py              |  3 +-
 tiktoken/model.py             | 55 +++++++++++++++++++++++++
 tiktoken_ext/openai_public.py | 13 ++++++
 12 files changed, 175 insertions(+), 60 deletions(-)
 delete mode 100644 Makefile
 create mode 100644 tiktoken/model.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a606553..114ffff 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,10 @@
 
 This is the changelog for the open source version of tiktoken.
 
+## [v0.2.0]
+- Add ``tiktoken.encoding_for_model`` to get the encoding for a specific model
+- Improve portability of caching logic
+
 ## [v0.1.2]
 - Avoid use of `blobfile` for public files
 - Add support for Python 3.8
diff --git a/Cargo.toml b/Cargo.toml
index 24b42fd..1fb806b 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tiktoken"
-version = "0.1.0"
+version = "0.2.0"
 edition = "2021"
 rust-version = "1.57.0"
 
diff --git a/Makefile b/Makefile
deleted file mode 100644
index 92aec0f..0000000
--- a/Makefile
+++ /dev/null
@@ -1,49 +0,0 @@
-PROJECT := tiktoken
-
-.PHONY: default
-default: editable_install
-
-.PHONY: install_rust
-install_rust:
-	which cargo >/dev/null || curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain 1.62
-
-.PHONY: clean
-clean:
-	cargo clean
-	pip uninstall -y $(PROJECT)
-	find . | grep -E '__pycache__|\.pyc' | xargs rm -rf
-	find . | grep -E '\.so' | xargs rm -rf
-	rm -rf dist/ build/
-	rm -rf $(PROJECT).egg-info/
-
-.PHONY: format
-format:
-	@ which black >/dev/null || python3 -m pip install black
-	@ which isort >/dev/null || python3 -m pip install isort
-	cargo fmt -- --config group_imports=StdExternalCrate
-	black --line-length 100 --skip-magic-trailing-comma --quiet .
-	isort --line-length 100 --profile black --quiet .
-
-
-.PHONY: format_check
-format_check:
-	@ which black >/dev/null || python3 -m pip install black
-	@ which isort >/dev/null || python3 -m pip install isort
-	cargo fmt --check -- --config group_imports=StdExternalCrate
-	black --check --line-length 100 --skip-magic-trailing-comma --quiet .
-	isort --check --line-length 100 --profile black --quiet .
-
-.PHONY: lint
-lint:
-	cargo clippy --all -- -D warnings
-	@ which flake8 >/dev/null || python3 -m pip install flake8==5 flake8-bugbear==22.9.11
-	flake8 --ignore=E203,E501,W503,E731 --per-file-ignores="$(PROJECT)/__init__.py:F401 setup.py:E402" --exclude=build .
-
-.PHONY: editable_install
-editable_install:
-	@ if [ -f $(PROJECT).egg-info ]; then \
-		pip install --disable-pip-version-check --progress-bar=off setuptools wheel setuptools-rust ; \
-		pip install --disable-pip-version-check --no-build-isolation -e . ; \
-	else \
-		pip install --disable-pip-version-check --no-deps --no-build-isolation --ignore-installed -e . ; \
-	fi
diff --git a/README.md b/README.md
index 36b9af5..d9e461d 100644
--- a/README.md
+++ b/README.md
@@ -7,6 +7,9 @@ OpenAI's models.
 import tiktoken
 enc = tiktoken.get_encoding("gpt2")
 assert enc.decode(enc.encode("hello world")) == "hello world"
+
+# To get the tokeniser corresponding to a specific model in the OpenAI API:
+enc = tiktoken.encoding_for_model("text-davinci-003")
 ```
 
 The open source version of `tiktoken` can be installed from PyPI:
@@ -16,7 +19,9 @@ pip install tiktoken
 
 The tokeniser API is documented in `tiktoken/core.py`.
 
-Example code using `tiktoken` can be found in the [OpenAI Cookbook](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb).
+Example code using `tiktoken` can be found in the
+[OpenAI Cookbook](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb).
+
 
 ## Performance
@@ -28,3 +33,72 @@ Performance measured on 1GB of text using the GPT-2 tokeniser, using `GPT2Tokeni
 `tokenizers==0.13.2` and `transformers==4.24.0`.
 
 
+## Getting help
+
+Please post questions in the [issue tracker](https://github.com/openai/tiktoken/issues).
+
+If you work at OpenAI, make sure to check the internal documentation or feel free to contact
+@shantanu.
+
+
+## Extending tiktoken
+
+You may wish to extend `tiktoken` to support new encodings. There are two ways to do this.
+
+
+**Create your `Encoding` object exactly the way you want and simply pass it around.**
+
+```python
+cl100k_base = tiktoken.get_encoding("cl100k_base")
+
+# In production, load the arguments directly instead of accessing private attributes
+# See openai_public.py for examples of arguments for specific encodings
+enc = tiktoken.Encoding(
+    # If you're changing the set of special tokens, make sure to use a different name
+    # It should be clear from the name what behaviour to expect.
+    name="cl100k_im",
+    pat_str=cl100k_base._pat_str,
+    mergeable_ranks=cl100k_base._mergeable_ranks,
+    special_tokens={
+        **cl100k_base._special_tokens,
+        "<|im_start|>": 100264,
+        "<|im_end|>": 100265,
+    }
+)
+```
+
+**Use the `tiktoken_ext` plugin mechanism to register your `Encoding` objects with `tiktoken`.**
+
+This is only useful if you need `tiktoken.get_encoding` to find your encoding, otherwise prefer
+option 1.
+
+To do this, you'll need to create a namespace package under `tiktoken_ext`.
+
+Lay out your project like this, making sure to omit the `tiktoken_ext/__init__.py` file:
+```
+my_tiktoken_extension
+├── tiktoken_ext
+│   └── my_encodings.py
+└── setup.py
+```
+
+`my_encodings.py` should be a module that contains a variable named `ENCODING_CONSTRUCTORS`.
+This is a dictionary from an encoding name to a function that takes no arguments and returns
+arguments that can be passed to `tiktoken.Encoding` to construct that encoding. For an example, see
+`tiktoken_ext/openai_public.py`. For precise details, see `tiktoken/registry.py`.
+
+Your `setup.py` should look something like this:
+```python
+from setuptools import setup, find_namespace_packages
+
+setup(
+    name="my_tiktoken_extension",
+    packages=find_namespace_packages(include=['tiktoken_ext.*']),
+    install_requires=["tiktoken"],
+    ...
+)
+```
+
+Then simply `pip install my_tiktoken_extension` and you should be able to use your custom encodings!
+Make sure **not** to use an editable install.
+
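The README section above describes the `ENCODING_CONSTRUCTORS` contract only in prose. For concreteness, a minimal `my_encodings.py` satisfying that contract might look like the sketch below; the encoding name `my_base_256` and its byte-level ranks are illustrative placeholders, not part of this patch:

```python
# my_encodings.py -- hypothetical tiktoken_ext plugin module (not part of this patch).
# A real plugin would load genuine BPE merge ranks (e.g. via
# tiktoken.load.load_tiktoken_bpe); the byte-level ranks below are toy values.


def my_base_256():
    # Rank every single byte as its own token. No merges are possible, so
    # encoding simply yields the raw bytes of each regex-split piece.
    mergeable_ranks = {bytes([i]): i for i in range(256)}
    return {
        "name": "my_base_256",
        "pat_str": r"""\s+|\S+""",  # split into runs of whitespace / non-whitespace
        "mergeable_ranks": mergeable_ranks,
        "special_tokens": {"<|endoftext|>": 256},
    }


# tiktoken.get_encoding checks this dict in every module it finds under the
# tiktoken_ext namespace package; see tiktoken/registry.py.
ENCODING_CONSTRUCTORS = {"my_base_256": my_base_256}
```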
diff --git a/pyproject.toml b/pyproject.toml
index 4fcd2ec..6fb57bb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,12 +1,12 @@
 [project]
 name = "tiktoken"
+version = "0.2.0"
 dependencies = ["blobfile>=2", "regex>=2022.1.18", "requests>=2.26.0"]
-dynamic = ["version"]
 requires-python = ">=3.8"
 
 [build-system]
 build-backend = "setuptools.build_meta"
-requires = ["setuptools>=61", "wheel", "setuptools-rust>=1.3"]
+requires = ["setuptools>=62.4", "wheel", "setuptools-rust>=1.5.2"]
 
 [tool.cibuildwheel]
 build-frontend = "build"
diff --git a/setup.py b/setup.py
index 179392b..a22e8e5 100644
--- a/setup.py
+++ b/setup.py
@@ -1,14 +1,8 @@
 from setuptools import setup
 from setuptools_rust import Binding, RustExtension
 
-public = True
-
-if public:
-    version = "0.1.2"
-
 setup(
     name="tiktoken",
-    version=version,
     rust_extensions=[
         RustExtension(
             "tiktoken._tiktoken",
diff --git a/tests/test_simple_public.py b/tests/test_simple_public.py
index 2987e4e..4410923 100644
--- a/tests/test_simple_public.py
+++ b/tests/test_simple_public.py
@@ -17,3 +17,10 @@ def test_simple():
         enc = tiktoken.get_encoding(enc_name)
         for token in range(10_000):
             assert enc.encode_single_token(enc.decode_single_token_bytes(token)) == token
+
+
+def test_encoding_for_model():
+    enc = tiktoken.encoding_for_model("gpt2")
+    assert enc.name == "gpt2"
+    enc = tiktoken.encoding_for_model("text-davinci-003")
+    assert enc.name == "p50k_base"
diff --git a/tiktoken/__init__.py b/tiktoken/__init__.py
index f4b5065..9ad09a3 100644
--- a/tiktoken/__init__.py
+++ b/tiktoken/__init__.py
@@ -1,3 +1,4 @@
 from .core import Encoding as Encoding
+from .model import encoding_for_model as encoding_for_model
 from .registry import get_encoding as get_encoding
 from .registry import list_encoding_names as list_encoding_names
diff --git a/tiktoken/core.py b/tiktoken/core.py
index b3a475a..05613aa 100644
--- a/tiktoken/core.py
+++ b/tiktoken/core.py
@@ -19,6 +19,21 @@ class Encoding:
         special_tokens: dict[str, int],
         explicit_n_vocab: Optional[int] = None,
     ):
+        """Creates an Encoding object.
+
+        See openai_public.py for examples of how to construct an Encoding object.
+
+        Args:
+            name: The name of the encoding. It should be clear from the name of the encoding
+                what behaviour to expect; in particular, encodings with different special tokens
+                should have different names.
+            pat_str: A regex pattern string that is used to split the input text.
+            mergeable_ranks: A dictionary mapping mergeable token bytes to their ranks. The ranks
+                must correspond to merge priority.
+            special_tokens: A dictionary mapping special token strings to their token values.
+            explicit_n_vocab: The number of tokens in the vocabulary. If provided, it is checked
+                that the number of mergeable tokens and special tokens is equal to this number.
+        """
         self.name = name
 
         self._pat_str = pat_str
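To make the documented constructor arguments concrete: the toy encoding below wires them together, with `explicit_n_vocab` checking that 256 mergeable byte tokens plus one special token equal 257. This is an illustrative sketch, not an encoding shipped by tiktoken:

```python
import tiktoken

# Toy byte-level encoding: 256 single-byte mergeable tokens plus one special token.
enc = tiktoken.Encoding(
    name="byte_demo",
    pat_str=r"""\s+|\S+""",
    mergeable_ranks={bytes([i]): i for i in range(256)},
    special_tokens={"<|endoftext|>": 256},
    explicit_n_vocab=257,  # 256 mergeable tokens + 1 special token
)
assert enc.encode("hi") == [104, 105]  # the UTF-8 bytes of "hi"
```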
diff --git a/tiktoken/load.py b/tiktoken/load.py
index fefd62a..c588106 100644
--- a/tiktoken/load.py
+++ b/tiktoken/load.py
@@ -4,6 +4,7 @@ import base64
 import hashlib
 import json
 import os
+import tempfile
 import uuid
 
 import blobfile
@@ -24,7 +25,7 @@ def read_file_cached(blobpath: str) -> bytes:
     elif "DATA_GYM_CACHE_DIR" in os.environ:
         cache_dir = os.environ["DATA_GYM_CACHE_DIR"]
     else:
-        cache_dir = "/tmp/data-gym-cache"
+        cache_dir = os.path.join(tempfile.gettempdir(), "data-gym-cache")
 
     if cache_dir == "":
         # disable caching
diff --git a/tiktoken/model.py b/tiktoken/model.py
new file mode 100644
index 0000000..66e9e04
--- /dev/null
+++ b/tiktoken/model.py
@@ -0,0 +1,55 @@
+from __future__ import annotations
+
+from .core import Encoding
+from .registry import get_encoding
+
+# TODO: this will likely be replaced by an API endpoint
+MODEL_TO_ENCODING: dict[str, str] = {
+    # text
+    "text-davinci-003": "p50k_base",
+    "text-davinci-002": "p50k_base",
+    "text-davinci-001": "r50k_base",
+    "text-curie-001": "r50k_base",
+    "text-babbage-001": "r50k_base",
+    "text-ada-001": "r50k_base",
+    "davinci": "r50k_base",
+    "curie": "r50k_base",
+    "babbage": "r50k_base",
+    "ada": "r50k_base",
+    # code
+    "code-davinci-002": "p50k_base",
+    "code-davinci-001": "p50k_base",
+    "code-cushman-002": "p50k_base",
+    "code-cushman-001": "p50k_base",
+    "davinci-codex": "p50k_base",
+    "cushman-codex": "p50k_base",
+    # edit
+    "text-davinci-edit-001": "p50k_edit",
+    "code-davinci-edit-001": "p50k_edit",
+    # embeddings
+    "text-embedding-ada-002": "cl100k_base",
+    # old embeddings
+    "text-similarity-davinci-001": "r50k_base",
+    "text-similarity-curie-001": "r50k_base",
+    "text-similarity-babbage-001": "r50k_base",
+    "text-similarity-ada-001": "r50k_base",
+    "text-search-davinci-doc-001": "r50k_base",
+    "text-search-curie-doc-001": "r50k_base",
+    "text-search-babbage-doc-001": "r50k_base",
+    "text-search-ada-doc-001": "r50k_base",
+    "code-search-babbage-code-001": "r50k_base",
+    "code-search-ada-code-001": "r50k_base",
+    # open source
+    "gpt2": "gpt2",
+}
+
+
+def encoding_for_model(model_name: str) -> Encoding:
+    try:
+        encoding_name = MODEL_TO_ENCODING[model_name]
+    except KeyError:
+        raise KeyError(
+            f"Could not automatically map {model_name} to a tokeniser. "
+            "Please use `tiktoken.get_encoding` to explicitly get the tokeniser you expect."
+        ) from None
+    return get_encoding(encoding_name)
diff --git a/tiktoken_ext/openai_public.py b/tiktoken_ext/openai_public.py
index b0ec6ab..a64db9f 100644
--- a/tiktoken_ext/openai_public.py
+++ b/tiktoken_ext/openai_public.py
@@ -47,6 +47,19 @@ def p50k_base():
     }
 
 
+def p50k_edit():
+    mergeable_ranks = load_tiktoken_bpe(
+        "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken"
+    )
+    special_tokens = {ENDOFTEXT: 50256, FIM_PREFIX: 50281, FIM_MIDDLE: 50282, FIM_SUFFIX: 50283}
+    return {
+        "name": "p50k_edit",
+        "pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
+        "mergeable_ranks": mergeable_ranks,
+        "special_tokens": special_tokens,
+    }
+
+
 def cl100k_base():
     mergeable_ranks = load_tiktoken_bpe(
         "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"
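Taken together, the user-facing additions in this patch can be exercised as follows. This is a usage sketch against the patched 0.2.0 code; fetching the encodings needs network access (or a warm cache), and `allowed_special` is the existing `Encoding.encode` keyword for opting in to special-token encoding:

```python
import tiktoken

# New in this release: map an OpenAI model name to its tokeniser.
enc = tiktoken.encoding_for_model("text-davinci-003")
assert enc.name == "p50k_base"

# Unknown model names raise a KeyError rather than guessing an encoding.
try:
    tiktoken.encoding_for_model("not-a-real-model")
except KeyError as e:
    print(e)

# The new p50k_edit encoding registers the FIM special tokens.
edit_enc = tiktoken.get_encoding("p50k_edit")
assert edit_enc.encode("<|fim_prefix|>", allowed_special="all") == [50281]
```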