Bump version, sync codebase

2023-02-03 11:46:09 -08:00 · 2023-02-03 11:46:09 -08:00 · 7830ed537b
commit 7830ed537b
parent 156eff92d2
12 changed files with 175 additions and 60 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -2,6 +2,10 @@

 This is the changelog for the open source version of tiktoken.

+## [v0.2.0]
+- Add ``tiktoken.encoding_for_model`` to get the encoding for a specific model
+- Improve portability of caching logic
+
 ## [v0.1.2]
 - Avoid use of `blobfile` for public files
 - Add support for Python 3.8
--- a/Cargo.toml
+++ b/Cargo.toml
@ -1,6 +1,6 @@
 [package]
 name = "tiktoken"
-version = "0.1.0"
+version = "0.2.0"
 edition = "2021"
 rust-version = "1.57.0"

--- a/Makefile
+++ b/Makefile
@ -1,49 +0,0 @@
-PROJECT := tiktoken
-
-.PHONY: default
-default: editable_install
-
-.PHONY: install_rust
-install_rust:
-	which cargo >/dev/null || curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain 1.62
-
-.PHONY: clean
-clean:
-	cargo clean
-	pip uninstall -y $(PROJECT)
-	find . | grep -E '__pycache__|\.pyc' | xargs rm -rf
-	find . | grep -E '\.so' | xargs rm -rf
-	rm -rf dist/ build/
-	rm -rf $(PROJECT).egg-info/
-
-.PHONY: format
-format:
-	@ which black >/dev/null || python3 -m pip install black
-	@ which isort >/dev/null || python3 -m pip install isort
-	cargo fmt -- --config group_imports=StdExternalCrate
-	black --line-length 100 --skip-magic-trailing-comma --quiet .
-	isort --line-length 100 --profile black --quiet .
-
-
-.PHONY: format_check
-format_check:
-	@ which black >/dev/null || python3 -m pip install black
-	@ which isort >/dev/null || python3 -m pip install isort
-	cargo fmt --check -- --config group_imports=StdExternalCrate
-	black --check --line-length 100 --skip-magic-trailing-comma --quiet .
-	isort --check --line-length 100 --profile black --quiet .
-
-.PHONY: lint
-lint:
-	cargo clippy --all -- -D warnings
-	@ which flake8 >/dev/null || python3 -m pip install flake8==5 flake8-bugbear==22.9.11
-	flake8 --ignore=E203,E501,W503,E731 --per-file-ignores="$(PROJECT)/__init__.py:F401 setup.py:E402" --exclude=build .
-
-.PHONY: editable_install
-editable_install:
-	@ if [ -f $(PROJECT).egg-info ]; then \
-		pip install --disable-pip-version-check --progress-bar=off setuptools wheel setuptools-rust ; \
-		pip install --disable-pip-version-check --no-build-isolation -e . ; \
-	else \
-		pip install --disable-pip-version-check --no-deps --no-build-isolation --ignore-installed -e . ; \
-	fi
--- a/README.md
+++ b/README.md
@ -7,6 +7,9 @@ OpenAI's models.
 import tiktoken
 enc = tiktoken.get_encoding("gpt2")
 assert enc.decode(enc.encode("hello world")) == "hello world"
+
+# To get the tokeniser corresponding to a specific model in the OpenAI API:
+enc = tiktoken.encoding_for_model("text-davinci-003")
 ```

 The open source version of `tiktoken` can be installed from PyPI:
@ -16,7 +19,9 @@ pip install tiktoken

 The tokeniser API is documented in `tiktoken/core.py`.

-Example code using `tiktoken` can be found in the [OpenAI Cookbook](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb).
+Example code using `tiktoken` can be found in the
+[OpenAI Cookbook](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb).
+

 ## Performance

@ -28,3 +33,72 @@ Performance measured on 1GB of text using the GPT-2 tokeniser, using `GPT2Tokeni
 `tokenizers==0.13.2` and `transformers==4.24.0`.


+## Getting help
+
+Please post questions in the [issue tracker](https://github.com/openai/tiktoken/issues).
+
+If you work at OpenAI, make sure to check the internal documentation or feel free to contact
+@shantanu.
+
+
+## Extending tiktoken
+
+You may wish to extend `tiktoken` to support new encodings. There are two ways to do this.
+
+
+**Create your `Encoding` object exactly the way you want and simply pass it around.**
+
+```python
+cl100k_base = tiktoken.get_encoding("cl100k_base")
+
+# In production, load the arguments directly instead of accessing private attributes
+# See openai_public.py for examples of arguments for specific encodings
+enc = tiktoken.Encoding(
+    # If you're changing the set of special tokens, make sure to use a different name
+    # It should be clear from the name what behaviour to expect.
+    name="cl100k_im",
+    pat_str=cl100k_base._pat_str,
+    mergeable_ranks=cl100k_base._mergeable_ranks,
+    special_tokens={
+        **cl100k_base._special_tokens,
+        "<|im_start|>": 100264,
+        "<|im_end|>": 100265,
+    }
+)
+```
+
+**Use the `tiktoken_ext` plugin mechanism to register your `Encoding` objects with `tiktoken`.**
+
+This is only useful if you need `tiktoken.get_encoding` to find your encoding; otherwise, prefer
+the first approach.
+
+To do this, you'll need to create a namespace package under `tiktoken_ext`.
+
+Lay out your project like this, making sure to omit the `tiktoken_ext/__init__.py` file:
+```
+my_tiktoken_extension
+├── tiktoken_ext
+│   └── my_encodings.py
+└── setup.py
+```
+
+`my_encodings.py` should be a module that contains a variable named `ENCODING_CONSTRUCTORS`.
+This is a dictionary from an encoding name to a function that takes no arguments and returns
+arguments that can be passed to `tiktoken.Encoding` to construct that encoding. For an example, see
+`tiktoken_ext/openai_public.py`. For precise details, see `tiktoken/registry.py`.
+
+Your `setup.py` should look something like this:
+```python
+from setuptools import setup, find_namespace_packages
+
+setup(
+    name="my_tiktoken_extension",
+    packages=find_namespace_packages(include=['tiktoken_ext*']),
+    install_requires=["tiktoken"],
+    ...
+)
+```
+
+Then simply `pip install my_tiktoken_extension` and you should be able to use your custom encodings!
+Make sure **not** to use an editable install.
+
--- a/pyproject.toml
+++ b/pyproject.toml
@ -1,12 +1,12 @@
 [project]
 name = "tiktoken"
+version = "0.2.0"
 dependencies = ["blobfile>=2", "regex>=2022.1.18", "requests>=2.26.0"]
-dynamic = ["version"]
 requires-python = ">=3.8"

 [build-system]
 build-backend = "setuptools.build_meta"
-requires = ["setuptools>=61", "wheel", "setuptools-rust>=1.3"]
+requires = ["setuptools>=62.4", "wheel", "setuptools-rust>=1.5.2"]

 [tool.cibuildwheel]
 build-frontend = "build"
--- a/setup.py
+++ b/setup.py
@ -1,14 +1,8 @@
 from setuptools import setup
 from setuptools_rust import Binding, RustExtension

-public = True
-
-if public:
-    version = "0.1.2"
-
 setup(
    name="tiktoken",
-    version=version,
    rust_extensions=[
        RustExtension(
            "tiktoken._tiktoken",
--- a/tests/test_simple_public.py
+++ b/tests/test_simple_public.py
@ -17,3 +17,10 @@ def test_simple():
        enc = tiktoken.get_encoding(enc_name)
        for token in range(10_000):
            assert enc.encode_single_token(enc.decode_single_token_bytes(token)) == token
+
+
+def test_encoding_for_model():
+    enc = tiktoken.encoding_for_model("gpt2")
+    assert enc.name == "gpt2"
+    enc = tiktoken.encoding_for_model("text-davinci-003")
+    assert enc.name == "p50k_base"
--- a/tiktoken/__init__.py
+++ b/tiktoken/__init__.py
@ -1,3 +1,4 @@
 from .core import Encoding as Encoding
+from .model import encoding_for_model as encoding_for_model
 from .registry import get_encoding as get_encoding
 from .registry import list_encoding_names as list_encoding_names
--- a/tiktoken/core.py
+++ b/tiktoken/core.py
@ -19,6 +19,21 @@ class Encoding:
        special_tokens: dict[str, int],
        explicit_n_vocab: Optional[int] = None,
    ):
+        """Creates an Encoding object.
+
+        See openai_public.py for examples of how to construct an Encoding object.
+
+        Args:
+            name: The name of the encoding. It should be clear from the name of the encoding
+                what behaviour to expect, in particular, encodings with different special tokens
+                should have different names.
+            pat_str: A regex pattern string that is used to split the input text.
+            mergeable_ranks: A dictionary mapping mergeable token bytes to their ranks. The ranks
+                must correspond to merge priority.
+            special_tokens: A dictionary mapping special token strings to their token values.
+            explicit_n_vocab: The number of tokens in the vocabulary. If provided, it is checked
+                that the number of mergeable tokens and special tokens is equal to this number.
+        """
        self.name = name

        self._pat_str = pat_str
--- a/tiktoken/load.py
+++ b/tiktoken/load.py
@ -4,6 +4,7 @@ import base64
 import hashlib
 import json
 import os
+import tempfile
 import uuid

 import blobfile
@ -24,7 +25,7 @@ def read_file_cached(blobpath: str) -> bytes:
    elif "DATA_GYM_CACHE_DIR" in os.environ:
        cache_dir = os.environ["DATA_GYM_CACHE_DIR"]
    else:
-        cache_dir = "/tmp/data-gym-cache"
+        cache_dir = os.path.join(tempfile.gettempdir(), "data-gym-cache")

    if cache_dir == "":
        # disable caching
--- a/tiktoken/model.py
+++ b/tiktoken/model.py
@ -0,0 +1,55 @@
+from __future__ import annotations
+
+from .core import Encoding
+from .registry import get_encoding
+
+# TODO: this will likely be replaced by an API endpoint
+MODEL_TO_ENCODING: dict[str, str] = {
+    # text
+    "text-davinci-003": "p50k_base",
+    "text-davinci-002": "p50k_base",
+    "text-davinci-001": "r50k_base",
+    "text-curie-001": "r50k_base",
+    "text-babbage-001": "r50k_base",
+    "text-ada-001": "r50k_base",
+    "davinci": "r50k_base",
+    "curie": "r50k_base",
+    "babbage": "r50k_base",
+    "ada": "r50k_base",
+    # code
+    "code-davinci-002": "p50k_base",
+    "code-davinci-001": "p50k_base",
+    "code-cushman-002": "p50k_base",
+    "code-cushman-001": "p50k_base",
+    "davinci-codex": "p50k_base",
+    "cushman-codex": "p50k_base",
+    # edit
+    "text-davinci-edit-001": "p50k_edit",
+    "code-davinci-edit-001": "p50k_edit",
+    # embeddings
+    "text-embedding-ada-002": "cl100k_base",
+    # old embeddings
+    "text-similarity-davinci-001": "r50k_base",
+    "text-similarity-curie-001": "r50k_base",
+    "text-similarity-babbage-001": "r50k_base",
+    "text-similarity-ada-001": "r50k_base",
+    "text-search-davinci-doc-001": "r50k_base",
+    "text-search-curie-doc-001": "r50k_base",
+    "text-search-babbage-doc-001": "r50k_base",
+    "text-search-ada-doc-001": "r50k_base",
+    "code-search-babbage-code-001": "r50k_base",
+    "code-search-ada-code-001": "r50k_base",
+    # open source
+    "gpt2": "gpt2",
+}
+
+
+def encoding_for_model(model_name: str) -> Encoding:
+    try:
+        encoding_name = MODEL_TO_ENCODING[model_name]
+    except KeyError:
+        raise KeyError(
+            f"Could not automatically map {model_name} to a tokeniser. "
+            "Please use `tiktoken.get_encoding` to explicitly get the tokeniser you expect."
+        ) from None
+    return get_encoding(encoding_name)
--- a/tiktoken_ext/openai_public.py
+++ b/tiktoken_ext/openai_public.py
@ -47,6 +47,19 @@ def p50k_base():
    }


+def p50k_edit():
+    mergeable_ranks = load_tiktoken_bpe(
+        "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken"
+    )
+    special_tokens = {ENDOFTEXT: 50256, FIM_PREFIX: 50281, FIM_MIDDLE: 50282, FIM_SUFFIX: 50283}
+    return {
+        "name": "p50k_edit",
+        "pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
+        "mergeable_ranks": mergeable_ranks,
+        "special_tokens": special_tokens,
+    }
+
+
 def cl100k_base():
    mergeable_ranks = load_tiktoken_bpe(
        "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"