Bump version, sync codebase
This commit is contained in:
parent
f5fbc9c5e9
commit
ec7c121e38
@ -2,6 +2,14 @@
|
|||||||
|
|
||||||
This is the changelog for the open source version of tiktoken.
|
This is the changelog for the open source version of tiktoken.
|
||||||
|
|
||||||
|
## [v0.3.0]
|
||||||
|
- Improve performance by 5-20%; thank you to @nistath!
|
||||||
|
- Add `gpt-3.5-turbo` models to `encoding_for_model`
|
||||||
|
- Add prefix matching to `encoding_for_model` to better support future model versions
|
||||||
|
- Fix a bug in the README instructions on extending tiktoken
|
||||||
|
- Update the set of available encodings
|
||||||
|
- Add packaging metadata
|
||||||
|
|
||||||
## [v0.2.0]
|
## [v0.2.0]
|
||||||
- Add ``tiktoken.encoding_for_model`` to get the encoding for a specific model
|
- Add ``tiktoken.encoding_for_model`` to get the encoding for a specific model
|
||||||
- Improve portability of caching logic
|
- Improve portability of caching logic
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "tiktoken"
|
name = "tiktoken"
|
||||||
version = "0.2.0"
|
version = "0.3.0"
|
||||||
edition = "2021"
|
edition = "2021"
|
||||||
rust-version = "1.57.0"
|
rust-version = "1.57.0"
|
||||||
|
|
||||||
|
@ -30,7 +30,7 @@ Example code using `tiktoken` can be found in the
|
|||||||

|

|
||||||
|
|
||||||
Performance measured on 1GB of text using the GPT-2 tokeniser, using `GPT2TokenizerFast` from
|
Performance measured on 1GB of text using the GPT-2 tokeniser, using `GPT2TokenizerFast` from
|
||||||
`tokenizers==0.13.2` and `transformers==4.24.0`.
|
`tokenizers==0.13.2`, `transformers==4.24.0` and `tiktoken==0.2.0`.
|
||||||
|
|
||||||
|
|
||||||
## Getting help
|
## Getting help
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
[project]
|
[project]
|
||||||
name = "tiktoken"
|
name = "tiktoken"
|
||||||
version = "0.2.0"
|
version = "0.3.0"
|
||||||
description = "tiktoken is a fast BPE tokeniser for use with OpenAI's models"
|
description = "tiktoken is a fast BPE tokeniser for use with OpenAI's models"
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
license = {file = "LICENSE"}
|
license = {file = "LICENSE"}
|
||||||
|
@ -26,3 +26,5 @@ def test_encoding_for_model():
|
|||||||
assert enc.name == "p50k_base"
|
assert enc.name == "p50k_base"
|
||||||
enc = tiktoken.encoding_for_model("text-davinci-edit-001")
|
enc = tiktoken.encoding_for_model("text-davinci-edit-001")
|
||||||
assert enc.name == "p50k_edit"
|
assert enc.name == "p50k_edit"
|
||||||
|
enc = tiktoken.encoding_for_model("gpt-3.5-turbo-0301")
|
||||||
|
assert enc.name == "cl100k_base"
|
||||||
|
@ -3,8 +3,15 @@ from __future__ import annotations
|
|||||||
from .core import Encoding
|
from .core import Encoding
|
||||||
from .registry import get_encoding
|
from .registry import get_encoding
|
||||||
|
|
||||||
# TODO: this will likely be replaced by an API endpoint
|
# TODO: these will likely be replaced by an API endpoint
|
||||||
|
MODEL_PREFIX_TO_ENCODING: dict[str, str] = {
|
||||||
|
# chat
|
||||||
|
"gpt-3.5-turbo-": "cl100k_base" # e.g., gpt-3.5-turbo-0301, -0401, etc.
|
||||||
|
}
|
||||||
|
|
||||||
MODEL_TO_ENCODING: dict[str, str] = {
|
MODEL_TO_ENCODING: dict[str, str] = {
|
||||||
|
# chat
|
||||||
|
"gpt-3.5-turbo": "cl100k_base",
|
||||||
# text
|
# text
|
||||||
"text-davinci-003": "p50k_base",
|
"text-davinci-003": "p50k_base",
|
||||||
"text-davinci-002": "p50k_base",
|
"text-davinci-002": "p50k_base",
|
||||||
@ -45,11 +52,22 @@ MODEL_TO_ENCODING: dict[str, str] = {
|
|||||||
|
|
||||||
|
|
||||||
def encoding_for_model(model_name: str) -> Encoding:
|
def encoding_for_model(model_name: str) -> Encoding:
|
||||||
try:
|
"""Returns the encoding used by a model."""
|
||||||
|
encoding_name = None
|
||||||
|
if model_name in MODEL_TO_ENCODING:
|
||||||
encoding_name = MODEL_TO_ENCODING[model_name]
|
encoding_name = MODEL_TO_ENCODING[model_name]
|
||||||
except KeyError:
|
else:
|
||||||
|
# Check if the model matches a known prefix
|
||||||
|
# Prefix matching avoids needing library updates for every model version release
|
||||||
|
# Note that this can match on non-existent models (e.g., gpt-3.5-turbo-FAKE)
|
||||||
|
for model_prefix, model_encoding_name in MODEL_PREFIX_TO_ENCODING.items():
|
||||||
|
if model_name.startswith(model_prefix):
|
||||||
|
return get_encoding(model_encoding_name)
|
||||||
|
|
||||||
|
if encoding_name is None:
|
||||||
raise KeyError(
|
raise KeyError(
|
||||||
f"Could not automatically map {model_name} to a tokeniser. "
|
f"Could not automatically map {model_name} to a tokeniser. "
|
||||||
"Please use `tiktoken.get_encoding` to explicitly get the tokeniser you expect."
|
"Please use `tiktoken.get_encoding` to explicitly get the tokeniser you expect."
|
||||||
) from None
|
) from None
|
||||||
|
|
||||||
return get_encoding(encoding_name)
|
return get_encoding(encoding_name)
|
||||||
|
@ -83,6 +83,6 @@ ENCODING_CONSTRUCTORS = {
|
|||||||
"gpt2": gpt2,
|
"gpt2": gpt2,
|
||||||
"r50k_base": r50k_base,
|
"r50k_base": r50k_base,
|
||||||
"p50k_base": p50k_base,
|
"p50k_base": p50k_base,
|
||||||
"cl100k_base": cl100k_base,
|
|
||||||
"p50k_edit": p50k_edit,
|
"p50k_edit": p50k_edit,
|
||||||
|
"cl100k_base": cl100k_base,
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user