Bump version, sync codebase

This commit is contained in:
Shantanu Jain 2023-03-16 18:11:50 -07:00
parent 3e8620030c
commit 446cb49aff
5 changed files with 10 additions and 5 deletions

View File

@ -2,6 +2,9 @@
This is the changelog for the open source version of tiktoken. This is the changelog for the open source version of tiktoken.
## [v0.3.2]
- Add encoding for GPT-4
## [v0.3.1] ## [v0.3.1]
- Build aarch64 wheels - Build aarch64 wheels
- Make `blobfile` an optional dependency - Make `blobfile` an optional dependency

View File

@ -1,6 +1,6 @@
[package] [package]
name = "tiktoken" name = "tiktoken"
version = "0.3.1" version = "0.3.2"
edition = "2021" edition = "2021"
rust-version = "1.57.0" rust-version = "1.57.0"

View File

@ -5,11 +5,11 @@ OpenAI's models.
```python ```python
import tiktoken import tiktoken
enc = tiktoken.get_encoding("gpt2") enc = tiktoken.get_encoding("cl100k_base")
assert enc.decode(enc.encode("hello world")) == "hello world" assert enc.decode(enc.encode("hello world")) == "hello world"
# To get the tokeniser corresponding to a specific model in the OpenAI API: # To get the tokeniser corresponding to a specific model in the OpenAI API:
enc = tiktoken.encoding_for_model("text-davinci-003") enc = tiktoken.encoding_for_model("gpt-4")
``` ```
The open source version of `tiktoken` can be installed from PyPI: The open source version of `tiktoken` can be installed from PyPI:

View File

@ -34,7 +34,7 @@ fn _byte_pair_merge<T>(
} }
}; };
// We look up the ranks once in the beggining and iteratively update // We look up the ranks once in the beginning and iteratively update
// them during each merge, which reduces the number of rank lookups. // them during each merge, which reduces the number of rank lookups.
for i in 0..parts.len() - 2 { for i in 0..parts.len() - 2 {
match get_rank(&parts, i, 0) { match get_rank(&parts, i, 0) {

View File

@ -6,11 +6,13 @@ from .registry import get_encoding
# TODO: these will likely be replaced by an API endpoint # TODO: these will likely be replaced by an API endpoint
MODEL_PREFIX_TO_ENCODING: dict[str, str] = { MODEL_PREFIX_TO_ENCODING: dict[str, str] = {
# chat # chat
"gpt-3.5-turbo-": "cl100k_base" # e.g, gpt-3.5-turbo-0301, -0401, etc. "gpt-4-": "cl100k_base", # e.g., gpt-4-0314, etc., plus gpt-4-32k
"gpt-3.5-turbo-": "cl100k_base", # e.g, gpt-3.5-turbo-0301, -0401, etc.
} }
MODEL_TO_ENCODING: dict[str, str] = { MODEL_TO_ENCODING: dict[str, str] = {
# chat # chat
"gpt-4": "cl100k_base",
"gpt-3.5-turbo": "cl100k_base", "gpt-3.5-turbo": "cl100k_base",
# text # text
"text-davinci-003": "p50k_base", "text-davinci-003": "p50k_base",