Bump version, sync codebase

parent b2e85f1423
commit 3e8620030c
CHANGELOG.md
@@ -2,6 +2,12 @@
 
 This is the changelog for the open source version of tiktoken.
 
+## [v0.3.1]
+- Build aarch64 wheels
+- Make `blobfile` an optional dependency
+
+Thank you to @messense for the environment variable that makes cargo not OOM under emulation!
+
 ## [v0.3.0]
 - Improve performance by 5-20%; thank you to @nistath!
 - Add `gpt-3.5-turbo` models to `encoding_for_model`
@@ -14,6 +20,8 @@ This is the changelog for the open source version of tiktoken.
 - Add ``tiktoken.encoding_for_model`` to get the encoding for a specific model
 - Improve portability of caching logic
 
+Thank you to @fritzo, @arvid220u, @khanhvu207, @henriktorget for various small corrections
+
 ## [v0.1.2]
 - Avoid use of `blobfile` for public files
 - Add support for Python 3.8
Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tiktoken"
-version = "0.3.0"
+version = "0.3.1"
 edition = "2021"
 rust-version = "1.57.0"
 
pyproject.toml
@@ -1,11 +1,12 @@
 [project]
 name = "tiktoken"
-version = "0.3.0"
+version = "0.3.1"
 description = "tiktoken is a fast BPE tokeniser for use with OpenAI's models"
 readme = "README.md"
 license = {file = "LICENSE"}
 authors = [{name = "Shantanu Jain"}, {email = "shantanu@openai.com"}]
-dependencies = ["blobfile>=2", "regex>=2022.1.18", "requests>=2.26.0"]
+dependencies = ["regex>=2022.1.18", "requests>=2.26.0"]
+optional-dependencies = {blobfile = ["blobfile>=2"]}
 requires-python = ">=3.8"
 
 [project.urls]
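
Since `blobfile` is now only an extra, a plain `pip install tiktoken` no longer installs it; users who read vocabulary files from non-HTTP(S) locations need `pip install "tiktoken[blobfile]"`. Below is a minimal sketch (not part of this commit) of how calling code could check for the extra up front; the error message is illustrative:

    # Sketch only: detect whether the optional blobfile extra is available
    # before attempting to read non-HTTP(S) vocabulary paths.
    import importlib.util

    if importlib.util.find_spec("blobfile") is None:
        raise RuntimeError(
            'blobfile is not installed; run `pip install "tiktoken[blobfile]"` '
            "to read non-HTTP(S) vocabulary files"
        )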
src/lib.rs (27 lines changed)
@@ -21,32 +21,23 @@ fn _byte_pair_merge<T>(
     // The rank of the last item in the vector is not a valid value.
     let mut parts: Vec<(usize, usize)> = (0..piece.len() + 1).map(|i| (i, usize::MAX)).collect();
 
-    // NOTE: using a macro here because a closure fails to get inlined
-    // according to optimization remarks.
-    // A closure also cannot capture a reference to `piece` without
-    // the borrow checker complaining about the mutable borrows during
-    // the assignments later in this code.
-    macro_rules! get_rank {
-        ($start_idx:expr, $skip:expr) => {{
-            let start_idx: usize = $start_idx;
-            let skip: usize = $skip;
+    let get_rank = {
+        #[inline(always)]
+        |parts: &Vec<(usize, usize)>, start_idx: usize, skip: usize| {
             if (start_idx + skip + 2) < parts.len() {
                 ranks
                     .get(&piece[parts[start_idx].0..parts[start_idx + skip + 2].0])
-                    .map(|r| *r)
+                    .copied()
             } else {
                 None
             }
-        }};
-        ($idx:expr) => {{
-            get_rank!($idx, 0)
-        }};
-    }
+        }
+    };
 
     // We look up the ranks once in the beginning and iteratively update
     // them during each merge, which reduces the number of rank lookups.
     for i in 0..parts.len() - 2 {
-        match get_rank!(i) {
+        match get_rank(&parts, i, 0) {
             Some(rank) => {
                 // usize::MAX is a sentinel value and cannot be a valid rank
                 debug_assert!(rank != usize::MAX);
@@ -89,9 +80,9 @@ fn _byte_pair_merge<T>(
             // parts[i] and parts[i-1] before removing, which could thrash
             // the cache. Thus, we update the rank calculation by skipping over
             // parts[i + 1], by invoking `get_rank!` with `skip = 1`.
-            parts[i].1 = get_rank!(i, 1).unwrap_or(usize::MAX);
+            parts[i].1 = get_rank(&parts, i, 1).unwrap_or(usize::MAX);
             if i > 0 {
-                parts[i - 1].1 = get_rank!(i - 1, 1).unwrap_or(usize::MAX);
+                parts[i - 1].1 = get_rank(&parts, i - 1, 1).unwrap_or(usize::MAX);
             }
 
             parts.remove(i + 1);
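
For readers who do not follow the Rust, here is a small Python sketch of the rank-caching merge that these hunks modify. It is an illustration written for this note, not tiktoken's implementation; it only assumes that `ranks` maps byte strings to merge priorities (lower is better):

    from __future__ import annotations

    def byte_pair_merge(piece: bytes, ranks: dict[bytes, int]) -> list[bytes]:
        # Each boundary caches the rank of the pair that starts there; only the
        # two neighbouring ranks are recomputed after a merge (the skip = 1 case).
        SENTINEL = float("inf")  # plays the role of usize::MAX in the Rust code
        parts = [[i, SENTINEL] for i in range(len(piece) + 1)]

        def get_rank(start_idx: int, skip: int = 0):
            if start_idx + skip + 2 < len(parts):
                return ranks.get(piece[parts[start_idx][0] : parts[start_idx + skip + 2][0]])
            return None

        # Look the ranks up once at the beginning ...
        for i in range(len(parts) - 2):
            rank = get_rank(i)
            if rank is not None:
                parts[i][1] = rank

        # ... then update only the affected entries after each merge.
        while len(parts) > 1:
            i = min(range(len(parts) - 1), key=lambda k: parts[k][1])
            if parts[i][1] == SENTINEL:
                break  # no mergeable pair left
            rank = get_rank(i, 1)
            parts[i][1] = rank if rank is not None else SENTINEL
            if i > 0:
                rank = get_rank(i - 1, 1)
                parts[i - 1][1] = rank if rank is not None else SENTINEL
            parts.pop(i + 1)

        return [piece[parts[j][0] : parts[j + 1][0]] for j in range(len(parts) - 1)]

For example, `byte_pair_merge(b"abab", {b"ab": 0, b"abab": 1})` returns `[b"abab"]`.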
@@ -1,3 +1,6 @@
+import subprocess
+import sys
+
 import tiktoken
 
 
@@ -28,3 +31,12 @@ def test_encoding_for_model():
     assert enc.name == "p50k_edit"
     enc = tiktoken.encoding_for_model("gpt-3.5-turbo-0301")
     assert enc.name == "cl100k_base"
+
+
+def test_optional_blobfile_dependency():
+    prog = """
+import tiktoken
+import sys
+assert "blobfile" not in sys.modules
+"""
+    subprocess.check_call([sys.executable, "-c", prog])
tiktoken/load.py
@@ -7,12 +7,17 @@ import os
 import tempfile
 import uuid
 
-import blobfile
 import requests
 
 
 def read_file(blobpath: str) -> bytes:
     if not blobpath.startswith("http://") and not blobpath.startswith("https://"):
+        try:
+            import blobfile
+        except ImportError:
+            raise ImportError(
+                "blobfile is not installed. Please install it by running `pip install blobfile`."
+            )
         with blobfile.BlobFile(blobpath, "rb") as f:
             return f.read()
     # avoiding blobfile for public files helps avoid auth issues, like MFA prompts
@@ -93,6 +98,12 @@ def data_gym_to_mergeable_bpe_ranks(
 
 
 def dump_tiktoken_bpe(bpe_ranks: dict[bytes, int], tiktoken_bpe_file: str) -> None:
+    try:
+        import blobfile
+    except ImportError:
+        raise ImportError(
+            "blobfile is not installed. Please install it by running `pip install blobfile`."
+        )
     with blobfile.BlobFile(tiktoken_bpe_file, "wb") as f:
         for token, rank in sorted(bpe_ranks.items(), key=lambda x: x[1]):
            f.write(base64.b64encode(token) + b" " + str(rank).encode() + b"\n")
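
A short usage sketch (not from this commit) of the practical effect: the encodings tiktoken ships fetch their vocabularies over https://, so they keep working without `blobfile`, while other paths now raise the `ImportError` above unless the extra is installed:

    # Sketch: public vocabularies are fetched over https:// and never touch blobfile.
    import tiktoken

    enc = tiktoken.get_encoding("cl100k_base")
    print(enc.encode("hello world"))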