From 3e8620030c68d2fd6d4ec6d38426e7a1983661f5 Mon Sep 17 00:00:00 2001 From: Shantanu Jain Date: Sun, 12 Mar 2023 22:01:24 -0700 Subject: [PATCH] Bump version, sync codebase --- CHANGELOG.md | 8 ++++++++ Cargo.toml | 2 +- pyproject.toml | 5 +++-- src/lib.rs | 27 +++++++++------------------ tests/test_simple_public.py | 12 ++++++++++++ tiktoken/load.py | 13 ++++++++++++- 6 files changed, 45 insertions(+), 22 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2790421..a7dce9d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,12 @@ This is the changelog for the open source version of tiktoken. +## [v0.3.1] +- Build aarch64 wheels +- Make `blobfile` an optional dependency + +Thank you to @messense for the environment variable that makes cargo not OOM under emulation! + ## [v0.3.0] - Improve performance by 5-20%; thank you to @nistath! - Add `gpt-3.5-turbo` models to `encoding_for_model` @@ -14,6 +20,8 @@ This is the changelog for the open source version of tiktoken. - Add ``tiktoken.encoding_for_model`` to get the encoding for a specific model - Improve portability of caching logic +Thank you to @fritzo, @arvid220u, @khanhvu207, @henriktorget for various small corrections + ## [v0.1.2] - Avoid use of `blobfile` for public files - Add support for Python 3.8 diff --git a/Cargo.toml b/Cargo.toml index 40a72b9..912af00 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "tiktoken" -version = "0.3.0" +version = "0.3.1" edition = "2021" rust-version = "1.57.0" diff --git a/pyproject.toml b/pyproject.toml index 791e3c7..1834ef1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,11 +1,12 @@ [project] name = "tiktoken" -version = "0.3.0" +version = "0.3.1" description = "tiktoken is a fast BPE tokeniser for use with OpenAI's models" readme = "README.md" license = {file = "LICENSE"} authors = [{name = "Shantanu Jain"}, {email = "shantanu@openai.com"}] -dependencies = ["blobfile>=2", "regex>=2022.1.18", "requests>=2.26.0"] +dependencies = ["regex>=2022.1.18", "requests>=2.26.0"] +optional-dependencies = {blobfile = ["blobfile>=2"]} requires-python = ">=3.8" [project.urls] diff --git a/src/lib.rs b/src/lib.rs index b44d9c8..f391005 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -21,32 +21,23 @@ fn _byte_pair_merge( // The rank of the last item in the vector is not a valid value. let mut parts: Vec<(usize, usize)> = (0..piece.len() + 1).map(|i| (i, usize::MAX)).collect(); - // NOTE: using a macro here because a closure fails to get inlined - // according to optimization remarks. - // A closure also cannot capture a reference to `piece` without - // the borrow checker complaining about the mutable borrows during - // the assignments later in this code. - macro_rules! get_rank { - ($start_idx:expr, $skip:expr) => {{ - let start_idx: usize = $start_idx; - let skip: usize = $skip; + let get_rank = { + #[inline(always)] + |parts: &Vec<(usize, usize)>, start_idx: usize, skip: usize| { if (start_idx + skip + 2) < parts.len() { ranks .get(&piece[parts[start_idx].0..parts[start_idx + skip + 2].0]) - .map(|r| *r) + .copied() } else { None } - }}; - ($idx:expr) => {{ - get_rank!($idx, 0) - }}; - } + } + }; // We look up the ranks once in the beggining and iteratively update // them during each merge, which reduces the number of rank lookups. for i in 0..parts.len() - 2 { - match get_rank!(i) { + match get_rank(&parts, i, 0) { Some(rank) => { // usize::MAX is a sentinel value and cannot be a valid rank debug_assert!(rank != usize::MAX); @@ -89,9 +80,9 @@ fn _byte_pair_merge( // parts[i] and parts[i-1] before removing, which could thrash // the cache. Thus, we update the rank calculation by skipping over // parts[i + 1], by invoking `get_rank!` with `skip = 1`. - parts[i].1 = get_rank!(i, 1).unwrap_or(usize::MAX); + parts[i].1 = get_rank(&parts, i, 1).unwrap_or(usize::MAX); if i > 0 { - parts[i - 1].1 = get_rank!(i - 1, 1).unwrap_or(usize::MAX); + parts[i - 1].1 = get_rank(&parts, i - 1, 1).unwrap_or(usize::MAX); } parts.remove(i + 1); diff --git a/tests/test_simple_public.py b/tests/test_simple_public.py index a7d70b5..8458c12 100644 --- a/tests/test_simple_public.py +++ b/tests/test_simple_public.py @@ -1,3 +1,6 @@ +import subprocess +import sys + import tiktoken @@ -28,3 +31,12 @@ def test_encoding_for_model(): assert enc.name == "p50k_edit" enc = tiktoken.encoding_for_model("gpt-3.5-turbo-0301") assert enc.name == "cl100k_base" + + +def test_optional_blobfile_dependency(): + prog = """ +import tiktoken +import sys +assert "blobfile" not in sys.modules +""" + subprocess.check_call([sys.executable, "-c", prog]) diff --git a/tiktoken/load.py b/tiktoken/load.py index c588106..4a49ae4 100644 --- a/tiktoken/load.py +++ b/tiktoken/load.py @@ -7,12 +7,17 @@ import os import tempfile import uuid -import blobfile import requests def read_file(blobpath: str) -> bytes: if not blobpath.startswith("http://") and not blobpath.startswith("https://"): + try: + import blobfile + except ImportError: + raise ImportError( + "blobfile is not installed. Please install it by running `pip install blobfile`." + ) with blobfile.BlobFile(blobpath, "rb") as f: return f.read() # avoiding blobfile for public files helps avoid auth issues, like MFA prompts @@ -93,6 +98,12 @@ def data_gym_to_mergeable_bpe_ranks( def dump_tiktoken_bpe(bpe_ranks: dict[bytes, int], tiktoken_bpe_file: str) -> None: + try: + import blobfile + except ImportError: + raise ImportError( + "blobfile is not installed. Please install it by running `pip install blobfile`." + ) with blobfile.BlobFile(tiktoken_bpe_file, "wb") as f: for token, rank in sorted(bpe_ranks.items(), key=lambda x: x[1]): f.write(base64.b64encode(token) + b" " + str(rank).encode() + b"\n")