From 85a4f9dbb0f01933dc78b05aafa991b79f2c5ba3 Mon Sep 17 00:00:00 2001 From: Gabriel Tofvesson Date: Wed, 22 Mar 2023 00:07:38 +0100 Subject: [PATCH] Remove python --- .github/workflows/build_wheels.yml | 83 ------- CHANGELOG.md | 35 --- Cargo.toml | 9 +- MANIFEST.in | 8 - perf.svg | 374 ----------------------------- pyproject.toml | 41 ---- scripts/benchmark.py | 39 --- scripts/redact.py | 67 ------ setup.py | 18 -- src/lib.rs | 308 ++++++++++-------------- tests/test_simple_public.py | 42 ---- tiktoken/__init__.py | 4 - tiktoken/core.py | 329 ------------------------- tiktoken/load.py | 118 --------- tiktoken/model.py | 75 ------ tiktoken/py.typed | 0 tiktoken/registry.py | 73 ------ tiktoken_ext/openai_public.py | 88 ------- 18 files changed, 135 insertions(+), 1576 deletions(-) delete mode 100644 .github/workflows/build_wheels.yml delete mode 100644 CHANGELOG.md delete mode 100644 MANIFEST.in delete mode 100644 perf.svg delete mode 100644 pyproject.toml delete mode 100644 scripts/benchmark.py delete mode 100644 scripts/redact.py delete mode 100644 setup.py delete mode 100644 tests/test_simple_public.py delete mode 100644 tiktoken/__init__.py delete mode 100644 tiktoken/core.py delete mode 100644 tiktoken/load.py delete mode 100644 tiktoken/model.py delete mode 100644 tiktoken/py.typed delete mode 100644 tiktoken/registry.py delete mode 100644 tiktoken_ext/openai_public.py diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml deleted file mode 100644 index cd0cddf..0000000 --- a/.github/workflows/build_wheels.yml +++ /dev/null @@ -1,83 +0,0 @@ -name: Build wheels - -on: [push, pull_request, workflow_dispatch] - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - -jobs: - build_wheels: - name: py${{ matrix.python-version }} on ${{ matrix.os }} - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - # cibuildwheel builds linux wheels inside a manylinux container - # it also takes care of procuring the correct python version for us - os: [ubuntu-latest, windows-latest, macos-latest] - python-version: [38, 39, 310, 311] - - steps: - - uses: actions/checkout@v3 - - - uses: pypa/cibuildwheel@v2.11.3 - env: - CIBW_BUILD: "cp${{ matrix.python-version}}-*" - - - uses: actions/upload-artifact@v3 - with: - name: dist - path: ./wheelhouse/*.whl - - build_wheels_aarch64: - name: py${{ matrix.python-version }} on ${{ matrix.os }} (aarch64) - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - os: [ubuntu-latest] - python-version: [38, 39, 310, 311] - - steps: - - uses: actions/checkout@v3 - - - name: Setup up QEMU - uses: docker/setup-qemu-action@v2 - with: - platforms: arm64 - - - name: Build wheels - uses: pypa/cibuildwheel@v2.11.3 - env: - CIBW_BUILD: "cp${{ matrix.python-version}}-*" - CIBW_ARCHS: aarch64 - CIBW_BUILD_VERBOSITY: 3 - # https://github.com/rust-lang/cargo/issues/10583 - CIBW_ENVIRONMENT_LINUX: PATH="$PATH:$HOME/.cargo/bin" CARGO_NET_GIT_FETCH_WITH_CLI=true - - uses: actions/upload-artifact@v3 - with: - name: dist - path: ./wheelhouse/*.whl - - build_sdist: - name: sdist - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - uses: actions/setup-python@v4 - name: Install Python - with: - python-version: "3.9" - - name: Run check-manifest - run: | - pip install check-manifest - check-manifest -v - - name: Build sdist - run: | - pip install --upgrade build - python -m build --sdist - - uses: actions/upload-artifact@v3 - with: - name: dist - path: 
./dist/*.tar.gz
diff --git a/CHANGELOG.md b/CHANGELOG.md
deleted file mode 100644
index d0365b8..0000000
--- a/CHANGELOG.md
+++ /dev/null
@@ -1,35 +0,0 @@
-# Changelog
-
-This is the changelog for the open source version of tiktoken.
-
-## [v0.3.2]
-- Add encoding for GPT-4
-
-## [v0.3.1]
-- Build aarch64 wheels
-- Make `blobfile` an optional dependency
-
-Thank you to @messense for the environment variable that makes cargo not OOM under emulation!
-
-## [v0.3.0]
-- Improve performance by 5-20%; thank you to @nistath!
-- Add `gpt-3.5-turbo` models to `encoding_for_model`
-- Add prefix matching to `encoding_for_model` to better support future model versions
-- Fix a bug in the README instructions on extending tiktoken
-- Update the set of available encodings
-- Add packaging metadata
-
-## [v0.2.0]
-- Add ``tiktoken.encoding_for_model`` to get the encoding for a specific model
-- Improve portability of caching logic
-
-Thank you to @fritzo, @arvid220u, @khanhvu207, @henriktorget for various small corrections
-
-## [v0.1.2]
-- Avoid use of `blobfile` for public files
-- Add support for Python 3.8
-- Add py.typed
-- Improve the public tests
-
-## [v0.1.1]
-- Initial release
diff --git a/Cargo.toml b/Cargo.toml
index 07182cd..fc3cddb 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -5,17 +5,14 @@ edition = "2021"
 rust-version = "1.57.0"
 
 [lib]
-name = "_tiktoken"
-crate-type = ["cdylib"]
+name = "tiktoken"
 
 [dependencies]
-pyo3 = { version = "0.17.3", features = ["extension-module"] }
-
-# tiktoken dependencies
-fancy-regex = "0.10.0"
+fancy-regex = "0.11.0"
 regex = "1.7.0"
 rustc-hash = "1.1.0"
 bstr = "1.0.1"
+anyhow = "1.0.70"
 
 [profile.release]
 incremental = true
diff --git a/MANIFEST.in b/MANIFEST.in
deleted file mode 100644
index 7f25b27..0000000
--- a/MANIFEST.in
+++ /dev/null
@@ -1,8 +0,0 @@
-include *.svg
-include *.toml
-include *.md
-include Makefile
-global-include py.typed
-recursive-include scripts *.py
-recursive-include tests *.py
-recursive-include src *.rs
diff --git a/perf.svg b/perf.svg
deleted file mode 100644
index 723036c..0000000
--- a/perf.svg
+++ /dev/null
@@ -1,374 +0,0 @@
-[perf.svg: bar chart of throughput (0-40 MB/s) versus thread count (1, 2, 4, 8, 16, 32, 64) comparing tiktoken and huggingface; SVG markup not reproduced here]
diff --git a/pyproject.toml b/pyproject.toml
deleted file mode 100644
index 739d295..0000000
--- a/pyproject.toml
+++ /dev/null
@@ -1,41 +0,0 @@
-[project]
-name = "tiktoken"
-version = "0.3.2"
-description = "tiktoken is a fast BPE tokeniser for use with OpenAI's models"
-readme = "README.md"
-license = {file = "LICENSE"}
-authors = [{name = "Shantanu Jain"}, {email = "shantanu@openai.com"}]
-dependencies = ["regex>=2022.1.18", "requests>=2.26.0"]
-optional-dependencies = {blobfile = ["blobfile>=2"]}
-requires-python = ">=3.8"
-
-[project.urls]
-homepage = 
"https://github.com/openai/tiktoken" -repository = "https://github.com/openai/tiktoken" -changelog = "https://github.com/openai/tiktoken/blob/main/CHANGELOG.md" - -[build-system] -build-backend = "setuptools.build_meta" -requires = ["setuptools>=62.4", "wheel", "setuptools-rust>=1.5.2"] - -[tool.cibuildwheel] -build-frontend = "build" -build-verbosity = 1 - -linux.before-all = "curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y" -linux.environment = { PATH = "$PATH:$HOME/.cargo/bin" } -macos.before-all = "rustup target add aarch64-apple-darwin" - -skip = [ - "*-manylinux_i686", - "*-musllinux_i686", - "*-win32", -] -macos.archs = ["x86_64", "arm64"] -# When cross-compiling on Intel, it is not possible to test arm64 wheels. -# Warnings will be silenced with following CIBW_TEST_SKIP -test-skip = "*-macosx_arm64" - -before-test = "pip install pytest" -test-command = "pytest {project}/tests" - diff --git a/scripts/benchmark.py b/scripts/benchmark.py deleted file mode 100644 index 4d679fa..0000000 --- a/scripts/benchmark.py +++ /dev/null @@ -1,39 +0,0 @@ -import base64 -import functools -import gzip -import json -import os -import random -import time -from typing import Any, cast - -import blobfile - -import tiktoken - - -def benchmark_batch(documents: list[str]) -> None: - num_threads = int(os.environ["RAYON_NUM_THREADS"]) - num_bytes = sum(map(len, map(str.encode, documents))) - print(f"num_threads: {num_threads}, num_bytes: {num_bytes}") - - enc = tiktoken.get_encoding("gpt2") - enc.encode("warmup") - - start = time.perf_counter_ns() - enc.encode_ordinary_batch(documents, num_threads=num_threads) - end = time.perf_counter_ns() - print(f"tiktoken \t{num_bytes / (end - start) * 1e9} bytes / s") - - import transformers - - hf_enc = cast(Any, transformers).GPT2TokenizerFast.from_pretrained("gpt2") - hf_enc.model_max_length = 1e30 # silence! 
- hf_enc.encode("warmup") - - start = time.perf_counter_ns() - hf_enc(documents) - end = time.perf_counter_ns() - print(f"huggingface \t{num_bytes / (end - start) * 1e9} bytes / s") - - diff --git a/scripts/redact.py b/scripts/redact.py deleted file mode 100644 index d82db32..0000000 --- a/scripts/redact.py +++ /dev/null @@ -1,67 +0,0 @@ -import argparse -import re -import subprocess -from pathlib import Path - - -def redact_file(path: Path, dry_run: bool) -> None: - if not path.exists() or path.is_dir(): - return - - text = path.read_text() - if not text: - return - - first_line = text.splitlines()[0] - if "redact" in first_line: - if not dry_run: - path.unlink() - print(f"Deleted {path}") - return - - pattern = "|".join( - re.escape(x) - for x in [ - "# ===== redact-beg =====\n", - "# ===== redact-end =====\n", - "\n", - "\n", - ] - ) - - if re.search(pattern, text): - redacted_text = "".join(re.split(pattern, text)[::2]) - if not dry_run: - path.write_text(redacted_text) - print(f"Redacted {path}") - return - - print(f"Skipped {path}") - - -def redact(dry_run: bool) -> None: - tiktoken_root = Path(__file__).parent.parent - assert tiktoken_root.name == "tiktoken" - assert (tiktoken_root / "pyproject.toml").exists() - - try: - output = subprocess.check_output(["git", "ls-files"], cwd=tiktoken_root, text=True) - paths = [Path(p) for p in output.splitlines()] - except subprocess.CalledProcessError: - paths = list(tiktoken_root.glob("**/*")) - - for path in paths: - redact_file(path, dry_run=dry_run) - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--dry-run", type=lambda x: not x or x[0].lower() != "f", default=True) - args = parser.parse_args() - redact(args.dry_run) - if args.dry_run: - print("Dry run, use --dry-run=false to actually redact files") - - -if __name__ == "__main__": - main() diff --git a/setup.py b/setup.py deleted file mode 100644 index a22e8e5..0000000 --- a/setup.py +++ /dev/null @@ -1,18 +0,0 @@ -from setuptools import setup -from setuptools_rust import Binding, RustExtension - -setup( - name="tiktoken", - rust_extensions=[ - RustExtension( - "tiktoken._tiktoken", - binding=Binding.PyO3, - # Between our use of editable installs and wanting to use Rust for performance sensitive - # code, it makes sense to just always use --release - debug=False, - ) - ], - package_data={"tiktoken": ["py.typed"]}, - packages=["tiktoken", "tiktoken_ext"], - zip_safe=False, -) diff --git a/src/lib.rs b/src/lib.rs index 70009d2..d202a4b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -5,10 +5,6 @@ use std::collections::HashSet; use std::thread; use fancy_regex::Regex; -use pyo3::exceptions; -use pyo3::prelude::*; -use pyo3::types::{PyBytes, PyList, PyTuple}; -use pyo3::PyResult; use rustc_hash::FxHashMap as HashMap; fn _byte_pair_merge( @@ -169,7 +165,6 @@ fn hash_current_thread() -> usize { } const MAX_NUM_THREADS: usize = 128; -#[pyclass] struct CoreBPE { encoder: HashMap, usize>, special_tokens_encoder: HashMap, @@ -192,19 +187,96 @@ impl CoreBPE { &self.special_regex_tls[hash_current_thread() % MAX_NUM_THREADS] } - fn _decode_native(&self, tokens: &[usize]) -> Vec { - let mut ret = Vec::with_capacity(tokens.len() * 2); - for token in tokens { - let token_bytes = self - .decoder - .get(token) - .unwrap_or_else(|| &self.special_tokens_decoder[token]); - ret.extend(token_bytes); + fn _increase_last_piece_token_len( + &self, + tokens: Vec, + mut last_piece_token_len: usize, + ) -> (Vec, usize) { + // Unfortunately, the locations where our regex splits can be 
unstable. + // For the purposes of determining unstable tokens, unstable regex splitting + // is only a problem if a split that was present disappears, since this can + // lead to merging of tokens otherwise thought to be stable. + // cl100k_base makes our life hard by including the \s*[\r\n]+ + // pattern. This can e.g. cause "\n" + " " to become "\n \n". + // Here is a quick and dirty fix: + { + let token_is_all_space = |token| { + self.decoder + .get(token) + .map(|token_bytes| { + token_bytes + .iter() + .rev() + .all(|&b| [b' ', b'\n', b'\t'].contains(&b)) + }) + .unwrap_or(false) + }; + if last_piece_token_len > 0 + && token_is_all_space(&tokens[tokens.len() - last_piece_token_len]) + { + while (last_piece_token_len < tokens.len()) + && token_is_all_space(&tokens[tokens.len() - last_piece_token_len - 1]) + { + last_piece_token_len += 1; + } + } } - ret + debug_assert!(last_piece_token_len <= tokens.len()); + + (tokens, last_piece_token_len) + } +} + +impl CoreBPE { + pub fn new( + encoder: HashMap, usize>, + special_tokens_encoder: HashMap, + pattern: &str, + ) -> anyhow::Result { + let regex = Regex::new(pattern) + .map_err(|e| anyhow::anyhow!("Invalid regex: {}", e.to_string()))?; + + let special_regex = { + let _parts = special_tokens_encoder + .keys() + .map(|s| fancy_regex::escape(s)) + .collect::>(); + Regex::new(&_parts.join("|")) + .map_err(|e| anyhow::anyhow!("Invalid regex: {}", e.to_string()))? + }; + + let decoder: HashMap> = + encoder.iter().map(|(k, v)| (*v, k.clone())).collect(); + + assert!(encoder.len() == decoder.len()); + + let special_tokens_decoder: HashMap> = special_tokens_encoder + .iter() + .map(|(k, v)| (*v, k.as_bytes().to_vec())) + .collect(); + + // Clone because I don't know how to tell Rust I'm not going to change the map + let mut sorted_token_bytes: Vec> = encoder.keys().cloned().collect(); + sorted_token_bytes.sort(); + + Ok(CoreBPE { + encoder, + special_tokens_encoder, + decoder, + special_tokens_decoder, + regex_tls: (0..MAX_NUM_THREADS).map(|_| regex.clone()).collect(), + special_regex_tls: (0..MAX_NUM_THREADS) + .map(|_| special_regex.clone()) + .collect(), + sorted_token_bytes, + }) } - fn _encode_ordinary_native(&self, text: &str) -> Vec { + // ==================== + // Encoding + // ==================== + + pub fn encode_ordinary(&self, text: &str) -> Vec { // This is the core of the encoding logic; the other functions in here // just make things complicated :-) let regex = self._get_tl_regex(); @@ -220,7 +292,7 @@ impl CoreBPE { ret } - fn _encode_native(&self, text: &str, allowed_special: &HashSet<&str>) -> (Vec, usize) { + pub fn encode(&self, text: &str, allowed_special: HashSet<&str>) -> (Vec, usize) { let special_regex = self._get_tl_special_regex(); let regex = self._get_tl_regex(); let mut ret = vec![]; @@ -276,51 +348,37 @@ impl CoreBPE { (ret, last_piece_token_len) } - fn _increase_last_piece_token_len( - &self, - tokens: Vec, - mut last_piece_token_len: usize, - ) -> (Vec, usize) { - // Unfortunately, the locations where our regex splits can be unstable. - // For the purposes of determining unstable tokens, unstable regex splitting - // is only a problem if a split that was present disappears, since this can - // lead to merging of tokens otherwise thought to be stable. - // cl100k_base makes our life hard by including the \s*[\r\n]+ - // pattern. This can e.g. cause "\n" + " " to become "\n \n". 
- // Here is a quick and dirty fix: - { - let token_is_all_space = |token| { - self.decoder - .get(token) - .map(|token_bytes| { - token_bytes - .iter() - .rev() - .all(|&b| [b' ', b'\n', b'\t'].contains(&b)) - }) - .unwrap_or(false) - }; - if last_piece_token_len > 0 - && token_is_all_space(&tokens[tokens.len() - last_piece_token_len]) - { - while (last_piece_token_len < tokens.len()) - && token_is_all_space(&tokens[tokens.len() - last_piece_token_len - 1]) - { - last_piece_token_len += 1; + fn _encode_bytes(&self, bytes: &[u8]) -> Vec { + match std::str::from_utf8(bytes) { + Ok(text) => self.encode_ordinary(text), + Err(e) => { + let text = unsafe { std::str::from_utf8_unchecked(&bytes[..e.valid_up_to()]) }; + let (tokens, last_piece_token_len) = self.encode(text, HashSet::new()); + let (mut tokens, last_piece_token_len) = + self._increase_last_piece_token_len(tokens, last_piece_token_len); + if !tokens.is_empty() && last_piece_token_len > 0 { + // Lop off the tokens from the last piece and run BPE on the remaining bytes + // Somewhat niche, but this may not be correct if we'd have had a regex + // split between the valid UTF-8 and the invalid bytes, which is why this + // method is private + let mut unstable_bytes = + self.decode_bytes(&tokens[tokens.len() - last_piece_token_len..]); + unstable_bytes.extend_from_slice(&bytes[e.valid_up_to()..]); + + tokens.truncate(tokens.len() - last_piece_token_len); + tokens.extend(byte_pair_encode(&unstable_bytes, &self.encoder)); } + tokens } } - debug_assert!(last_piece_token_len <= tokens.len()); - - (tokens, last_piece_token_len) } - fn _encode_unstable_native( + pub fn encode_with_unstable( &self, text: &str, - allowed_special: &HashSet<&str>, + allowed_special: HashSet<&str>, ) -> (Vec, HashSet>) { - let (tokens, last_piece_token_len) = self._encode_native(text, allowed_special); + let (tokens, last_piece_token_len) = self.encode(text, allowed_special); if last_piece_token_len == 0 { // If last_piece_token_len is zero, the last token was a special token and we have // no unstable bytes @@ -329,7 +387,7 @@ impl CoreBPE { let (mut tokens, last_piece_token_len) = self._increase_last_piece_token_len(tokens, last_piece_token_len); - let unstable_bytes = self._decode_native(&tokens[tokens.len() - last_piece_token_len..]); + let unstable_bytes = self.decode_bytes(&tokens[tokens.len() - last_piece_token_len..]); tokens.truncate(tokens.len() - last_piece_token_len); // TODO: we should try harder to find additional stable tokens @@ -377,7 +435,7 @@ impl CoreBPE { // So convert to UTF-8 and do regex splitting. // E.g. with cl100k_base " !" gets split to " " + " !", // but byte_pair_encode(" !") != byte_pair_encode(" ") - Ok(s) => self._encode_ordinary_native(s), + Ok(s) => self.encode_ordinary(s), // Technically, whether or not this arm is correct depends on whether there // would be a regex split before the UTF-8 truncation point. @@ -430,108 +488,8 @@ impl CoreBPE { (tokens, completions) } -} -#[pymethods] -impl CoreBPE { - #[new] - fn new( - encoder: HashMap, usize>, - special_tokens_encoder: HashMap, - pattern: &str, - ) -> PyResult { - let regex = Regex::new(pattern) - .map_err(|e| PyErr::new::(e.to_string()))?; - - let special_regex = { - let _parts = special_tokens_encoder - .keys() - .map(|s| fancy_regex::escape(s)) - .collect::>(); - Regex::new(&_parts.join("|")) - .map_err(|e| PyErr::new::(e.to_string()))? 
- }; - - let decoder: HashMap> = - encoder.iter().map(|(k, v)| (*v, k.clone())).collect(); - - assert!(encoder.len() == decoder.len()); - - let special_tokens_decoder: HashMap> = special_tokens_encoder - .iter() - .map(|(k, v)| (*v, k.as_bytes().to_vec())) - .collect(); - - // Clone because I don't know how to tell Rust I'm not going to change the map - let mut sorted_token_bytes: Vec> = encoder.keys().cloned().collect(); - sorted_token_bytes.sort(); - - Ok(CoreBPE { - encoder, - special_tokens_encoder, - decoder, - special_tokens_decoder, - regex_tls: (0..MAX_NUM_THREADS).map(|_| regex.clone()).collect(), - special_regex_tls: (0..MAX_NUM_THREADS) - .map(|_| special_regex.clone()) - .collect(), - sorted_token_bytes, - }) - } - - // ==================== - // Encoding - // ==================== - - fn encode_ordinary(&self, py: Python, text: &str) -> Vec { - py.allow_threads(|| self._encode_ordinary_native(text)) - } - - fn encode(&self, py: Python, text: &str, allowed_special: HashSet<&str>) -> Vec { - py.allow_threads(|| self._encode_native(text, &allowed_special).0) - } - - fn _encode_bytes(&self, py: Python, bytes: &[u8]) -> Vec { - py.allow_threads(|| { - match std::str::from_utf8(bytes) { - Ok(text) => self._encode_ordinary_native(text), - Err(e) => { - let text = unsafe { std::str::from_utf8_unchecked(&bytes[..e.valid_up_to()]) }; - let (tokens, last_piece_token_len) = self._encode_native(text, &HashSet::new()); - let (mut tokens, last_piece_token_len) = - self._increase_last_piece_token_len(tokens, last_piece_token_len); - if !tokens.is_empty() && last_piece_token_len > 0 { - // Lop off the tokens from the last piece and run BPE on the remaining bytes - // Somewhat niche, but this may not be correct if we'd have had a regex - // split between the valid UTF-8 and the invalid bytes, which is why this - // method is private - let mut unstable_bytes = - self._decode_native(&tokens[tokens.len() - last_piece_token_len..]); - unstable_bytes.extend_from_slice(&bytes[e.valid_up_to()..]); - - tokens.truncate(tokens.len() - last_piece_token_len); - tokens.extend(byte_pair_encode(&unstable_bytes, &self.encoder)); - } - tokens - } - } - }) - } - - fn encode_with_unstable( - &self, - py: Python, - text: &str, - allowed_special: HashSet<&str>, - ) -> Py { - let (tokens, completions) = - py.allow_threads(|| self._encode_unstable_native(text, &allowed_special)); - let py_completions = - PyList::new(py, completions.iter().map(|seq| PyList::new(py, &seq[..]))); - (tokens, py_completions).into_py(py) - } - - fn encode_single_token(&self, piece: &[u8]) -> PyResult { + pub fn encode_single_token(&self, piece: &[u8]) -> anyhow::Result { if let Some(token) = self.encoder.get(piece).copied() { return Ok(token); } @@ -540,10 +498,10 @@ impl CoreBPE { return Ok(token); } } - Err(PyErr::new::(piece.to_owned())) + Err(anyhow::anyhow!("Piece {:?} not found", piece)) } - fn encode_single_piece(&self, piece: &[u8]) -> Vec { + pub fn encode_single_piece(&self, piece: &[u8]) -> Vec { if let Some(token) = self.encoder.get(piece) { return vec![*token]; } @@ -554,39 +512,37 @@ impl CoreBPE { // Decoding // ==================== - fn decode_bytes(&self, py: Python, tokens: Vec) -> Py { - let bytes = py.allow_threads(|| self._decode_native(&tokens)); - PyBytes::new(py, &bytes).into() + pub fn decode_bytes(&self, tokens: &[usize]) -> Vec { + let mut ret = Vec::with_capacity(tokens.len() * 2); + for token in tokens { + let token_bytes = self + .decoder + .get(token) + .unwrap_or_else(|| &self.special_tokens_decoder[token]); + 
ret.extend(token_bytes); + } + ret } - fn decode_single_token_bytes(&self, py: Python, token: usize) -> PyResult> { + pub fn decode_single_token_bytes(&self, token: usize) -> anyhow::Result<&Vec> { if let Some(bytes) = self.decoder.get(&token) { - return Ok(PyBytes::new(py, bytes).into()); + return Ok(bytes); } if let Some(bytes) = self.special_tokens_decoder.get(&token) { - return Ok(PyBytes::new(py, bytes).into()); + return Ok(bytes); } - Err(PyErr::new::(token.to_string())) + Err(anyhow::anyhow!("Token {} not found", token)) } // ==================== // Miscellaneous // ==================== - fn token_byte_values(&self, py: Python) -> Vec> { - self.sorted_token_bytes - .iter() - .map(|x| PyBytes::new(py, x).into()) - .collect() + pub fn token_byte_values(&self) -> &Vec> { + &self.sorted_token_bytes } } -#[pymodule] -fn _tiktoken(_py: Python, m: &PyModule) -> PyResult<()> { - m.add_class::()?; - Ok(()) -} - #[cfg(test)] mod tests { use rustc_hash::FxHashMap as HashMap; diff --git a/tests/test_simple_public.py b/tests/test_simple_public.py deleted file mode 100644 index 8458c12..0000000 --- a/tests/test_simple_public.py +++ /dev/null @@ -1,42 +0,0 @@ -import subprocess -import sys - -import tiktoken - - -def test_simple(): - # Note that there are more actual tests, they're just not currently public :-) - enc = tiktoken.get_encoding("gpt2") - assert enc.encode("hello world") == [31373, 995] - assert enc.decode([31373, 995]) == "hello world" - assert enc.encode("hello <|endoftext|>", allowed_special="all") == [31373, 220, 50256] - - enc = tiktoken.get_encoding("cl100k_base") - assert enc.encode("hello world") == [15339, 1917] - assert enc.decode([15339, 1917]) == "hello world" - assert enc.encode("hello <|endoftext|>", allowed_special="all") == [15339, 220, 100257] - - for enc_name in tiktoken.list_encoding_names(): - enc = tiktoken.get_encoding(enc_name) - for token in range(10_000): - assert enc.encode_single_token(enc.decode_single_token_bytes(token)) == token - - -def test_encoding_for_model(): - enc = tiktoken.encoding_for_model("gpt2") - assert enc.name == "gpt2" - enc = tiktoken.encoding_for_model("text-davinci-003") - assert enc.name == "p50k_base" - enc = tiktoken.encoding_for_model("text-davinci-edit-001") - assert enc.name == "p50k_edit" - enc = tiktoken.encoding_for_model("gpt-3.5-turbo-0301") - assert enc.name == "cl100k_base" - - -def test_optional_blobfile_dependency(): - prog = """ -import tiktoken -import sys -assert "blobfile" not in sys.modules -""" - subprocess.check_call([sys.executable, "-c", prog]) diff --git a/tiktoken/__init__.py b/tiktoken/__init__.py deleted file mode 100644 index 9ad09a3..0000000 --- a/tiktoken/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from .core import Encoding as Encoding -from .model import encoding_for_model as encoding_for_model -from .registry import get_encoding as get_encoding -from .registry import list_encoding_names as list_encoding_names diff --git a/tiktoken/core.py b/tiktoken/core.py deleted file mode 100644 index 05613aa..0000000 --- a/tiktoken/core.py +++ /dev/null @@ -1,329 +0,0 @@ -from __future__ import annotations - -import functools -from concurrent.futures import ThreadPoolExecutor -from typing import AbstractSet, Collection, Literal, NoReturn, Optional, Union - -import regex - -from tiktoken import _tiktoken - - -class Encoding: - def __init__( - self, - name: str, - *, - pat_str: str, - mergeable_ranks: dict[bytes, int], - special_tokens: dict[str, int], - explicit_n_vocab: Optional[int] = None, - ): - """Creates an 
Encoding object. - - See openai_public.py for examples of how to construct an Encoding object. - - Args: - name: The name of the encoding. It should be clear from the name of the encoding - what behaviour to expect, in particular, encodings with different special tokens - should have different names. - pat_str: A regex pattern string that is used to split the input text. - mergeable_ranks: A dictionary mapping mergeable token bytes to their ranks. The ranks - must correspond to merge priority. - special_tokens: A dictionary mapping special token strings to their token values. - explicit_n_vocab: The number of tokens in the vocabulary. If provided, it is checked - that the number of mergeable tokens and special tokens is equal to this number. - """ - self.name = name - - self._pat_str = pat_str - self._mergeable_ranks = mergeable_ranks - self._special_tokens = special_tokens - - self.max_token_value = max( - max(mergeable_ranks.values()), max(special_tokens.values(), default=0) - ) - if explicit_n_vocab: - assert len(mergeable_ranks) + len(special_tokens) == explicit_n_vocab - assert self.max_token_value == explicit_n_vocab - 1 - - self._core_bpe = _tiktoken.CoreBPE(mergeable_ranks, special_tokens, pat_str) - - def __repr__(self) -> str: - return f"" - - # ==================== - # Encoding - # ==================== - - def encode_ordinary(self, text: str) -> list[int]: - """Encodes a string into tokens, ignoring special tokens. - - This is equivalent to `encode(text, disallowed_special=())` (but slightly faster). - - ``` - >>> enc.encode_ordinary("hello world") - [31373, 995] - """ - return self._core_bpe.encode_ordinary(text) - - def encode( - self, - text: str, - *, - allowed_special: Union[Literal["all"], AbstractSet[str]] = set(), # noqa: B006 - disallowed_special: Union[Literal["all"], Collection[str]] = "all", - ) -> list[int]: - """Encodes a string into tokens. - - Special tokens are artificial tokens used to unlock capabilities from a model, - such as fill-in-the-middle. So we want to be careful about accidentally encoding special - tokens, since they can be used to trick a model into doing something we don't want it to do. - - Hence, by default, encode will raise an error if it encounters text that corresponds - to a special token. This can be controlled on a per-token level using the `allowed_special` - and `disallowed_special` parameters. In particular: - - Setting `disallowed_special` to () will prevent this function from raising errors and - cause all text corresponding to special tokens to be encoded as natural text. - - Setting `allowed_special` to "all" will cause this function to treat all text - corresponding to special tokens to be encoded as special tokens. 
- - ``` - >>> enc.encode("hello world") - [31373, 995] - >>> enc.encode("<|endoftext|>", allowed_special={"<|endoftext|>"}) - [50256] - >>> enc.encode("<|endoftext|>", allowed_special="all") - [50256] - >>> enc.encode("<|endoftext|>") - # Raises ValueError - >>> enc.encode("<|endoftext|>", disallowed_special=()) - [27, 91, 437, 1659, 5239, 91, 29] - ``` - """ - if allowed_special == "all": - allowed_special = self.special_tokens_set - if disallowed_special == "all": - disallowed_special = self.special_tokens_set - allowed_special - if disallowed_special: - if not isinstance(disallowed_special, frozenset): - disallowed_special = frozenset(disallowed_special) - if match := _special_token_regex(disallowed_special).search(text): - raise_disallowed_special_token(match.group()) - - return self._core_bpe.encode(text, allowed_special) - - def encode_ordinary_batch(self, text: list[str], *, num_threads: int = 8) -> list[list[int]]: - """Encodes a list of strings into tokens, in parallel, ignoring special tokens. - - This is equivalent to `encode_batch(text, disallowed_special=())` (but slightly faster). - - ``` - >>> enc.encode_ordinary_batch(["hello world", "goodbye world"]) - [[31373, 995], [11274, 16390, 995]] - ``` - """ - encoder = functools.partial(self.encode_ordinary) - with ThreadPoolExecutor(num_threads) as e: - return list(e.map(encoder, text)) - - def encode_batch( - self, - text: list[str], - *, - num_threads: int = 8, - allowed_special: Union[Literal["all"], AbstractSet[str]] = set(), # noqa: B006 - disallowed_special: Union[Literal["all"], Collection[str]] = "all", - ) -> list[list[int]]: - """Encodes a list of strings into tokens, in parallel. - - See `encode` for more details on `allowed_special` and `disallowed_special`. - - ``` - >>> enc.encode_batch(["hello world", "goodbye world"]) - [[31373, 995], [11274, 16390, 995]] - ``` - """ - if allowed_special == "all": - allowed_special = self.special_tokens_set - if disallowed_special == "all": - disallowed_special = self.special_tokens_set - allowed_special - if not isinstance(disallowed_special, frozenset): - disallowed_special = frozenset(disallowed_special) - - encoder = functools.partial( - self.encode, allowed_special=allowed_special, disallowed_special=disallowed_special - ) - with ThreadPoolExecutor(num_threads) as e: - return list(e.map(encoder, text)) - - def encode_with_unstable( - self, - text: str, - *, - allowed_special: Union[Literal["all"], AbstractSet[str]] = set(), # noqa: B006 - disallowed_special: Union[Literal["all"], Collection[str]] = "all", - ) -> tuple[list[int], list[list[int]]]: - """Encodes a string into stable tokens and possible completion sequences. - - Note that the stable tokens will only represent a substring of `text`. - - See `encode` for more details on `allowed_special` and `disallowed_special`. - - This API should itself be considered unstable. - - ``` - >>> enc.encode_with_unstable("hello fanta") - ([31373], [(277, 4910), (5113, 265), ..., (8842,)]) - - >>> text = "..." 
- >>> stable_tokens, completions = enc.encode_with_unstable(text) - >>> assert text.encode().startswith(enc.decode_bytes(stable_tokens)) - >>> assert all(enc.decode_bytes(stable_tokens + seq).startswith(text.encode()) for seq in completions) - ``` - """ - if allowed_special == "all": - allowed_special = self.special_tokens_set - if disallowed_special == "all": - disallowed_special = self.special_tokens_set - allowed_special - if disallowed_special: - if not isinstance(disallowed_special, frozenset): - disallowed_special = frozenset(disallowed_special) - if match := _special_token_regex(disallowed_special).search(text): - raise_disallowed_special_token(match.group()) - - return self._core_bpe.encode_with_unstable(text, allowed_special) - - def encode_single_token(self, text_or_bytes: Union[str, bytes]) -> int: - """Encodes text corresponding to a single token to its token value. - - NOTE: this will encode all special tokens. - - Raises `KeyError` if the token is not in the vocabulary. - - ``` - >>> enc.encode_single_token("hello") - 31373 - ``` - """ - if isinstance(text_or_bytes, str): - text_or_bytes = text_or_bytes.encode("utf-8") - return self._core_bpe.encode_single_token(text_or_bytes) - - # ==================== - # Decoding - # ==================== - - def decode_bytes(self, tokens: list[int]) -> bytes: - """Decodes a list of tokens into bytes. - - ``` - >>> enc.decode_bytes([31373, 995]) - b'hello world' - ``` - """ - return self._core_bpe.decode_bytes(tokens) - - def decode(self, tokens: list[int], errors: str = "replace") -> str: - """Decodes a list of tokens into a string. - - WARNING: the default behaviour of this function is lossy, since decoded bytes are not - guaranteed to be valid UTF-8. You can control this behaviour using the `errors` parameter, - for instance, setting `errors=strict`. - - ``` - >>> enc.decode([31373, 995]) - 'hello world' - ``` - """ - return self._core_bpe.decode_bytes(tokens).decode("utf-8", errors=errors) - - def decode_single_token_bytes(self, token: int) -> bytes: - """Decodes a token into bytes. - - NOTE: this will decode all special tokens. - - Raises `KeyError` if the token is not in the vocabulary. - - ``` - >>> enc.decode_single_token_bytes(31373) - b'hello' - ``` - """ - return self._core_bpe.decode_single_token_bytes(token) - - def decode_tokens_bytes(self, tokens: list[int]) -> list[bytes]: - """Decodes a list of tokens into a list of bytes. - - Useful for visualising tokenisation. - >>> enc.decode_tokens_bytes([31373, 995]) - [b'hello', b' world'] - """ - return [self.decode_single_token_bytes(token) for token in tokens] - - # ==================== - # Miscellaneous - # ==================== - - def token_byte_values(self) -> list[bytes]: - """Returns the list of all token byte values.""" - return self._core_bpe.token_byte_values() - - @property - def eot_token(self) -> int: - return self._special_tokens["<|endoftext|>"] - - @functools.cached_property - def special_tokens_set(self) -> set[str]: - return set(self._special_tokens.keys()) - - @property - def n_vocab(self) -> int: - """For backwards compatibility. Prefer to use `enc.max_token_value + 1`.""" - return self.max_token_value + 1 - - # ==================== - # Private - # ==================== - - def _encode_single_piece(self, text_or_bytes: Union[str, bytes]) -> list[int]: - """Encodes text corresponding to bytes without a regex split. - - NOTE: this will not encode any special tokens. 
- - ``` - >>> enc.encode_single_piece("helloqqqq") - [31373, 38227, 38227] - ``` - """ - if isinstance(text_or_bytes, str): - text_or_bytes = text_or_bytes.encode("utf-8") - return self._core_bpe.encode_single_piece(text_or_bytes) - - def _encode_only_native_bpe(self, text: str) -> list[int]: - """Encodes a string into tokens, but do regex splitting in Python.""" - _unused_pat = regex.compile(self._pat_str) - ret = [] - for piece in regex.findall(_unused_pat, text): - ret.extend(self._core_bpe.encode_single_piece(piece)) - return ret - - def _encode_bytes(self, text: bytes) -> list[int]: - return self._core_bpe._encode_bytes(text) - - -@functools.lru_cache(maxsize=128) -def _special_token_regex(tokens: frozenset[str]) -> "regex.Pattern[str]": - inner = "|".join(regex.escape(token) for token in tokens) - return regex.compile(f"({inner})") - - -def raise_disallowed_special_token(token: str) -> NoReturn: - raise ValueError( - f"Encountered text corresponding to disallowed special token {token!r}.\n" - "If you want this text to be encoded as a special token, " - f"pass it to `allowed_special`, e.g. `allowed_special={{{token!r}, ...}}`.\n" - f"If you want this text to be encoded as normal text, disable the check for this token " - f"by passing `disallowed_special=(enc.special_tokens_set - {{{token!r}}})`.\n" - "To disable this check for all special tokens, pass `disallowed_special=()`.\n" - ) diff --git a/tiktoken/load.py b/tiktoken/load.py deleted file mode 100644 index 4a49ae4..0000000 --- a/tiktoken/load.py +++ /dev/null @@ -1,118 +0,0 @@ -from __future__ import annotations - -import base64 -import hashlib -import json -import os -import tempfile -import uuid - -import requests - - -def read_file(blobpath: str) -> bytes: - if not blobpath.startswith("http://") and not blobpath.startswith("https://"): - try: - import blobfile - except ImportError: - raise ImportError( - "blobfile is not installed. Please install it by running `pip install blobfile`." - ) - with blobfile.BlobFile(blobpath, "rb") as f: - return f.read() - # avoiding blobfile for public files helps avoid auth issues, like MFA prompts - return requests.get(blobpath).content - - -def read_file_cached(blobpath: str) -> bytes: - if "TIKTOKEN_CACHE_DIR" in os.environ: - cache_dir = os.environ["TIKTOKEN_CACHE_DIR"] - elif "DATA_GYM_CACHE_DIR" in os.environ: - cache_dir = os.environ["DATA_GYM_CACHE_DIR"] - else: - cache_dir = os.path.join(tempfile.gettempdir(), "data-gym-cache") - - if cache_dir == "": - # disable caching - return read_file(blobpath) - - cache_key = hashlib.sha1(blobpath.encode()).hexdigest() - - cache_path = os.path.join(cache_dir, cache_key) - if os.path.exists(cache_path): - with open(cache_path, "rb") as f: - return f.read() - - contents = read_file(blobpath) - - os.makedirs(cache_dir, exist_ok=True) - tmp_filename = cache_path + "." 
+ str(uuid.uuid4()) + ".tmp" - with open(tmp_filename, "wb") as f: - f.write(contents) - os.rename(tmp_filename, cache_path) - - return contents - - -def data_gym_to_mergeable_bpe_ranks( - vocab_bpe_file: str, encoder_json_file: str -) -> dict[bytes, int]: - # NB: do not add caching to this function - rank_to_intbyte = [b for b in range(2**8) if chr(b).isprintable() and chr(b) != " "] - - data_gym_byte_to_byte = {chr(b): b for b in rank_to_intbyte} - n = 0 - for b in range(2**8): - if b not in rank_to_intbyte: - rank_to_intbyte.append(b) - data_gym_byte_to_byte[chr(2**8 + n)] = b - n += 1 - assert len(rank_to_intbyte) == 2**8 - - # vocab_bpe contains the merges along with associated ranks - vocab_bpe_contents = read_file_cached(vocab_bpe_file).decode() - bpe_merges = [tuple(merge_str.split()) for merge_str in vocab_bpe_contents.split("\n")[1:-1]] - - def decode_data_gym(value: str) -> bytes: - return bytes(data_gym_byte_to_byte[b] for b in value) - - # add the single byte tokens - bpe_ranks = {bytes([b]): i for i, b in enumerate(rank_to_intbyte)} - # add the merged tokens - n = len(bpe_ranks) - for first, second in bpe_merges: - bpe_ranks[decode_data_gym(first) + decode_data_gym(second)] = n - n += 1 - - # check that the encoder file matches the merges file - # this sanity check is important since tiktoken assumes that ranks are ordered the same - # as merge priority - encoder_json = json.loads(read_file_cached(encoder_json_file)) - encoder_json_loaded = {decode_data_gym(k): v for k, v in encoder_json.items()} - # drop these two special tokens if present, since they're not mergeable bpe tokens - encoder_json_loaded.pop(b"<|endoftext|>", None) - encoder_json_loaded.pop(b"<|startoftext|>", None) - assert bpe_ranks == encoder_json_loaded - - return bpe_ranks - - -def dump_tiktoken_bpe(bpe_ranks: dict[bytes, int], tiktoken_bpe_file: str) -> None: - try: - import blobfile - except ImportError: - raise ImportError( - "blobfile is not installed. Please install it by running `pip install blobfile`." - ) - with blobfile.BlobFile(tiktoken_bpe_file, "wb") as f: - for token, rank in sorted(bpe_ranks.items(), key=lambda x: x[1]): - f.write(base64.b64encode(token) + b" " + str(rank).encode() + b"\n") - - -def load_tiktoken_bpe(tiktoken_bpe_file: str) -> dict[bytes, int]: - # NB: do not add caching to this function - contents = read_file_cached(tiktoken_bpe_file) - return { - base64.b64decode(token): int(rank) - for token, rank in (line.split() for line in contents.splitlines() if line) - } diff --git a/tiktoken/model.py b/tiktoken/model.py deleted file mode 100644 index b8af787..0000000 --- a/tiktoken/model.py +++ /dev/null @@ -1,75 +0,0 @@ -from __future__ import annotations - -from .core import Encoding -from .registry import get_encoding - -# TODO: these will likely be replaced by an API endpoint -MODEL_PREFIX_TO_ENCODING: dict[str, str] = { - # chat - "gpt-4-": "cl100k_base", # e.g., gpt-4-0314, etc., plus gpt-4-32k - "gpt-3.5-turbo-": "cl100k_base", # e.g, gpt-3.5-turbo-0301, -0401, etc. 
-} - -MODEL_TO_ENCODING: dict[str, str] = { - # chat - "gpt-4": "cl100k_base", - "gpt-3.5-turbo": "cl100k_base", - # text - "text-davinci-003": "p50k_base", - "text-davinci-002": "p50k_base", - "text-davinci-001": "r50k_base", - "text-curie-001": "r50k_base", - "text-babbage-001": "r50k_base", - "text-ada-001": "r50k_base", - "davinci": "r50k_base", - "curie": "r50k_base", - "babbage": "r50k_base", - "ada": "r50k_base", - # code - "code-davinci-002": "p50k_base", - "code-davinci-001": "p50k_base", - "code-cushman-002": "p50k_base", - "code-cushman-001": "p50k_base", - "davinci-codex": "p50k_base", - "cushman-codex": "p50k_base", - # edit - "text-davinci-edit-001": "p50k_edit", - "code-davinci-edit-001": "p50k_edit", - # embeddings - "text-embedding-ada-002": "cl100k_base", - # old embeddings - "text-similarity-davinci-001": "r50k_base", - "text-similarity-curie-001": "r50k_base", - "text-similarity-babbage-001": "r50k_base", - "text-similarity-ada-001": "r50k_base", - "text-search-davinci-doc-001": "r50k_base", - "text-search-curie-doc-001": "r50k_base", - "text-search-babbage-doc-001": "r50k_base", - "text-search-ada-doc-001": "r50k_base", - "code-search-babbage-code-001": "r50k_base", - "code-search-ada-code-001": "r50k_base", - # open source - "gpt2": "gpt2", -} - - -def encoding_for_model(model_name: str) -> Encoding: - """Returns the encoding used by a model.""" - encoding_name = None - if model_name in MODEL_TO_ENCODING: - encoding_name = MODEL_TO_ENCODING[model_name] - else: - # Check if the model matches a known prefix - # Prefix matching avoids needing library updates for every model version release - # Note that this can match on non-existent models (e.g., gpt-3.5-turbo-FAKE) - for model_prefix, model_encoding_name in MODEL_PREFIX_TO_ENCODING.items(): - if model_name.startswith(model_prefix): - return get_encoding(model_encoding_name) - - if encoding_name is None: - raise KeyError( - f"Could not automatically map {model_name} to a tokeniser. " - "Please use `tiktok.get_encoding` to explicitly get the tokeniser you expect." 
- ) from None - - return get_encoding(encoding_name) diff --git a/tiktoken/py.typed b/tiktoken/py.typed deleted file mode 100644 index e69de29..0000000 diff --git a/tiktoken/registry.py b/tiktoken/registry.py deleted file mode 100644 index 52d8ec2..0000000 --- a/tiktoken/registry.py +++ /dev/null @@ -1,73 +0,0 @@ -from __future__ import annotations - -import importlib -import pkgutil -import threading -from typing import Any, Callable, Optional - -import tiktoken_ext - -from tiktoken.core import Encoding - -_lock = threading.RLock() -ENCODINGS: dict[str, Encoding] = {} -ENCODING_CONSTRUCTORS: Optional[dict[str, Callable[[], dict[str, Any]]]] = None - - -def _find_constructors() -> None: - global ENCODING_CONSTRUCTORS - with _lock: - if ENCODING_CONSTRUCTORS is not None: - return - ENCODING_CONSTRUCTORS = {} - - # tiktoken_ext is a namespace package - # submodules inside tiktoken_ext will be inspected for ENCODING_CONSTRUCTORS attributes - # - we use namespace package pattern so `pkgutil.iter_modules` is fast - # - it's a separate top-level package because namespace subpackages of non-namespace - # packages don't quite do what you want with editable installs - plugin_mods = pkgutil.iter_modules(tiktoken_ext.__path__, tiktoken_ext.__name__ + ".") - - for _, mod_name, _ in plugin_mods: - mod = importlib.import_module(mod_name) - try: - constructors = mod.ENCODING_CONSTRUCTORS - except AttributeError as e: - raise ValueError( - f"tiktoken plugin {mod_name} does not define ENCODING_CONSTRUCTORS" - ) from e - for enc_name, constructor in constructors.items(): - if enc_name in ENCODING_CONSTRUCTORS: - raise ValueError( - f"Duplicate encoding name {enc_name} in tiktoken plugin {mod_name}" - ) - ENCODING_CONSTRUCTORS[enc_name] = constructor - - -def get_encoding(encoding_name: str) -> Encoding: - if encoding_name in ENCODINGS: - return ENCODINGS[encoding_name] - - with _lock: - if encoding_name in ENCODINGS: - return ENCODINGS[encoding_name] - - if ENCODING_CONSTRUCTORS is None: - _find_constructors() - assert ENCODING_CONSTRUCTORS is not None - - if encoding_name not in ENCODING_CONSTRUCTORS: - raise ValueError(f"Unknown encoding {encoding_name}") - - constructor = ENCODING_CONSTRUCTORS[encoding_name] - enc = Encoding(**constructor()) - ENCODINGS[encoding_name] = enc - return enc - - -def list_encoding_names() -> list[str]: - with _lock: - if ENCODING_CONSTRUCTORS is None: - _find_constructors() - assert ENCODING_CONSTRUCTORS is not None - return list(ENCODING_CONSTRUCTORS) diff --git a/tiktoken_ext/openai_public.py b/tiktoken_ext/openai_public.py deleted file mode 100644 index 16a6ec5..0000000 --- a/tiktoken_ext/openai_public.py +++ /dev/null @@ -1,88 +0,0 @@ -from tiktoken.load import data_gym_to_mergeable_bpe_ranks, load_tiktoken_bpe - -ENDOFTEXT = "<|endoftext|>" -FIM_PREFIX = "<|fim_prefix|>" -FIM_MIDDLE = "<|fim_middle|>" -FIM_SUFFIX = "<|fim_suffix|>" -ENDOFPROMPT = "<|endofprompt|>" - - -def gpt2(): - mergeable_ranks = data_gym_to_mergeable_bpe_ranks( - vocab_bpe_file="https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/vocab.bpe", - encoder_json_file="https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/encoder.json", - ) - return { - "name": "gpt2", - "explicit_n_vocab": 50257, - "pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""", - "mergeable_ranks": mergeable_ranks, - "special_tokens": {"<|endoftext|>": 50256}, - } - - -def r50k_base(): - mergeable_ranks = load_tiktoken_bpe( - 
"https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken" - ) - return { - "name": "r50k_base", - "explicit_n_vocab": 50257, - "pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""", - "mergeable_ranks": mergeable_ranks, - "special_tokens": {ENDOFTEXT: 50256}, - } - - -def p50k_base(): - mergeable_ranks = load_tiktoken_bpe( - "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken" - ) - return { - "name": "p50k_base", - "explicit_n_vocab": 50281, - "pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""", - "mergeable_ranks": mergeable_ranks, - "special_tokens": {ENDOFTEXT: 50256}, - } - - -def p50k_edit(): - mergeable_ranks = load_tiktoken_bpe( - "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken" - ) - special_tokens = {ENDOFTEXT: 50256, FIM_PREFIX: 50281, FIM_MIDDLE: 50282, FIM_SUFFIX: 50283} - return { - "name": "p50k_edit", - "pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""", - "mergeable_ranks": mergeable_ranks, - "special_tokens": special_tokens, - } - - -def cl100k_base(): - mergeable_ranks = load_tiktoken_bpe( - "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken" - ) - special_tokens = { - ENDOFTEXT: 100257, - FIM_PREFIX: 100258, - FIM_MIDDLE: 100259, - FIM_SUFFIX: 100260, - ENDOFPROMPT: 100276, - } - return { - "name": "cl100k_base", - "pat_str": r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""", - "mergeable_ranks": mergeable_ranks, - "special_tokens": special_tokens, - } - - -ENCODING_CONSTRUCTORS = { - "gpt2": gpt2, - "r50k_base": r50k_base, - "p50k_base": p50k_base, - "p50k_edit": p50k_edit, - "cl100k_base": cl100k_base, -}