diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml
deleted file mode 100644
index cd0cddf..0000000
--- a/.github/workflows/build_wheels.yml
+++ /dev/null
@@ -1,83 +0,0 @@
-name: Build wheels
-
-on: [push, pull_request, workflow_dispatch]
-
-concurrency:
- group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
- cancel-in-progress: true
-
-jobs:
- build_wheels:
- name: py${{ matrix.python-version }} on ${{ matrix.os }}
- runs-on: ${{ matrix.os }}
- strategy:
- fail-fast: false
- matrix:
- # cibuildwheel builds linux wheels inside a manylinux container
- # it also takes care of procuring the correct python version for us
- os: [ubuntu-latest, windows-latest, macos-latest]
- python-version: [38, 39, 310, 311]
-
- steps:
- - uses: actions/checkout@v3
-
- - uses: pypa/cibuildwheel@v2.11.3
- env:
- CIBW_BUILD: "cp${{ matrix.python-version}}-*"
-
- - uses: actions/upload-artifact@v3
- with:
- name: dist
- path: ./wheelhouse/*.whl
-
- build_wheels_aarch64:
- name: py${{ matrix.python-version }} on ${{ matrix.os }} (aarch64)
- runs-on: ${{ matrix.os }}
- strategy:
- fail-fast: false
- matrix:
- os: [ubuntu-latest]
- python-version: [38, 39, 310, 311]
-
- steps:
- - uses: actions/checkout@v3
-
- name: Set up QEMU
- uses: docker/setup-qemu-action@v2
- with:
- platforms: arm64
-
- - name: Build wheels
- uses: pypa/cibuildwheel@v2.11.3
- env:
- CIBW_BUILD: "cp${{ matrix.python-version}}-*"
- CIBW_ARCHS: aarch64
- CIBW_BUILD_VERBOSITY: 3
- # https://github.com/rust-lang/cargo/issues/10583
- CIBW_ENVIRONMENT_LINUX: PATH="$PATH:$HOME/.cargo/bin" CARGO_NET_GIT_FETCH_WITH_CLI=true
- - uses: actions/upload-artifact@v3
- with:
- name: dist
- path: ./wheelhouse/*.whl
-
- build_sdist:
- name: sdist
- runs-on: ubuntu-latest
- steps:
- - uses: actions/checkout@v3
- - uses: actions/setup-python@v4
- name: Install Python
- with:
- python-version: "3.9"
- - name: Run check-manifest
- run: |
- pip install check-manifest
- check-manifest -v
- - name: Build sdist
- run: |
- pip install --upgrade build
- python -m build --sdist
- - uses: actions/upload-artifact@v3
- with:
- name: dist
- path: ./dist/*.tar.gz
diff --git a/CHANGELOG.md b/CHANGELOG.md
deleted file mode 100644
index d0365b8..0000000
--- a/CHANGELOG.md
+++ /dev/null
@@ -1,35 +0,0 @@
-# Changelog
-
-This is the changelog for the open source version of tiktoken.
-
-## [v0.3.2]
-- Add encoding for GPT-4
-
-## [v0.3.1]
-- Build aarch64 wheels
-- Make `blobfile` an optional dependency
-
-Thank you to @messense for the environment variable that makes cargo not OOM under emulation!
-
-## [v0.3.0]
-- Improve performance by 5-20%; thank you to @nistath!
-- Add `gpt-3.5-turbo` models to `encoding_for_model`
-- Add prefix matching to `encoding_for_model` to better support future model versions
-- Fix a bug in the README instructions on extending tiktoken
-- Update the set of available encodings
-- Add packaging metadata
-
-## [v0.2.0]
-- Add ``tiktoken.encoding_for_model`` to get the encoding for a specific model
-- Improve portability of caching logic
-
-Thank you to @fritzo, @arvid220u, @khanhvu207, @henriktorget for various small corrections
-
-## [v0.1.2]
-- Avoid use of `blobfile` for public files
-- Add support for Python 3.8
-- Add py.typed
-- Improve the public tests
-
-## [v0.1.1]
-- Initial release
diff --git a/Cargo.toml b/Cargo.toml
index 07182cd..fc3cddb 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -5,17 +5,14 @@ edition = "2021"
rust-version = "1.57.0"
[lib]
-name = "_tiktoken"
-crate-type = ["cdylib"]
+name = "tiktoken"
[dependencies]
-pyo3 = { version = "0.17.3", features = ["extension-module"] }
-
-# tiktoken dependencies
-fancy-regex = "0.10.0"
+fancy-regex = "0.11.0"
regex = "1.7.0"
rustc-hash = "1.1.0"
bstr = "1.0.1"
+anyhow = "1.0.70"
[profile.release]
incremental = true
diff --git a/MANIFEST.in b/MANIFEST.in
deleted file mode 100644
index 7f25b27..0000000
--- a/MANIFEST.in
+++ /dev/null
@@ -1,8 +0,0 @@
-include *.svg
-include *.toml
-include *.md
-include Makefile
-global-include py.typed
-recursive-include scripts *.py
-recursive-include tests *.py
-recursive-include src *.rs
diff --git a/perf.svg b/perf.svg
deleted file mode 100644
index 723036c..0000000
--- a/perf.svg
+++ /dev/null
@@ -1,374 +0,0 @@
diff --git a/pyproject.toml b/pyproject.toml
deleted file mode 100644
index 739d295..0000000
--- a/pyproject.toml
+++ /dev/null
@@ -1,41 +0,0 @@
-[project]
-name = "tiktoken"
-version = "0.3.2"
-description = "tiktoken is a fast BPE tokeniser for use with OpenAI's models"
-readme = "README.md"
-license = {file = "LICENSE"}
-authors = [{name = "Shantanu Jain"}, {email = "shantanu@openai.com"}]
-dependencies = ["regex>=2022.1.18", "requests>=2.26.0"]
-optional-dependencies = {blobfile = ["blobfile>=2"]}
-requires-python = ">=3.8"
-
-[project.urls]
-homepage = "https://github.com/openai/tiktoken"
-repository = "https://github.com/openai/tiktoken"
-changelog = "https://github.com/openai/tiktoken/blob/main/CHANGELOG.md"
-
-[build-system]
-build-backend = "setuptools.build_meta"
-requires = ["setuptools>=62.4", "wheel", "setuptools-rust>=1.5.2"]
-
-[tool.cibuildwheel]
-build-frontend = "build"
-build-verbosity = 1
-
-linux.before-all = "curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y"
-linux.environment = { PATH = "$PATH:$HOME/.cargo/bin" }
-macos.before-all = "rustup target add aarch64-apple-darwin"
-
-skip = [
- "*-manylinux_i686",
- "*-musllinux_i686",
- "*-win32",
-]
-macos.archs = ["x86_64", "arm64"]
-# When cross-compiling on Intel, it is not possible to test arm64 wheels.
-# Warnings will be silenced with the following CIBW_TEST_SKIP
-test-skip = "*-macosx_arm64"
-
-before-test = "pip install pytest"
-test-command = "pytest {project}/tests"
-
diff --git a/scripts/benchmark.py b/scripts/benchmark.py
deleted file mode 100644
index 4d679fa..0000000
--- a/scripts/benchmark.py
+++ /dev/null
@@ -1,39 +0,0 @@
-import base64
-import functools
-import gzip
-import json
-import os
-import random
-import time
-from typing import Any, cast
-
-import blobfile
-
-import tiktoken
-
-
-def benchmark_batch(documents: list[str]) -> None:
- num_threads = int(os.environ["RAYON_NUM_THREADS"])
- num_bytes = sum(map(len, map(str.encode, documents)))
- print(f"num_threads: {num_threads}, num_bytes: {num_bytes}")
-
- enc = tiktoken.get_encoding("gpt2")
- enc.encode("warmup")
-
- start = time.perf_counter_ns()
- enc.encode_ordinary_batch(documents, num_threads=num_threads)
- end = time.perf_counter_ns()
- print(f"tiktoken \t{num_bytes / (end - start) * 1e9} bytes / s")
-
- import transformers
-
- hf_enc = cast(Any, transformers).GPT2TokenizerFast.from_pretrained("gpt2")
- hf_enc.model_max_length = 1e30 # silence!
- hf_enc.encode("warmup")
-
- start = time.perf_counter_ns()
- hf_enc(documents)
- end = time.perf_counter_ns()
- print(f"huggingface \t{num_bytes / (end - start) * 1e9} bytes / s")
-
-
diff --git a/scripts/redact.py b/scripts/redact.py
deleted file mode 100644
index d82db32..0000000
--- a/scripts/redact.py
+++ /dev/null
@@ -1,67 +0,0 @@
-import argparse
-import re
-import subprocess
-from pathlib import Path
-
-
-def redact_file(path: Path, dry_run: bool) -> None:
- if not path.exists() or path.is_dir():
- return
-
- text = path.read_text()
- if not text:
- return
-
- first_line = text.splitlines()[0]
- if "redact" in first_line:
- if not dry_run:
- path.unlink()
- print(f"Deleted {path}")
- return
-
- pattern = "|".join(
- re.escape(x)
- for x in [
- "# ===== redact-beg =====\n",
- "# ===== redact-end =====\n",
- "\n",
- "\n",
- ]
- )
-
- if re.search(pattern, text):
- redacted_text = "".join(re.split(pattern, text)[::2])
- if not dry_run:
- path.write_text(redacted_text)
- print(f"Redacted {path}")
- return
-
- print(f"Skipped {path}")
-
-
-def redact(dry_run: bool) -> None:
- tiktoken_root = Path(__file__).parent.parent
- assert tiktoken_root.name == "tiktoken"
- assert (tiktoken_root / "pyproject.toml").exists()
-
- try:
- output = subprocess.check_output(["git", "ls-files"], cwd=tiktoken_root, text=True)
- paths = [Path(p) for p in output.splitlines()]
- except subprocess.CalledProcessError:
- paths = list(tiktoken_root.glob("**/*"))
-
- for path in paths:
- redact_file(path, dry_run=dry_run)
-
-
-def main() -> None:
- parser = argparse.ArgumentParser()
- parser.add_argument("--dry-run", type=lambda x: not x or x[0].lower() != "f", default=True)
- args = parser.parse_args()
- redact(args.dry_run)
- if args.dry_run:
- print("Dry run, use --dry-run=false to actually redact files")
-
-
-if __name__ == "__main__":
- main()
diff --git a/setup.py b/setup.py
deleted file mode 100644
index a22e8e5..0000000
--- a/setup.py
+++ /dev/null
@@ -1,18 +0,0 @@
-from setuptools import setup
-from setuptools_rust import Binding, RustExtension
-
-setup(
- name="tiktoken",
- rust_extensions=[
- RustExtension(
- "tiktoken._tiktoken",
- binding=Binding.PyO3,
- # Between our use of editable installs and wanting to use Rust for performance sensitive
- # code, it makes sense to just always use --release
- debug=False,
- )
- ],
- package_data={"tiktoken": ["py.typed"]},
- packages=["tiktoken", "tiktoken_ext"],
- zip_safe=False,
-)
diff --git a/src/lib.rs b/src/lib.rs
index 70009d2..d202a4b 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -5,10 +5,6 @@ use std::collections::HashSet;
use std::thread;
use fancy_regex::Regex;
-use pyo3::exceptions;
-use pyo3::prelude::*;
-use pyo3::types::{PyBytes, PyList, PyTuple};
-use pyo3::PyResult;
use rustc_hash::FxHashMap as HashMap;
fn _byte_pair_merge(
@@ -169,7 +165,6 @@ fn hash_current_thread() -> usize {
}
const MAX_NUM_THREADS: usize = 128;
-#[pyclass]
struct CoreBPE {
encoder: HashMap<Vec<u8>, usize>,
special_tokens_encoder: HashMap<String, usize>,
@@ -192,19 +187,96 @@ impl CoreBPE {
&self.special_regex_tls[hash_current_thread() % MAX_NUM_THREADS]
}
- fn _decode_native(&self, tokens: &[usize]) -> Vec<u8> {
- let mut ret = Vec::with_capacity(tokens.len() * 2);
- for token in tokens {
- let token_bytes = self
- .decoder
- .get(token)
- .unwrap_or_else(|| &self.special_tokens_decoder[token]);
- ret.extend(token_bytes);
+ fn _increase_last_piece_token_len(
+ &self,
+ tokens: Vec<usize>,
+ mut last_piece_token_len: usize,
+ ) -> (Vec<usize>, usize) {
+ // Unfortunately, the locations where our regex splits can be unstable.
+ // For the purposes of determining unstable tokens, unstable regex splitting
+ // is only a problem if a split that was present disappears, since this can
+ // lead to merging of tokens otherwise thought to be stable.
+ // cl100k_base makes our life hard by including the \s*[\r\n]+
+ // pattern. This can e.g. cause "\n" + " " to become "\n \n".
+ // Here is a quick and dirty fix:
+ {
+ let token_is_all_space = |token| {
+ self.decoder
+ .get(token)
+ .map(|token_bytes| {
+ token_bytes
+ .iter()
+ .rev()
+ .all(|&b| [b' ', b'\n', b'\t'].contains(&b))
+ })
+ .unwrap_or(false)
+ };
+ if last_piece_token_len > 0
+ && token_is_all_space(&tokens[tokens.len() - last_piece_token_len])
+ {
+ while (last_piece_token_len < tokens.len())
+ && token_is_all_space(&tokens[tokens.len() - last_piece_token_len - 1])
+ {
+ last_piece_token_len += 1;
+ }
+ }
}
- ret
+ debug_assert!(last_piece_token_len <= tokens.len());
+
+ (tokens, last_piece_token_len)
+ }
+}
+
+impl CoreBPE {
+ pub fn new(
+ encoder: HashMap<Vec<u8>, usize>,
+ special_tokens_encoder: HashMap<String, usize>,
+ pattern: &str,
+ ) -> anyhow::Result<Self> {
+ let regex = Regex::new(pattern)
+ .map_err(|e| anyhow::anyhow!("Invalid regex: {}", e.to_string()))?;
+
+ let special_regex = {
+ let _parts = special_tokens_encoder
+ .keys()
+ .map(|s| fancy_regex::escape(s))
+ .collect::<Vec<_>>();
+ Regex::new(&_parts.join("|"))
+ .map_err(|e| anyhow::anyhow!("Invalid regex: {}", e.to_string()))?
+ };
+
+ let decoder: HashMap<usize, Vec<u8>> =
+ encoder.iter().map(|(k, v)| (*v, k.clone())).collect();
+
+ assert!(encoder.len() == decoder.len());
+
+ let special_tokens_decoder: HashMap<usize, Vec<u8>> = special_tokens_encoder
+ .iter()
+ .map(|(k, v)| (*v, k.as_bytes().to_vec()))
+ .collect();
+
+ // Clone because I don't know how to tell Rust I'm not going to change the map
+ let mut sorted_token_bytes: Vec<Vec<u8>> = encoder.keys().cloned().collect();
+ sorted_token_bytes.sort();
+
+ Ok(CoreBPE {
+ encoder,
+ special_tokens_encoder,
+ decoder,
+ special_tokens_decoder,
+ regex_tls: (0..MAX_NUM_THREADS).map(|_| regex.clone()).collect(),
+ special_regex_tls: (0..MAX_NUM_THREADS)
+ .map(|_| special_regex.clone())
+ .collect(),
+ sorted_token_bytes,
+ })
}
- fn _encode_ordinary_native(&self, text: &str) -> Vec<usize> {
+ // ====================
+ // Encoding
+ // ====================
+
+ pub fn encode_ordinary(&self, text: &str) -> Vec<usize> {
// This is the core of the encoding logic; the other functions in here
// just make things complicated :-)
let regex = self._get_tl_regex();
@@ -220,7 +292,7 @@ impl CoreBPE {
ret
}
- fn _encode_native(&self, text: &str, allowed_special: &HashSet<&str>) -> (Vec<usize>, usize) {
+ pub fn encode(&self, text: &str, allowed_special: HashSet<&str>) -> (Vec<usize>, usize) {
let special_regex = self._get_tl_special_regex();
let regex = self._get_tl_regex();
let mut ret = vec![];
@@ -276,51 +348,37 @@ impl CoreBPE {
(ret, last_piece_token_len)
}
- fn _increase_last_piece_token_len(
- &self,
- tokens: Vec<usize>,
- mut last_piece_token_len: usize,
- ) -> (Vec<usize>, usize) {
- // Unfortunately, the locations where our regex splits can be unstable.
- // For the purposes of determining unstable tokens, unstable regex splitting
- // is only a problem if a split that was present disappears, since this can
- // lead to merging of tokens otherwise thought to be stable.
- // cl100k_base makes our life hard by including the \s*[\r\n]+
- // pattern. This can e.g. cause "\n" + " " to become "\n \n".
- // Here is a quick and dirty fix:
- {
- let token_is_all_space = |token| {
- self.decoder
- .get(token)
- .map(|token_bytes| {
- token_bytes
- .iter()
- .rev()
- .all(|&b| [b' ', b'\n', b'\t'].contains(&b))
- })
- .unwrap_or(false)
- };
- if last_piece_token_len > 0
- && token_is_all_space(&tokens[tokens.len() - last_piece_token_len])
- {
- while (last_piece_token_len < tokens.len())
- && token_is_all_space(&tokens[tokens.len() - last_piece_token_len - 1])
- {
- last_piece_token_len += 1;
+ fn _encode_bytes(&self, bytes: &[u8]) -> Vec<usize> {
+ match std::str::from_utf8(bytes) {
+ Ok(text) => self.encode_ordinary(text),
+ Err(e) => {
+ let text = unsafe { std::str::from_utf8_unchecked(&bytes[..e.valid_up_to()]) };
+ let (tokens, last_piece_token_len) = self.encode(text, HashSet::new());
+ let (mut tokens, last_piece_token_len) =
+ self._increase_last_piece_token_len(tokens, last_piece_token_len);
+ if !tokens.is_empty() && last_piece_token_len > 0 {
+ // Lop off the tokens from the last piece and run BPE on the remaining bytes
+ // Somewhat niche, but this may not be correct if we'd have had a regex
+ // split between the valid UTF-8 and the invalid bytes, which is why this
+ // method is private
+ let mut unstable_bytes =
+ self.decode_bytes(&tokens[tokens.len() - last_piece_token_len..]);
+ unstable_bytes.extend_from_slice(&bytes[e.valid_up_to()..]);
+
+ tokens.truncate(tokens.len() - last_piece_token_len);
+ tokens.extend(byte_pair_encode(&unstable_bytes, &self.encoder));
}
+ tokens
}
}
- debug_assert!(last_piece_token_len <= tokens.len());
-
- (tokens, last_piece_token_len)
}
- fn _encode_unstable_native(
+ pub fn encode_with_unstable(
&self,
text: &str,
- allowed_special: &HashSet<&str>,
+ allowed_special: HashSet<&str>,
) -> (Vec<usize>, HashSet<Vec<usize>>) {
- let (tokens, last_piece_token_len) = self._encode_native(text, allowed_special);
+ let (tokens, last_piece_token_len) = self.encode(text, allowed_special);
if last_piece_token_len == 0 {
// If last_piece_token_len is zero, the last token was a special token and we have
// no unstable bytes
@@ -329,7 +387,7 @@ impl CoreBPE {
let (mut tokens, last_piece_token_len) =
self._increase_last_piece_token_len(tokens, last_piece_token_len);
- let unstable_bytes = self._decode_native(&tokens[tokens.len() - last_piece_token_len..]);
+ let unstable_bytes = self.decode_bytes(&tokens[tokens.len() - last_piece_token_len..]);
tokens.truncate(tokens.len() - last_piece_token_len);
// TODO: we should try harder to find additional stable tokens
@@ -377,7 +435,7 @@ impl CoreBPE {
// So convert to UTF-8 and do regex splitting.
// E.g. with cl100k_base " !" gets split to " " + " !",
// but byte_pair_encode(" !") != byte_pair_encode(" ")
- Ok(s) => self._encode_ordinary_native(s),
+ Ok(s) => self.encode_ordinary(s),
// Technically, whether or not this arm is correct depends on whether there
// would be a regex split before the UTF-8 truncation point.
@@ -430,108 +488,8 @@ impl CoreBPE {
(tokens, completions)
}
-}
-#[pymethods]
-impl CoreBPE {
- #[new]
- fn new(
- encoder: HashMap<Vec<u8>, usize>,
- special_tokens_encoder: HashMap<String, usize>,
- pattern: &str,
- ) -> PyResult<Self> {
- let regex = Regex::new(pattern)
- .map_err(|e| PyErr::new::<exceptions::PyValueError, _>(e.to_string()))?;
-
- let special_regex = {
- let _parts = special_tokens_encoder
- .keys()
- .map(|s| fancy_regex::escape(s))
- .collect::<Vec<_>>();
- Regex::new(&_parts.join("|"))
- .map_err(|e| PyErr::new::<exceptions::PyValueError, _>(e.to_string()))?
- };
-
- let decoder: HashMap<usize, Vec<u8>> =
- encoder.iter().map(|(k, v)| (*v, k.clone())).collect();
-
- assert!(encoder.len() == decoder.len());
-
- let special_tokens_decoder: HashMap<usize, Vec<u8>> = special_tokens_encoder
- .iter()
- .map(|(k, v)| (*v, k.as_bytes().to_vec()))
- .collect();
-
- // Clone because I don't know how to tell Rust I'm not going to change the map
- let mut sorted_token_bytes: Vec<Vec<u8>> = encoder.keys().cloned().collect();
- sorted_token_bytes.sort();
-
- Ok(CoreBPE {
- encoder,
- special_tokens_encoder,
- decoder,
- special_tokens_decoder,
- regex_tls: (0..MAX_NUM_THREADS).map(|_| regex.clone()).collect(),
- special_regex_tls: (0..MAX_NUM_THREADS)
- .map(|_| special_regex.clone())
- .collect(),
- sorted_token_bytes,
- })
- }
-
- // ====================
- // Encoding
- // ====================
-
- fn encode_ordinary(&self, py: Python, text: &str) -> Vec<usize> {
- py.allow_threads(|| self._encode_ordinary_native(text))
- }
-
- fn encode(&self, py: Python, text: &str, allowed_special: HashSet<&str>) -> Vec<usize> {
- py.allow_threads(|| self._encode_native(text, &allowed_special).0)
- }
-
- fn _encode_bytes(&self, py: Python, bytes: &[u8]) -> Vec<usize> {
- py.allow_threads(|| {
- match std::str::from_utf8(bytes) {
- Ok(text) => self._encode_ordinary_native(text),
- Err(e) => {
- let text = unsafe { std::str::from_utf8_unchecked(&bytes[..e.valid_up_to()]) };
- let (tokens, last_piece_token_len) = self._encode_native(text, &HashSet::new());
- let (mut tokens, last_piece_token_len) =
- self._increase_last_piece_token_len(tokens, last_piece_token_len);
- if !tokens.is_empty() && last_piece_token_len > 0 {
- // Lop off the tokens from the last piece and run BPE on the remaining bytes
- // Somewhat niche, but this may not be correct if we'd have had a regex
- // split between the valid UTF-8 and the invalid bytes, which is why this
- // method is private
- let mut unstable_bytes =
- self._decode_native(&tokens[tokens.len() - last_piece_token_len..]);
- unstable_bytes.extend_from_slice(&bytes[e.valid_up_to()..]);
-
- tokens.truncate(tokens.len() - last_piece_token_len);
- tokens.extend(byte_pair_encode(&unstable_bytes, &self.encoder));
- }
- tokens
- }
- }
- })
- }
-
- fn encode_with_unstable(
- &self,
- py: Python,
- text: &str,
- allowed_special: HashSet<&str>,
- ) -> Py<PyTuple> {
- let (tokens, completions) =
- py.allow_threads(|| self._encode_unstable_native(text, &allowed_special));
- let py_completions =
- PyList::new(py, completions.iter().map(|seq| PyList::new(py, &seq[..])));
- (tokens, py_completions).into_py(py)
- }
-
- fn encode_single_token(&self, piece: &[u8]) -> PyResult<usize> {
+ pub fn encode_single_token(&self, piece: &[u8]) -> anyhow::Result<usize> {
if let Some(token) = self.encoder.get(piece).copied() {
return Ok(token);
}
@@ -540,10 +498,10 @@ impl CoreBPE {
return Ok(token);
}
}
- Err(PyErr::new::<exceptions::PyKeyError, _>(piece.to_owned()))
+ Err(anyhow::anyhow!("Piece {:?} not found", piece))
}
- fn encode_single_piece(&self, piece: &[u8]) -> Vec<usize> {
+ pub fn encode_single_piece(&self, piece: &[u8]) -> Vec<usize> {
if let Some(token) = self.encoder.get(piece) {
return vec![*token];
}
@@ -554,39 +512,37 @@ impl CoreBPE {
// Decoding
// ====================
- fn decode_bytes(&self, py: Python, tokens: Vec<usize>) -> Py<PyBytes> {
- let bytes = py.allow_threads(|| self._decode_native(&tokens));
- PyBytes::new(py, &bytes).into()
+ pub fn decode_bytes(&self, tokens: &[usize]) -> Vec<u8> {
+ let mut ret = Vec::with_capacity(tokens.len() * 2);
+ for token in tokens {
+ let token_bytes = self
+ .decoder
+ .get(token)
+ .unwrap_or_else(|| &self.special_tokens_decoder[token]);
+ ret.extend(token_bytes);
+ }
+ ret
}
- fn decode_single_token_bytes(&self, py: Python, token: usize) -> PyResult<Py<PyBytes>> {
+ pub fn decode_single_token_bytes(&self, token: usize) -> anyhow::Result<&Vec<u8>> {
if let Some(bytes) = self.decoder.get(&token) {
- return Ok(PyBytes::new(py, bytes).into());
+ return Ok(bytes);
}
if let Some(bytes) = self.special_tokens_decoder.get(&token) {
- return Ok(PyBytes::new(py, bytes).into());
+ return Ok(bytes);
}
- Err(PyErr::new::<exceptions::PyKeyError, _>(token.to_string()))
+ Err(anyhow::anyhow!("Token {} not found", token))
}
// ====================
// Miscellaneous
// ====================
- fn token_byte_values(&self, py: Python) -> Vec<Py<PyBytes>> {
- self.sorted_token_bytes
- .iter()
- .map(|x| PyBytes::new(py, x).into())
- .collect()
+ pub fn token_byte_values(&self) -> &Vec<Vec<u8>> {
+ &self.sorted_token_bytes
}
}
-#[pymodule]
-fn _tiktoken(_py: Python, m: &PyModule) -> PyResult<()> {
- m.add_class::<CoreBPE>()?;
- Ok(())
-}
-
#[cfg(test)]
mod tests {
use rustc_hash::FxHashMap as HashMap;
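
The net effect of the src/lib.rs changes above is that CoreBPE becomes a plain Rust type: construction goes through `CoreBPE::new`, and encoding/decoding through `encode_ordinary`, `encode`, and `decode_bytes`, returning ordinary Rust values and `anyhow` errors instead of PyO3 objects. The following is a minimal sketch of how that surface could be exercised from inside the crate (for example alongside the existing tests module); the toy single-byte vocabulary, the `<|endoftext|>` rank of 256, and the gpt2-style split pattern are illustrative assumptions, not part of this diff.

#[test]
fn roundtrip_with_toy_vocab() {
    use std::collections::HashSet;
    use rustc_hash::FxHashMap as HashMap;
    use crate::CoreBPE;

    // Toy vocabulary: every single byte is its own token, so encoding always
    // bottoms out at byte level; real callers would load ranks from a .tiktoken file.
    let encoder: HashMap<Vec<u8>, usize> =
        (0..=255u8).map(|b| (vec![b], b as usize)).collect();
    let mut special_tokens_encoder: HashMap<String, usize> = HashMap::default();
    special_tokens_encoder.insert("<|endoftext|>".to_string(), 256);

    let pattern = r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+";
    let bpe = CoreBPE::new(encoder, special_tokens_encoder, pattern).unwrap();

    // Ordinary text round-trips through encode_ordinary / decode_bytes.
    let tokens = bpe.encode_ordinary("hello world");
    assert_eq!(bpe.decode_bytes(&tokens), b"hello world".to_vec());

    // Special tokens are only emitted when explicitly allowed.
    let allowed: HashSet<&str> = ["<|endoftext|>"].into_iter().collect();
    let (tokens, _last_piece_token_len) = bpe.encode("hello <|endoftext|>", allowed);
    assert_eq!(tokens.last().copied(), Some(256));
}
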
diff --git a/tests/test_simple_public.py b/tests/test_simple_public.py
deleted file mode 100644
index 8458c12..0000000
--- a/tests/test_simple_public.py
+++ /dev/null
@@ -1,42 +0,0 @@
-import subprocess
-import sys
-
-import tiktoken
-
-
-def test_simple():
- # Note that there are more actual tests, they're just not currently public :-)
- enc = tiktoken.get_encoding("gpt2")
- assert enc.encode("hello world") == [31373, 995]
- assert enc.decode([31373, 995]) == "hello world"
- assert enc.encode("hello <|endoftext|>", allowed_special="all") == [31373, 220, 50256]
-
- enc = tiktoken.get_encoding("cl100k_base")
- assert enc.encode("hello world") == [15339, 1917]
- assert enc.decode([15339, 1917]) == "hello world"
- assert enc.encode("hello <|endoftext|>", allowed_special="all") == [15339, 220, 100257]
-
- for enc_name in tiktoken.list_encoding_names():
- enc = tiktoken.get_encoding(enc_name)
- for token in range(10_000):
- assert enc.encode_single_token(enc.decode_single_token_bytes(token)) == token
-
-
-def test_encoding_for_model():
- enc = tiktoken.encoding_for_model("gpt2")
- assert enc.name == "gpt2"
- enc = tiktoken.encoding_for_model("text-davinci-003")
- assert enc.name == "p50k_base"
- enc = tiktoken.encoding_for_model("text-davinci-edit-001")
- assert enc.name == "p50k_edit"
- enc = tiktoken.encoding_for_model("gpt-3.5-turbo-0301")
- assert enc.name == "cl100k_base"
-
-
-def test_optional_blobfile_dependency():
- prog = """
-import tiktoken
-import sys
-assert "blobfile" not in sys.modules
-"""
- subprocess.check_call([sys.executable, "-c", prog])
diff --git a/tiktoken/__init__.py b/tiktoken/__init__.py
deleted file mode 100644
index 9ad09a3..0000000
--- a/tiktoken/__init__.py
+++ /dev/null
@@ -1,4 +0,0 @@
-from .core import Encoding as Encoding
-from .model import encoding_for_model as encoding_for_model
-from .registry import get_encoding as get_encoding
-from .registry import list_encoding_names as list_encoding_names
diff --git a/tiktoken/core.py b/tiktoken/core.py
deleted file mode 100644
index 05613aa..0000000
--- a/tiktoken/core.py
+++ /dev/null
@@ -1,329 +0,0 @@
-from __future__ import annotations
-
-import functools
-from concurrent.futures import ThreadPoolExecutor
-from typing import AbstractSet, Collection, Literal, NoReturn, Optional, Union
-
-import regex
-
-from tiktoken import _tiktoken
-
-
-class Encoding:
- def __init__(
- self,
- name: str,
- *,
- pat_str: str,
- mergeable_ranks: dict[bytes, int],
- special_tokens: dict[str, int],
- explicit_n_vocab: Optional[int] = None,
- ):
- """Creates an Encoding object.
-
- See openai_public.py for examples of how to construct an Encoding object.
-
- Args:
- name: The name of the encoding. It should be clear from the name of the encoding
- what behaviour to expect, in particular, encodings with different special tokens
- should have different names.
- pat_str: A regex pattern string that is used to split the input text.
- mergeable_ranks: A dictionary mapping mergeable token bytes to their ranks. The ranks
- must correspond to merge priority.
- special_tokens: A dictionary mapping special token strings to their token values.
- explicit_n_vocab: The number of tokens in the vocabulary. If provided, it is checked
- that the number of mergeable tokens and special tokens is equal to this number.
- """
- self.name = name
-
- self._pat_str = pat_str
- self._mergeable_ranks = mergeable_ranks
- self._special_tokens = special_tokens
-
- self.max_token_value = max(
- max(mergeable_ranks.values()), max(special_tokens.values(), default=0)
- )
- if explicit_n_vocab:
- assert len(mergeable_ranks) + len(special_tokens) == explicit_n_vocab
- assert self.max_token_value == explicit_n_vocab - 1
-
- self._core_bpe = _tiktoken.CoreBPE(mergeable_ranks, special_tokens, pat_str)
-
- def __repr__(self) -> str:
- return f""
-
- # ====================
- # Encoding
- # ====================
-
- def encode_ordinary(self, text: str) -> list[int]:
- """Encodes a string into tokens, ignoring special tokens.
-
- This is equivalent to `encode(text, disallowed_special=())` (but slightly faster).
-
- ```
- >>> enc.encode_ordinary("hello world")
- [31373, 995]
- """
- return self._core_bpe.encode_ordinary(text)
-
- def encode(
- self,
- text: str,
- *,
- allowed_special: Union[Literal["all"], AbstractSet[str]] = set(), # noqa: B006
- disallowed_special: Union[Literal["all"], Collection[str]] = "all",
- ) -> list[int]:
- """Encodes a string into tokens.
-
- Special tokens are artificial tokens used to unlock capabilities from a model,
- such as fill-in-the-middle. So we want to be careful about accidentally encoding special
- tokens, since they can be used to trick a model into doing something we don't want it to do.
-
- Hence, by default, encode will raise an error if it encounters text that corresponds
- to a special token. This can be controlled on a per-token level using the `allowed_special`
- and `disallowed_special` parameters. In particular:
- - Setting `disallowed_special` to () will prevent this function from raising errors and
- cause all text corresponding to special tokens to be encoded as natural text.
- - Setting `allowed_special` to "all" will cause this function to treat all text
- corresponding to special tokens to be encoded as special tokens.
-
- ```
- >>> enc.encode("hello world")
- [31373, 995]
- >>> enc.encode("<|endoftext|>", allowed_special={"<|endoftext|>"})
- [50256]
- >>> enc.encode("<|endoftext|>", allowed_special="all")
- [50256]
- >>> enc.encode("<|endoftext|>")
- # Raises ValueError
- >>> enc.encode("<|endoftext|>", disallowed_special=())
- [27, 91, 437, 1659, 5239, 91, 29]
- ```
- """
- if allowed_special == "all":
- allowed_special = self.special_tokens_set
- if disallowed_special == "all":
- disallowed_special = self.special_tokens_set - allowed_special
- if disallowed_special:
- if not isinstance(disallowed_special, frozenset):
- disallowed_special = frozenset(disallowed_special)
- if match := _special_token_regex(disallowed_special).search(text):
- raise_disallowed_special_token(match.group())
-
- return self._core_bpe.encode(text, allowed_special)
-
- def encode_ordinary_batch(self, text: list[str], *, num_threads: int = 8) -> list[list[int]]:
- """Encodes a list of strings into tokens, in parallel, ignoring special tokens.
-
- This is equivalent to `encode_batch(text, disallowed_special=())` (but slightly faster).
-
- ```
- >>> enc.encode_ordinary_batch(["hello world", "goodbye world"])
- [[31373, 995], [11274, 16390, 995]]
- ```
- """
- encoder = functools.partial(self.encode_ordinary)
- with ThreadPoolExecutor(num_threads) as e:
- return list(e.map(encoder, text))
-
- def encode_batch(
- self,
- text: list[str],
- *,
- num_threads: int = 8,
- allowed_special: Union[Literal["all"], AbstractSet[str]] = set(), # noqa: B006
- disallowed_special: Union[Literal["all"], Collection[str]] = "all",
- ) -> list[list[int]]:
- """Encodes a list of strings into tokens, in parallel.
-
- See `encode` for more details on `allowed_special` and `disallowed_special`.
-
- ```
- >>> enc.encode_batch(["hello world", "goodbye world"])
- [[31373, 995], [11274, 16390, 995]]
- ```
- """
- if allowed_special == "all":
- allowed_special = self.special_tokens_set
- if disallowed_special == "all":
- disallowed_special = self.special_tokens_set - allowed_special
- if not isinstance(disallowed_special, frozenset):
- disallowed_special = frozenset(disallowed_special)
-
- encoder = functools.partial(
- self.encode, allowed_special=allowed_special, disallowed_special=disallowed_special
- )
- with ThreadPoolExecutor(num_threads) as e:
- return list(e.map(encoder, text))
-
- def encode_with_unstable(
- self,
- text: str,
- *,
- allowed_special: Union[Literal["all"], AbstractSet[str]] = set(), # noqa: B006
- disallowed_special: Union[Literal["all"], Collection[str]] = "all",
- ) -> tuple[list[int], list[list[int]]]:
- """Encodes a string into stable tokens and possible completion sequences.
-
- Note that the stable tokens will only represent a substring of `text`.
-
- See `encode` for more details on `allowed_special` and `disallowed_special`.
-
- This API should itself be considered unstable.
-
- ```
- >>> enc.encode_with_unstable("hello fanta")
- ([31373], [(277, 4910), (5113, 265), ..., (8842,)])
-
- >>> text = "..."
- >>> stable_tokens, completions = enc.encode_with_unstable(text)
- >>> assert text.encode().startswith(enc.decode_bytes(stable_tokens))
- >>> assert all(enc.decode_bytes(stable_tokens + seq).startswith(text.encode()) for seq in completions)
- ```
- """
- if allowed_special == "all":
- allowed_special = self.special_tokens_set
- if disallowed_special == "all":
- disallowed_special = self.special_tokens_set - allowed_special
- if disallowed_special:
- if not isinstance(disallowed_special, frozenset):
- disallowed_special = frozenset(disallowed_special)
- if match := _special_token_regex(disallowed_special).search(text):
- raise_disallowed_special_token(match.group())
-
- return self._core_bpe.encode_with_unstable(text, allowed_special)
-
- def encode_single_token(self, text_or_bytes: Union[str, bytes]) -> int:
- """Encodes text corresponding to a single token to its token value.
-
- NOTE: this will encode all special tokens.
-
- Raises `KeyError` if the token is not in the vocabulary.
-
- ```
- >>> enc.encode_single_token("hello")
- 31373
- ```
- """
- if isinstance(text_or_bytes, str):
- text_or_bytes = text_or_bytes.encode("utf-8")
- return self._core_bpe.encode_single_token(text_or_bytes)
-
- # ====================
- # Decoding
- # ====================
-
- def decode_bytes(self, tokens: list[int]) -> bytes:
- """Decodes a list of tokens into bytes.
-
- ```
- >>> enc.decode_bytes([31373, 995])
- b'hello world'
- ```
- """
- return self._core_bpe.decode_bytes(tokens)
-
- def decode(self, tokens: list[int], errors: str = "replace") -> str:
- """Decodes a list of tokens into a string.
-
- WARNING: the default behaviour of this function is lossy, since decoded bytes are not
- guaranteed to be valid UTF-8. You can control this behaviour using the `errors` parameter,
- for instance, setting `errors=strict`.
-
- ```
- >>> enc.decode([31373, 995])
- 'hello world'
- ```
- """
- return self._core_bpe.decode_bytes(tokens).decode("utf-8", errors=errors)
-
- def decode_single_token_bytes(self, token: int) -> bytes:
- """Decodes a token into bytes.
-
- NOTE: this will decode all special tokens.
-
- Raises `KeyError` if the token is not in the vocabulary.
-
- ```
- >>> enc.decode_single_token_bytes(31373)
- b'hello'
- ```
- """
- return self._core_bpe.decode_single_token_bytes(token)
-
- def decode_tokens_bytes(self, tokens: list[int]) -> list[bytes]:
- """Decodes a list of tokens into a list of bytes.
-
- Useful for visualising tokenisation.
- >>> enc.decode_tokens_bytes([31373, 995])
- [b'hello', b' world']
- """
- return [self.decode_single_token_bytes(token) for token in tokens]
-
- # ====================
- # Miscellaneous
- # ====================
-
- def token_byte_values(self) -> list[bytes]:
- """Returns the list of all token byte values."""
- return self._core_bpe.token_byte_values()
-
- @property
- def eot_token(self) -> int:
- return self._special_tokens["<|endoftext|>"]
-
- @functools.cached_property
- def special_tokens_set(self) -> set[str]:
- return set(self._special_tokens.keys())
-
- @property
- def n_vocab(self) -> int:
- """For backwards compatibility. Prefer to use `enc.max_token_value + 1`."""
- return self.max_token_value + 1
-
- # ====================
- # Private
- # ====================
-
- def _encode_single_piece(self, text_or_bytes: Union[str, bytes]) -> list[int]:
- """Encodes text corresponding to bytes without a regex split.
-
- NOTE: this will not encode any special tokens.
-
- ```
- >>> enc.encode_single_piece("helloqqqq")
- [31373, 38227, 38227]
- ```
- """
- if isinstance(text_or_bytes, str):
- text_or_bytes = text_or_bytes.encode("utf-8")
- return self._core_bpe.encode_single_piece(text_or_bytes)
-
- def _encode_only_native_bpe(self, text: str) -> list[int]:
- """Encodes a string into tokens, but do regex splitting in Python."""
- _unused_pat = regex.compile(self._pat_str)
- ret = []
- for piece in regex.findall(_unused_pat, text):
- ret.extend(self._core_bpe.encode_single_piece(piece))
- return ret
-
- def _encode_bytes(self, text: bytes) -> list[int]:
- return self._core_bpe._encode_bytes(text)
-
-
-@functools.lru_cache(maxsize=128)
-def _special_token_regex(tokens: frozenset[str]) -> "regex.Pattern[str]":
- inner = "|".join(regex.escape(token) for token in tokens)
- return regex.compile(f"({inner})")
-
-
-def raise_disallowed_special_token(token: str) -> NoReturn:
- raise ValueError(
- f"Encountered text corresponding to disallowed special token {token!r}.\n"
- "If you want this text to be encoded as a special token, "
- f"pass it to `allowed_special`, e.g. `allowed_special={{{token!r}, ...}}`.\n"
- f"If you want this text to be encoded as normal text, disable the check for this token "
- f"by passing `disallowed_special=(enc.special_tokens_set - {{{token!r}}})`.\n"
- "To disable this check for all special tokens, pass `disallowed_special=()`.\n"
- )
diff --git a/tiktoken/load.py b/tiktoken/load.py
deleted file mode 100644
index 4a49ae4..0000000
--- a/tiktoken/load.py
+++ /dev/null
@@ -1,118 +0,0 @@
-from __future__ import annotations
-
-import base64
-import hashlib
-import json
-import os
-import tempfile
-import uuid
-
-import requests
-
-
-def read_file(blobpath: str) -> bytes:
- if not blobpath.startswith("http://") and not blobpath.startswith("https://"):
- try:
- import blobfile
- except ImportError:
- raise ImportError(
- "blobfile is not installed. Please install it by running `pip install blobfile`."
- )
- with blobfile.BlobFile(blobpath, "rb") as f:
- return f.read()
- # avoiding blobfile for public files helps avoid auth issues, like MFA prompts
- return requests.get(blobpath).content
-
-
-def read_file_cached(blobpath: str) -> bytes:
- if "TIKTOKEN_CACHE_DIR" in os.environ:
- cache_dir = os.environ["TIKTOKEN_CACHE_DIR"]
- elif "DATA_GYM_CACHE_DIR" in os.environ:
- cache_dir = os.environ["DATA_GYM_CACHE_DIR"]
- else:
- cache_dir = os.path.join(tempfile.gettempdir(), "data-gym-cache")
-
- if cache_dir == "":
- # disable caching
- return read_file(blobpath)
-
- cache_key = hashlib.sha1(blobpath.encode()).hexdigest()
-
- cache_path = os.path.join(cache_dir, cache_key)
- if os.path.exists(cache_path):
- with open(cache_path, "rb") as f:
- return f.read()
-
- contents = read_file(blobpath)
-
- os.makedirs(cache_dir, exist_ok=True)
- tmp_filename = cache_path + "." + str(uuid.uuid4()) + ".tmp"
- with open(tmp_filename, "wb") as f:
- f.write(contents)
- os.rename(tmp_filename, cache_path)
-
- return contents
-
-
-def data_gym_to_mergeable_bpe_ranks(
- vocab_bpe_file: str, encoder_json_file: str
-) -> dict[bytes, int]:
- # NB: do not add caching to this function
- rank_to_intbyte = [b for b in range(2**8) if chr(b).isprintable() and chr(b) != " "]
-
- data_gym_byte_to_byte = {chr(b): b for b in rank_to_intbyte}
- n = 0
- for b in range(2**8):
- if b not in rank_to_intbyte:
- rank_to_intbyte.append(b)
- data_gym_byte_to_byte[chr(2**8 + n)] = b
- n += 1
- assert len(rank_to_intbyte) == 2**8
-
- # vocab_bpe contains the merges along with associated ranks
- vocab_bpe_contents = read_file_cached(vocab_bpe_file).decode()
- bpe_merges = [tuple(merge_str.split()) for merge_str in vocab_bpe_contents.split("\n")[1:-1]]
-
- def decode_data_gym(value: str) -> bytes:
- return bytes(data_gym_byte_to_byte[b] for b in value)
-
- # add the single byte tokens
- bpe_ranks = {bytes([b]): i for i, b in enumerate(rank_to_intbyte)}
- # add the merged tokens
- n = len(bpe_ranks)
- for first, second in bpe_merges:
- bpe_ranks[decode_data_gym(first) + decode_data_gym(second)] = n
- n += 1
-
- # check that the encoder file matches the merges file
- # this sanity check is important since tiktoken assumes that ranks are ordered the same
- # as merge priority
- encoder_json = json.loads(read_file_cached(encoder_json_file))
- encoder_json_loaded = {decode_data_gym(k): v for k, v in encoder_json.items()}
- # drop these two special tokens if present, since they're not mergeable bpe tokens
- encoder_json_loaded.pop(b"<|endoftext|>", None)
- encoder_json_loaded.pop(b"<|startoftext|>", None)
- assert bpe_ranks == encoder_json_loaded
-
- return bpe_ranks
-
-
-def dump_tiktoken_bpe(bpe_ranks: dict[bytes, int], tiktoken_bpe_file: str) -> None:
- try:
- import blobfile
- except ImportError:
- raise ImportError(
- "blobfile is not installed. Please install it by running `pip install blobfile`."
- )
- with blobfile.BlobFile(tiktoken_bpe_file, "wb") as f:
- for token, rank in sorted(bpe_ranks.items(), key=lambda x: x[1]):
- f.write(base64.b64encode(token) + b" " + str(rank).encode() + b"\n")
-
-
-def load_tiktoken_bpe(tiktoken_bpe_file: str) -> dict[bytes, int]:
- # NB: do not add caching to this function
- contents = read_file_cached(tiktoken_bpe_file)
- return {
- base64.b64decode(token): int(rank)
- for token, rank in (line.split() for line in contents.splitlines() if line)
- }
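
As a point of reference for porting the loader to Rust, `dump_tiktoken_bpe` and `load_tiktoken_bpe` above pin down the on-disk `.tiktoken` format: one base64-encoded token, a single space, and a decimal rank per line. Below is a hypothetical Rust reader for that format, not part of this diff; it assumes a `base64` crate dependency (the 0.13-style `base64::decode` free function), which the Cargo.toml in this diff does not declare, and the helper name is illustrative.

// Hypothetical helper mirroring load_tiktoken_bpe: parse "<base64 token> <decimal rank>"
// lines into mergeable ranks. Assumes base64 = "0.13" in addition to the anyhow dependency.
use rustc_hash::FxHashMap as HashMap;

fn parse_tiktoken_bpe(contents: &str) -> anyhow::Result<HashMap<Vec<u8>, usize>> {
    let mut ranks: HashMap<Vec<u8>, usize> = HashMap::default();
    for line in contents.lines().filter(|l| !l.is_empty()) {
        let (token_b64, rank) = line
            .split_once(' ')
            .ok_or_else(|| anyhow::anyhow!("malformed rank line: {:?}", line))?;
        ranks.insert(base64::decode(token_b64)?, rank.parse::<usize>()?);
    }
    Ok(ranks)
}
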
diff --git a/tiktoken/model.py b/tiktoken/model.py
deleted file mode 100644
index b8af787..0000000
--- a/tiktoken/model.py
+++ /dev/null
@@ -1,75 +0,0 @@
-from __future__ import annotations
-
-from .core import Encoding
-from .registry import get_encoding
-
-# TODO: these will likely be replaced by an API endpoint
-MODEL_PREFIX_TO_ENCODING: dict[str, str] = {
- # chat
- "gpt-4-": "cl100k_base", # e.g., gpt-4-0314, etc., plus gpt-4-32k
- "gpt-3.5-turbo-": "cl100k_base", # e.g, gpt-3.5-turbo-0301, -0401, etc.
-}
-
-MODEL_TO_ENCODING: dict[str, str] = {
- # chat
- "gpt-4": "cl100k_base",
- "gpt-3.5-turbo": "cl100k_base",
- # text
- "text-davinci-003": "p50k_base",
- "text-davinci-002": "p50k_base",
- "text-davinci-001": "r50k_base",
- "text-curie-001": "r50k_base",
- "text-babbage-001": "r50k_base",
- "text-ada-001": "r50k_base",
- "davinci": "r50k_base",
- "curie": "r50k_base",
- "babbage": "r50k_base",
- "ada": "r50k_base",
- # code
- "code-davinci-002": "p50k_base",
- "code-davinci-001": "p50k_base",
- "code-cushman-002": "p50k_base",
- "code-cushman-001": "p50k_base",
- "davinci-codex": "p50k_base",
- "cushman-codex": "p50k_base",
- # edit
- "text-davinci-edit-001": "p50k_edit",
- "code-davinci-edit-001": "p50k_edit",
- # embeddings
- "text-embedding-ada-002": "cl100k_base",
- # old embeddings
- "text-similarity-davinci-001": "r50k_base",
- "text-similarity-curie-001": "r50k_base",
- "text-similarity-babbage-001": "r50k_base",
- "text-similarity-ada-001": "r50k_base",
- "text-search-davinci-doc-001": "r50k_base",
- "text-search-curie-doc-001": "r50k_base",
- "text-search-babbage-doc-001": "r50k_base",
- "text-search-ada-doc-001": "r50k_base",
- "code-search-babbage-code-001": "r50k_base",
- "code-search-ada-code-001": "r50k_base",
- # open source
- "gpt2": "gpt2",
-}
-
-
-def encoding_for_model(model_name: str) -> Encoding:
- """Returns the encoding used by a model."""
- encoding_name = None
- if model_name in MODEL_TO_ENCODING:
- encoding_name = MODEL_TO_ENCODING[model_name]
- else:
- # Check if the model matches a known prefix
- # Prefix matching avoids needing library updates for every model version release
- # Note that this can match on non-existent models (e.g., gpt-3.5-turbo-FAKE)
- for model_prefix, model_encoding_name in MODEL_PREFIX_TO_ENCODING.items():
- if model_name.startswith(model_prefix):
- return get_encoding(model_encoding_name)
-
- if encoding_name is None:
- raise KeyError(
- f"Could not automatically map {model_name} to a tokeniser. "
- "Please use `tiktok.get_encoding` to explicitly get the tokeniser you expect."
- ) from None
-
- return get_encoding(encoding_name)
diff --git a/tiktoken/py.typed b/tiktoken/py.typed
deleted file mode 100644
index e69de29..0000000
diff --git a/tiktoken/registry.py b/tiktoken/registry.py
deleted file mode 100644
index 52d8ec2..0000000
--- a/tiktoken/registry.py
+++ /dev/null
@@ -1,73 +0,0 @@
-from __future__ import annotations
-
-import importlib
-import pkgutil
-import threading
-from typing import Any, Callable, Optional
-
-import tiktoken_ext
-
-from tiktoken.core import Encoding
-
-_lock = threading.RLock()
-ENCODINGS: dict[str, Encoding] = {}
-ENCODING_CONSTRUCTORS: Optional[dict[str, Callable[[], dict[str, Any]]]] = None
-
-
-def _find_constructors() -> None:
- global ENCODING_CONSTRUCTORS
- with _lock:
- if ENCODING_CONSTRUCTORS is not None:
- return
- ENCODING_CONSTRUCTORS = {}
-
- # tiktoken_ext is a namespace package
- # submodules inside tiktoken_ext will be inspected for ENCODING_CONSTRUCTORS attributes
- # - we use namespace package pattern so `pkgutil.iter_modules` is fast
- # - it's a separate top-level package because namespace subpackages of non-namespace
- # packages don't quite do what you want with editable installs
- plugin_mods = pkgutil.iter_modules(tiktoken_ext.__path__, tiktoken_ext.__name__ + ".")
-
- for _, mod_name, _ in plugin_mods:
- mod = importlib.import_module(mod_name)
- try:
- constructors = mod.ENCODING_CONSTRUCTORS
- except AttributeError as e:
- raise ValueError(
- f"tiktoken plugin {mod_name} does not define ENCODING_CONSTRUCTORS"
- ) from e
- for enc_name, constructor in constructors.items():
- if enc_name in ENCODING_CONSTRUCTORS:
- raise ValueError(
- f"Duplicate encoding name {enc_name} in tiktoken plugin {mod_name}"
- )
- ENCODING_CONSTRUCTORS[enc_name] = constructor
-
-
-def get_encoding(encoding_name: str) -> Encoding:
- if encoding_name in ENCODINGS:
- return ENCODINGS[encoding_name]
-
- with _lock:
- if encoding_name in ENCODINGS:
- return ENCODINGS[encoding_name]
-
- if ENCODING_CONSTRUCTORS is None:
- _find_constructors()
- assert ENCODING_CONSTRUCTORS is not None
-
- if encoding_name not in ENCODING_CONSTRUCTORS:
- raise ValueError(f"Unknown encoding {encoding_name}")
-
- constructor = ENCODING_CONSTRUCTORS[encoding_name]
- enc = Encoding(**constructor())
- ENCODINGS[encoding_name] = enc
- return enc
-
-
-def list_encoding_names() -> list[str]:
- with _lock:
- if ENCODING_CONSTRUCTORS is None:
- _find_constructors()
- assert ENCODING_CONSTRUCTORS is not None
- return list(ENCODING_CONSTRUCTORS)
diff --git a/tiktoken_ext/openai_public.py b/tiktoken_ext/openai_public.py
deleted file mode 100644
index 16a6ec5..0000000
--- a/tiktoken_ext/openai_public.py
+++ /dev/null
@@ -1,88 +0,0 @@
-from tiktoken.load import data_gym_to_mergeable_bpe_ranks, load_tiktoken_bpe
-
-ENDOFTEXT = "<|endoftext|>"
-FIM_PREFIX = "<|fim_prefix|>"
-FIM_MIDDLE = "<|fim_middle|>"
-FIM_SUFFIX = "<|fim_suffix|>"
-ENDOFPROMPT = "<|endofprompt|>"
-
-
-def gpt2():
- mergeable_ranks = data_gym_to_mergeable_bpe_ranks(
- vocab_bpe_file="https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/vocab.bpe",
- encoder_json_file="https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/encoder.json",
- )
- return {
- "name": "gpt2",
- "explicit_n_vocab": 50257,
- "pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
- "mergeable_ranks": mergeable_ranks,
- "special_tokens": {"<|endoftext|>": 50256},
- }
-
-
-def r50k_base():
- mergeable_ranks = load_tiktoken_bpe(
- "https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken"
- )
- return {
- "name": "r50k_base",
- "explicit_n_vocab": 50257,
- "pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
- "mergeable_ranks": mergeable_ranks,
- "special_tokens": {ENDOFTEXT: 50256},
- }
-
-
-def p50k_base():
- mergeable_ranks = load_tiktoken_bpe(
- "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken"
- )
- return {
- "name": "p50k_base",
- "explicit_n_vocab": 50281,
- "pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
- "mergeable_ranks": mergeable_ranks,
- "special_tokens": {ENDOFTEXT: 50256},
- }
-
-
-def p50k_edit():
- mergeable_ranks = load_tiktoken_bpe(
- "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken"
- )
- special_tokens = {ENDOFTEXT: 50256, FIM_PREFIX: 50281, FIM_MIDDLE: 50282, FIM_SUFFIX: 50283}
- return {
- "name": "p50k_edit",
- "pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
- "mergeable_ranks": mergeable_ranks,
- "special_tokens": special_tokens,
- }
-
-
-def cl100k_base():
- mergeable_ranks = load_tiktoken_bpe(
- "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"
- )
- special_tokens = {
- ENDOFTEXT: 100257,
- FIM_PREFIX: 100258,
- FIM_MIDDLE: 100259,
- FIM_SUFFIX: 100260,
- ENDOFPROMPT: 100276,
- }
- return {
- "name": "cl100k_base",
- "pat_str": r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""",
- "mergeable_ranks": mergeable_ranks,
- "special_tokens": special_tokens,
- }
-
-
-ENCODING_CONSTRUCTORS = {
- "gpt2": gpt2,
- "r50k_base": r50k_base,
- "p50k_base": p50k_base,
- "p50k_edit": p50k_edit,
- "cl100k_base": cl100k_base,
-}