Update codebase
parent 0f8ec705e2
commit 40d9b1f14e
.github/workflows/build_wheels.yml
@@ -16,7 +16,7 @@ jobs:
         # cibuildwheel builds linux wheels inside a manylinux container
         # it also takes care of procuring the correct python version for us
         os: [ubuntu-latest, windows-latest, macos-latest]
-        python-version: [39, 310, 311]
+        python-version: [38, 39, 310, 311]
 
     steps:
       - uses: actions/checkout@v3
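Each matrix entry is a bare CPython tag rather than a dotted version; presumably the workflow turns it into a cibuildwheel build selector (something like `cp${{ matrix.python-version }}-*` via `CIBW_BUILD`). A minimal sketch of that assumed mapping:

```python
# Illustration only: map the matrix entries to cibuildwheel-style build
# selectors. The selector format (cp38-*, cp39-*, ...) is an assumption
# about how the workflow consumes these values, not part of this diff.
python_versions = [38, 39, 310, 311]
selectors = [f"cp{v}-*" for v in python_versions]
print(selectors)  # ['cp38-*', 'cp39-*', 'cp310-*', 'cp311-*']
```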
CHANGELOG.md (new file)
@@ -0,0 +1,12 @@
+# Changelog
+
+This is the changelog for the open source version of tiktoken.
+
+## [v0.1.2]
+- Avoid use of `blobfile` for public files
+- Add support for Python 3.8
+- Add py.typed
+- Improve the public tests
+
+## [v0.1.1]
+- Initial release
MANIFEST.in
@@ -1,6 +1,8 @@
 include *.svg
 include *.toml
+include *.md
 include Makefile
+global-include py.typed
 recursive-include scripts *.py
 recursive-include tests *.py
 recursive-include src *.rs
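The new `global-include py.typed` line makes sure the PEP 561 marker travels with source distributions. A hypothetical spot check, assuming an sdist has already been built into `dist/` (for example with `python -m build --sdist`):

```python
# Hypothetical check that a freshly built sdist ships the py.typed marker.
import glob
import tarfile

sdist = sorted(glob.glob("dist/tiktoken-*.tar.gz"))[-1]
with tarfile.open(sdist) as tf:
    assert any(name.endswith("tiktoken/py.typed") for name in tf.getnames())
```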
pyproject.toml
@@ -1,8 +1,8 @@
 [project]
 name = "tiktoken"
-dependencies = ["blobfile>=2", "regex>=2022.1.18"]
+dependencies = ["blobfile>=2", "regex>=2022.1.18", "requests>=2.26.0"]
 dynamic = ["version"]
-requires-python = ">=3.9"
+requires-python = ">=3.8"
 
 [build-system]
 build-backend = "setuptools.build_meta"
scripts/redact.py
@@ -9,6 +9,8 @@ def redact_file(path: Path, dry_run: bool) -> None:
         return
 
     text = path.read_text()
+    if not text:
+        return
 
     first_line = text.splitlines()[0]
     if "redact" in first_line:
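The added guard exists because `str.splitlines()` on an empty string returns an empty list, so the `[0]` index just below it would raise `IndexError` for empty files. A tiny demonstration of the failure mode it prevents:

```python
# What the new `if not text: return` guard protects against:
text = ""
assert text.splitlines() == []   # nothing to index
# text.splitlines()[0]           # would raise IndexError on an empty file
```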
setup.py
@@ -4,7 +4,7 @@ from setuptools_rust import Binding, RustExtension
 public = True
 
 if public:
-    version = "0.1.1"
+    version = "0.1.2"
 
 setup(
     name="tiktoken",
@@ -18,6 +18,7 @@ setup(
             debug=False,
         )
     ],
+    package_data={"tiktoken": ["py.typed"]},
     packages=["tiktoken", "tiktoken_ext"],
     zip_safe=False,
 )
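`package_data` plus the new, empty `tiktoken/py.typed` marker (further down) is what makes the installed package PEP 561 compliant, so type checkers use tiktoken's own annotations instead of treating the import as untyped. A hypothetical downstream snippet that a checker such as mypy would now verify against the real signatures; the expected token ids are taken from the public test below:

```python
from __future__ import annotations

import tiktoken

enc = tiktoken.get_encoding("gpt2")
tokens: list[int] = enc.encode("hello world")  # annotation checked against tiktoken's inline types
assert tokens == [31373, 995]
```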
tests/test_simple_public.py
@@ -2,10 +2,18 @@ import tiktoken
 
 
 def test_simple():
+    # Note that there are more actual tests, they're just not currently public :-)
     enc = tiktoken.get_encoding("gpt2")
     assert enc.encode("hello world") == [31373, 995]
     assert enc.decode([31373, 995]) == "hello world"
+    assert enc.encode("hello <|endoftext|>", allowed_special="all") == [31373, 220, 50256]
 
     enc = tiktoken.get_encoding("cl100k_base")
     assert enc.encode("hello world") == [15339, 1917]
     assert enc.decode([15339, 1917]) == "hello world"
+    assert enc.encode("hello <|endoftext|>", allowed_special="all") == [15339, 220, 100257]
+
+    for enc_name in tiktoken.list_encoding_names():
+        enc = tiktoken.get_encoding(enc_name)
+        for token in range(10_000):
+            assert enc.encode_single_token(enc.decode_single_token_bytes(token)) == token
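The new loop asserts a round-trip property: for the first 10,000 token ids of every registered encoding, decoding the id to its raw bytes and re-encoding those bytes as a single token gives the id back. The same property for one concrete id, taken from the assertions above, as a standalone sketch:

```python
import tiktoken

enc = tiktoken.get_encoding("gpt2")
token = 31373                                 # id for "hello" in the test above
raw = enc.decode_single_token_bytes(token)    # -> b"hello"
assert enc.encode_single_token(raw) == token  # round-trips exactly
```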
tiktoken/core.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import functools
 from concurrent.futures import ThreadPoolExecutor
 from typing import AbstractSet, Collection, Literal, NoReturn, Optional, Union
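Adding `from __future__ import annotations` here (and in the two modules below) is what makes the `requires-python = ">=3.8"` change workable: annotations are stored as strings rather than evaluated, so newer-style hints no longer fail at import time on 3.8. A minimal, hypothetical illustration of the failure it avoids:

```python
from __future__ import annotations


def decode_ids(tokens: list[int]) -> str:
    # Without the __future__ import, evaluating `list[int]` in this annotation
    # raises TypeError at definition time on Python 3.8 (builtin generics need 3.9+);
    # with it, the annotation is kept as a string and never evaluated.
    return " ".join(str(t) for t in tokens)
```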
tiktoken/load.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import base64
 import hashlib
 import json
@@ -5,6 +7,15 @@ import os
 import uuid
 
 import blobfile
+import requests
 
 
+def read_file(blobpath: str) -> bytes:
+    if not blobpath.startswith("http://") and not blobpath.startswith("https://"):
+        with blobfile.BlobFile(blobpath, "rb") as f:
+            return f.read()
+    # avoiding blobfile for public files helps avoid auth issues, like MFA prompts
+    return requests.get(blobpath).content
+
+
 def read_file_cached(blobpath: str) -> bytes:
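With this helper in place, public `https://` URLs bypass blobfile entirely; only non-HTTP paths still go through `blobfile.BlobFile`. A quick usage sketch, assuming network access and using one of the public URLs from the diff further down:

```python
from tiktoken.load import read_file

# Public file: fetched with plain requests, so no blobfile auth (or MFA prompt) is involved.
data = read_file("https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken")
print(len(data))  # size of the raw BPE ranks file in bytes
```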
@@ -17,8 +28,7 @@ def read_file_cached(blobpath: str) -> bytes:
 
     if cache_dir == "":
         # disable caching
-        with blobfile.BlobFile(blobpath, "rb") as f:
-            return f.read()
+        return read_file(blobpath)
 
     cache_key = hashlib.sha1(blobpath.encode()).hexdigest()
 
@@ -27,8 +37,7 @@ def read_file_cached(blobpath: str) -> bytes:
         with open(cache_path, "rb") as f:
             return f.read()
 
-    with blobfile.BlobFile(blobpath, "rb") as f:
-        contents = f.read()
+    contents = read_file(blobpath)
 
     os.makedirs(cache_dir, exist_ok=True)
     tmp_filename = cache_path + "." + str(uuid.uuid4()) + ".tmp"
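The cache write path keeps the existing pattern of writing to a uuid-suffixed temporary name first; presumably that file is then renamed over `cache_path`, so concurrent readers never see a partially downloaded ranks file. A generic sketch of that write-then-rename idiom (not the library's exact code):

```python
import os
import uuid


def atomic_write(path: str, contents: bytes) -> None:
    # Write to a uniquely named temp file next to the target, then rename it
    # into place. os.replace is atomic on the same filesystem, so readers see
    # either the old file or the complete new one, never a partial write.
    tmp = f"{path}.{uuid.uuid4()}.tmp"
    with open(tmp, "wb") as f:
        f.write(contents)
    os.replace(tmp, path)
```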
tiktoken/py.typed (new, empty file)
tiktoken/registry.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import importlib
 import pkgutil
 import threading
tiktoken_ext/openai_public.py
@@ -9,8 +9,8 @@ ENDOFPROMPT = "<|endofprompt|>"
 
 def gpt2():
     mergeable_ranks = data_gym_to_mergeable_bpe_ranks(
-        vocab_bpe_file="az://openaipublic/gpt-2/encodings/main/vocab.bpe",
-        encoder_json_file="az://openaipublic/gpt-2/encodings/main/encoder.json",
+        vocab_bpe_file="https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/vocab.bpe",
+        encoder_json_file="https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/encoder.json",
     )
     return {
         "name": "gpt2",
@@ -22,7 +22,9 @@ def gpt2():
 
 
 def r50k_base():
-    mergeable_ranks = load_tiktoken_bpe("az://openaipublic/encodings/r50k_base.tiktoken")
+    mergeable_ranks = load_tiktoken_bpe(
+        "https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken"
+    )
     return {
         "name": "r50k_base",
         "explicit_n_vocab": 50257,
@@ -33,7 +35,9 @@ def r50k_base():
 
 
 def p50k_base():
-    mergeable_ranks = load_tiktoken_bpe("az://openaipublic/encodings/p50k_base.tiktoken")
+    mergeable_ranks = load_tiktoken_bpe(
+        "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken"
+    )
     return {
         "name": "p50k_base",
         "explicit_n_vocab": 50281,
@@ -44,7 +48,9 @@ def p50k_base():
 
 
 def cl100k_base():
-    mergeable_ranks = load_tiktoken_bpe("az://openaipublic/encodings/cl100k_base.tiktoken")
+    mergeable_ranks = load_tiktoken_bpe(
+        "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"
+    )
     special_tokens = {
         ENDOFTEXT: 100257,
         FIM_PREFIX: 100258,
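These constructors are presumably what the registry module above discovers through the `tiktoken_ext` namespace, so the `az://` to `https://` switch is invisible to callers: `get_encoding` behaves exactly as before, just without blobfile auth for the download. Caller-visible behaviour, as a short sketch:

```python
import tiktoken

# First use downloads the ranks file from the public https endpoint (and caches it);
# encode/decode results are unchanged by the URL switch.
enc = tiktoken.get_encoding("cl100k_base")
assert enc.decode(enc.encode("hello world")) == "hello world"
```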