From 40d9b1f14ef221fe040e0d6c3587375632328c25 Mon Sep 17 00:00:00 2001
From: Shantanu Jain <shantanu@openai.com>
Date: Tue, 3 Jan 2023 13:20:35 -0800
Subject: [PATCH] Update codebase

---
 .github/workflows/build_wheels.yml |  2 +-
 CHANGELOG.md                       | 12 ++++++++++++
 MANIFEST.in                        |  2 ++
 pyproject.toml                     |  4 ++--
 scripts/redact.py                  |  2 ++
 setup.py                           |  3 ++-
 tests/test_simple_public.py        |  8 ++++++++
 tiktoken/core.py                   |  2 ++
 tiktoken/load.py                   | 17 +++++++++++++----
 tiktoken/py.typed                  |  0
 tiktoken/registry.py               |  2 ++
 tiktoken_ext/openai_public.py      | 16 +++++++++++-----
 12 files changed, 57 insertions(+), 13 deletions(-)
 create mode 100644 CHANGELOG.md
 create mode 100644 tiktoken/py.typed

diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml
index b71f959..d2e8dc2 100644
--- a/.github/workflows/build_wheels.yml
+++ b/.github/workflows/build_wheels.yml
@@ -16,7 +16,7 @@ jobs:
         # cibuildwheel builds linux wheels inside a manylinux container
         # it also takes care of procuring the correct python version for us
         os: [ubuntu-latest, windows-latest, macos-latest]
-        python-version: [39, 310, 311]
+        python-version: [38, 39, 310, 311]
 
     steps:
       - uses: actions/checkout@v3
diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 0000000..a606553
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,12 @@
+# Changelog
+
+This is the changelog for the open source version of tiktoken.
+
+## [v0.1.2]
+- Avoid use of `blobfile` for public files
+- Add support for Python 3.8
+- Add py.typed
+- Improve the public tests
+
+## [v0.1.1]
+- Initial release
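The changelog entries above are the user-visible summary; the diffs that follow implement them. As a quick sanity check after upgrading, the sketch below (illustrative only, not part of the patch; it uses just the standard library plus tiktoken itself) confirms the installed version and that a public encoding now loads over plain HTTPS:

    # Sanity-check sketch for the release notes above (not part of the patch).
    import importlib.metadata

    import tiktoken

    print(importlib.metadata.version("tiktoken"))  # expect "0.1.2" for this release
    enc = tiktoken.get_encoding("gpt2")            # fetched over HTTPS, no blob-storage auth needed
    print(enc.encode("hello world"))               # [31373, 995], matching the public test below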
diff --git a/MANIFEST.in b/MANIFEST.in
index 558a5ec..7f25b27 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,6 +1,8 @@
 include *.svg
 include *.toml
+include *.md
 include Makefile
+global-include py.typed
 recursive-include scripts *.py
 recursive-include tests *.py
 recursive-include src *.rs
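With `include *.md` and `global-include py.typed`, source distributions pick up the new CHANGELOG.md and the typing marker. One rough way to confirm what an sdist contains, assuming one has been built locally and the path below is hypothetical:

    # List an sdist's members to confirm py.typed and CHANGELOG.md are packaged (hypothetical path).
    import tarfile

    with tarfile.open("dist/tiktoken-0.1.2.tar.gz") as sdist:
        names = sdist.getnames()

    print(any(name.endswith("tiktoken/py.typed") for name in names))  # expect True
    print(any(name.endswith("CHANGELOG.md") for name in names))       # expect True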
diff --git a/pyproject.toml b/pyproject.toml
index 0d4327b..4fcd2ec 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,8 +1,8 @@
 [project]
 name = "tiktoken"
-dependencies = ["blobfile>=2", "regex>=2022.1.18"]
+dependencies = ["blobfile>=2", "regex>=2022.1.18", "requests>=2.26.0"]
 dynamic = ["version"]
-requires-python = ">=3.9"
+requires-python = ">=3.8"
 
 [build-system]
 build-backend = "setuptools.build_meta"
diff --git a/scripts/redact.py b/scripts/redact.py
index bcf8ef1..d82db32 100644
--- a/scripts/redact.py
+++ b/scripts/redact.py
@@ -9,6 +9,8 @@ def redact_file(path: Path, dry_run: bool) -> None:
         return
 
     text = path.read_text()
+    if not text:
+        return
 
     first_line = text.splitlines()[0]
     if "redact" in first_line:
diff --git a/setup.py b/setup.py
index d7f9373..179392b 100644
--- a/setup.py
+++ b/setup.py
@@ -4,7 +4,7 @@ from setuptools_rust import Binding, RustExtension
 public = True
 
 if public:
-    version = "0.1.1"
+    version = "0.1.2"
 
 setup(
     name="tiktoken",
@@ -18,6 +18,7 @@ setup(
             debug=False,
         )
     ],
+    package_data={"tiktoken": ["py.typed"]},
     packages=["tiktoken", "tiktoken_ext"],
     zip_safe=False,
 )
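Bumping the version and adding `package_data` ships the new `tiktoken/py.typed` marker in wheels (the `global-include py.typed` in MANIFEST.in above covers the sdist). Per PEP 561, the empty marker file is what tells mypy and other type checkers to use the package's inline annotations. A rough check against an installed copy, assuming Python 3.8+ (illustrative only):

    # Verify an installed tiktoken exposes the PEP 561 marker (illustrative sketch).
    import importlib.resources

    # importlib.resources.is_resource is available on 3.8; on newer interpreters
    # importlib.resources.files("tiktoken").joinpath("py.typed").is_file() works too.
    print(importlib.resources.is_resource("tiktoken", "py.typed"))  # expect True for >= 0.1.2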
diff --git a/tests/test_simple_public.py b/tests/test_simple_public.py
index 5b69162..2987e4e 100644
--- a/tests/test_simple_public.py
+++ b/tests/test_simple_public.py
@@ -2,10 +2,18 @@ import tiktoken
 
 
 def test_simple():
+    # Note that there are more actual tests; they're just not currently public :-)
     enc = tiktoken.get_encoding("gpt2")
     assert enc.encode("hello world") == [31373, 995]
     assert enc.decode([31373, 995]) == "hello world"
+    assert enc.encode("hello <|endoftext|>", allowed_special="all") == [31373, 220, 50256]
 
     enc = tiktoken.get_encoding("cl100k_base")
     assert enc.encode("hello world") == [15339, 1917]
     assert enc.decode([15339, 1917]) == "hello world"
+    assert enc.encode("hello <|endoftext|>", allowed_special="all") == [15339, 220, 100257]
+
+    for enc_name in tiktoken.list_encoding_names():
+        enc = tiktoken.get_encoding(enc_name)
+        for token in range(10_000):
+            assert enc.encode_single_token(enc.decode_single_token_bytes(token)) == token
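The added assertions rely on two behaviours worth spelling out: special-token text is rejected by `encode` unless explicitly allowed (hence `allowed_special="all"`), and the single-token helpers round-trip between token ids and their byte representations for every registered encoding. A hedged usage sketch; the ValueError reflects tiktoken's handling of disallowed special tokens, and the token ids come from the assertions above:

    # Usage sketch for the behaviours exercised by the new test lines.
    import tiktoken

    enc = tiktoken.get_encoding("gpt2")

    try:
        enc.encode("hello <|endoftext|>")        # special-token text is disallowed by default
    except ValueError:
        print("opt in with allowed_special to encode special-token text")

    print(enc.encode("hello <|endoftext|>", allowed_special="all"))  # [31373, 220, 50256]

    # decode_single_token_bytes / encode_single_token round-trip, as in the loop above.
    assert enc.decode([31373, 995]) == "hello world"
    assert enc.encode_single_token(enc.decode_single_token_bytes(995)) == 995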
diff --git a/tiktoken/core.py b/tiktoken/core.py
index c566a52..d2367bc 100644
--- a/tiktoken/core.py
+++ b/tiktoken/core.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import functools
 from concurrent.futures import ThreadPoolExecutor
 from typing import AbstractSet, Collection, Literal, NoReturn, Optional, Union
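`from __future__ import annotations` (added here and in load.py and registry.py below) postpones evaluation of annotations, so type hints written with newer syntax (builtin generics like dict[bytes, int], new-style unions) are stored as strings rather than evaluated at import time, which is what makes the Python 3.8 floor in pyproject.toml workable. A standalone illustration of the mechanism, not code from this repository:

    # Standalone sketch: postponed evaluation keeps 3.9+/3.10+ annotation syntax importable on 3.8.
    from __future__ import annotations


    def lookup(keys: list[str]) -> dict[str, bytes] | None:
        # Without the future import, list[str] and the `|` union above would raise
        # TypeError at definition time on Python 3.8; with it they stay as strings.
        return None


    print(lookup.__annotations__)  # {'keys': 'list[str]', 'return': 'dict[str, bytes] | None'}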
diff --git a/tiktoken/load.py b/tiktoken/load.py
index 06e51cc..fefd62a 100644
--- a/tiktoken/load.py
+++ b/tiktoken/load.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import base64
 import hashlib
 import json
@@ -5,6 +7,15 @@ import os
 import uuid
 
 import blobfile
+import requests
+
+
+def read_file(blobpath: str) -> bytes:
+    if not blobpath.startswith("http://") and not blobpath.startswith("https://"):
+        with blobfile.BlobFile(blobpath, "rb") as f:
+            return f.read()
+    # avoiding blobfile for public files sidesteps auth issues, like MFA prompts
+    return requests.get(blobpath).content
 
 
 def read_file_cached(blobpath: str) -> bytes:
@@ -17,8 +28,7 @@ def read_file_cached(blobpath: str) -> bytes:
 
     if cache_dir == "":
         # disable caching
-        with blobfile.BlobFile(blobpath, "rb") as f:
-            return f.read()
+        return read_file(blobpath)
 
     cache_key = hashlib.sha1(blobpath.encode()).hexdigest()
 
@@ -27,8 +37,7 @@ def read_file_cached(blobpath: str) -> bytes:
         with open(cache_path, "rb") as f:
             return f.read()
 
-    with blobfile.BlobFile(blobpath, "rb") as f:
-        contents = f.read()
+    contents = read_file(blobpath)
 
     os.makedirs(cache_dir, exist_ok=True)
     tmp_filename = cache_path + "." + str(uuid.uuid4()) + ".tmp"
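Net effect in load.py: http(s) URLs for public files are fetched with `requests`, while other paths still go through `blobfile`, and downloads are cached under a sha1 key via a uuid-suffixed temp file (presumably renamed into place just after the lines shown), so a partial download never becomes a cache hit. A condensed standalone sketch of that flow with hypothetical names (`fetch`, `fetch_cached`); plain `open()` stands in for the blobfile branch:

    # Condensed sketch of the fetch-and-cache flow above (hypothetical helper names).
    import hashlib
    import os
    import uuid

    import requests


    def fetch(url_or_path: str) -> bytes:
        if url_or_path.startswith(("http://", "https://")):
            return requests.get(url_or_path).content  # public file: no blob-storage auth
        with open(url_or_path, "rb") as f:            # the real helper uses blobfile here
            return f.read()


    def fetch_cached(url_or_path: str, cache_dir: str) -> bytes:
        if cache_dir == "":
            return fetch(url_or_path)  # caching disabled

        cache_key = hashlib.sha1(url_or_path.encode()).hexdigest()
        cache_path = os.path.join(cache_dir, cache_key)
        if os.path.exists(cache_path):
            with open(cache_path, "rb") as f:
                return f.read()

        contents = fetch(url_or_path)
        os.makedirs(cache_dir, exist_ok=True)
        tmp_path = cache_path + "." + str(uuid.uuid4()) + ".tmp"
        with open(tmp_path, "wb") as f:
            f.write(contents)            # write to a temp file first...
        os.rename(tmp_path, cache_path)  # ...then move it into place
        return contents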
diff --git a/tiktoken/py.typed b/tiktoken/py.typed
new file mode 100644
index 0000000..e69de29
diff --git a/tiktoken/registry.py b/tiktoken/registry.py
index 24bb173..52d8ec2 100644
--- a/tiktoken/registry.py
+++ b/tiktoken/registry.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import importlib
 import pkgutil
 import threading
diff --git a/tiktoken_ext/openai_public.py b/tiktoken_ext/openai_public.py
index 756be2d..b0ec6ab 100644
--- a/tiktoken_ext/openai_public.py
+++ b/tiktoken_ext/openai_public.py
@@ -9,8 +9,8 @@ ENDOFPROMPT = "<|endofprompt|>"
 
 def gpt2():
     mergeable_ranks = data_gym_to_mergeable_bpe_ranks(
-        vocab_bpe_file="az://openaipublic/gpt-2/encodings/main/vocab.bpe",
-        encoder_json_file="az://openaipublic/gpt-2/encodings/main/encoder.json",
+        vocab_bpe_file="https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/vocab.bpe",
+        encoder_json_file="https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/encoder.json",
     )
     return {
         "name": "gpt2",
@@ -22,7 +22,9 @@ def gpt2():
 
 
 def r50k_base():
-    mergeable_ranks = load_tiktoken_bpe("az://openaipublic/encodings/r50k_base.tiktoken")
+    mergeable_ranks = load_tiktoken_bpe(
+        "https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken"
+    )
     return {
         "name": "r50k_base",
         "explicit_n_vocab": 50257,
@@ -33,7 +35,9 @@ def r50k_base():
 
 
 def p50k_base():
-    mergeable_ranks = load_tiktoken_bpe("az://openaipublic/encodings/p50k_base.tiktoken")
+    mergeable_ranks = load_tiktoken_bpe(
+        "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken"
+    )
     return {
         "name": "p50k_base",
         "explicit_n_vocab": 50281,
@@ -44,7 +48,9 @@ def p50k_base():
 
 
 def cl100k_base():
-    mergeable_ranks = load_tiktoken_bpe("az://openaipublic/encodings/cl100k_base.tiktoken")
+    mergeable_ranks = load_tiktoken_bpe(
+        "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"
+    )
     special_tokens = {
         ENDOFTEXT: 100257,
         FIM_PREFIX: 100258,