From 446cb49affe9955eba1eb687f2eb9580c501a3e5 Mon Sep 17 00:00:00 2001
From: Shantanu Jain <shantanu@openai.com>
Date: Thu, 16 Mar 2023 18:11:50 -0700
Subject: [PATCH] Bump version, sync codebase

---
 CHANGELOG.md      | 3 +++
 Cargo.toml        | 2 +-
 README.md         | 4 ++--
 src/lib.rs        | 2 +-
 tiktoken/model.py | 4 +++-
 5 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a7dce9d..d0365b8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,9 @@
 
 This is the changelog for the open source version of tiktoken.
 
+## [v0.3.2]
+- Add encoding for GPT-4
+
 ## [v0.3.1]
 - Build aarch64 wheels
 - Make `blobfile` an optional dependency
diff --git a/Cargo.toml b/Cargo.toml
index 912af00..07182cd 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tiktoken"
-version = "0.3.1"
+version = "0.3.2"
 edition = "2021"
 rust-version = "1.57.0"
 
diff --git a/README.md b/README.md
index 6a5c5f2..c96f1b4 100644
--- a/README.md
+++ b/README.md
@@ -5,11 +5,11 @@ OpenAI's models.
 
 ```python
 import tiktoken
-enc = tiktoken.get_encoding("gpt2")
+enc = tiktoken.get_encoding("cl100k_base")
 assert enc.decode(enc.encode("hello world")) == "hello world"
 
 # To get the tokeniser corresponding to a specific model in the OpenAI API:
-enc = tiktoken.encoding_for_model("text-davinci-003")
+enc = tiktoken.encoding_for_model("gpt-4")
 ```
 
 The open source version of `tiktoken` can be installed from PyPI:
diff --git a/src/lib.rs b/src/lib.rs
index f391005..70009d2 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -34,7 +34,7 @@ fn _byte_pair_merge<T>(
         }
     };
 
-    // We look up the ranks once in the beggining and iteratively update
+    // We look up the ranks once in the beginning and iteratively update
     // them during each merge, which reduces the number of rank lookups.
     for i in 0..parts.len() - 2 {
         match get_rank(&parts, i, 0) {
diff --git a/tiktoken/model.py b/tiktoken/model.py
index 33da390..b8af787 100644
--- a/tiktoken/model.py
+++ b/tiktoken/model.py
@@ -6,11 +6,13 @@ from .registry import get_encoding
 # TODO: these will likely be replaced by an API endpoint
 MODEL_PREFIX_TO_ENCODING: dict[str, str] = {
     # chat
-    "gpt-3.5-turbo-": "cl100k_base"  # e.g, gpt-3.5-turbo-0301, -0401, etc.
+    "gpt-4-": "cl100k_base",  # e.g., gpt-4-0314, etc., plus gpt-4-32k
+    "gpt-3.5-turbo-": "cl100k_base",  # e.g., gpt-3.5-turbo-0301, -0401, etc.
 }
 
 MODEL_TO_ENCODING: dict[str, str] = {
     # chat
+    "gpt-4": "cl100k_base",
     "gpt-3.5-turbo": "cl100k_base",
     # text
     "text-davinci-003": "p50k_base",