Bump version, sync codebase
This commit is contained in:
parent
3e8620030c
commit
446cb49aff
@ -2,6 +2,9 @@
|
|||||||
|
|
||||||
This is the changelog for the open source version of tiktoken.
|
This is the changelog for the open source version of tiktoken.
|
||||||
|
|
||||||
|
## [v0.3.2]
|
||||||
|
- Add encoding for GPT-4
|
||||||
|
|
||||||
## [v0.3.1]
|
## [v0.3.1]
|
||||||
- Build aarch64 wheels
|
- Build aarch64 wheels
|
||||||
- Make `blobfile` an optional dependency
|
- Make `blobfile` an optional dependency
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "tiktoken"
|
name = "tiktoken"
|
||||||
version = "0.3.1"
|
version = "0.3.2"
|
||||||
edition = "2021"
|
edition = "2021"
|
||||||
rust-version = "1.57.0"
|
rust-version = "1.57.0"
|
||||||
|
|
||||||
|
@ -5,11 +5,11 @@ OpenAI's models.
|
|||||||
|
|
||||||
```python
|
```python
|
||||||
import tiktoken
|
import tiktoken
|
||||||
enc = tiktoken.get_encoding("gpt2")
|
enc = tiktoken.get_encoding("cl100k_base")
|
||||||
assert enc.decode(enc.encode("hello world")) == "hello world"
|
assert enc.decode(enc.encode("hello world")) == "hello world"
|
||||||
|
|
||||||
# To get the tokeniser corresponding to a specific model in the OpenAI API:
|
# To get the tokeniser corresponding to a specific model in the OpenAI API:
|
||||||
enc = tiktoken.encoding_for_model("text-davinci-003")
|
enc = tiktoken.encoding_for_model("gpt-4")
|
||||||
```
|
```
|
||||||
|
|
||||||
The open source version of `tiktoken` can be installed from PyPI:
|
The open source version of `tiktoken` can be installed from PyPI:
|
||||||
|
@ -34,7 +34,7 @@ fn _byte_pair_merge<T>(
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
// We look up the ranks once in the beggining and iteratively update
|
// We look up the ranks once in the beginning and iteratively update
|
||||||
// them during each merge, which reduces the number of rank lookups.
|
// them during each merge, which reduces the number of rank lookups.
|
||||||
for i in 0..parts.len() - 2 {
|
for i in 0..parts.len() - 2 {
|
||||||
match get_rank(&parts, i, 0) {
|
match get_rank(&parts, i, 0) {
|
||||||
|
@ -6,11 +6,13 @@ from .registry import get_encoding
|
|||||||
# TODO: these will likely be replaced by an API endpoint
|
# TODO: these will likely be replaced by an API endpoint
|
||||||
MODEL_PREFIX_TO_ENCODING: dict[str, str] = {
|
MODEL_PREFIX_TO_ENCODING: dict[str, str] = {
|
||||||
# chat
|
# chat
|
||||||
"gpt-3.5-turbo-": "cl100k_base" # e.g, gpt-3.5-turbo-0301, -0401, etc.
|
"gpt-4-": "cl100k_base", # e.g., gpt-4-0314, etc., plus gpt-4-32k
|
||||||
|
"gpt-3.5-turbo-": "cl100k_base", # e.g., gpt-3.5-turbo-0301, -0401, etc.
|
||||||
}
|
}
|
||||||
|
|
||||||
MODEL_TO_ENCODING: dict[str, str] = {
|
MODEL_TO_ENCODING: dict[str, str] = {
|
||||||
# chat
|
# chat
|
||||||
|
"gpt-4": "cl100k_base",
|
||||||
"gpt-3.5-turbo": "cl100k_base",
|
"gpt-3.5-turbo": "cl100k_base",
|
||||||
# text
|
# text
|
||||||
"text-davinci-003": "p50k_base",
|
"text-davinci-003": "p50k_base",
|
||||||
|
Loading…
x
Reference in New Issue
Block a user