40 lines
1000 B
Python
40 lines
1000 B
Python
import base64
|
|
import functools
|
|
import gzip
|
|
import json
|
|
import os
|
|
import random
|
|
import time
|
|
from typing import Any, cast
|
|
|
|
import blobfile
|
|
|
|
import tiktoken
|
|
|
|
|
|
def benchmark_batch(documents: list[str]) -> None:
    """Benchmark tokenization throughput of tiktoken vs HuggingFace on *documents*.

    Encodes the same batch with both tokenizers (after a warmup call each) and
    prints bytes/second for each. Requires the RAYON_NUM_THREADS environment
    variable to be set; raises KeyError if it is missing.
    """
    thread_count = int(os.environ["RAYON_NUM_THREADS"])
    # Total UTF-8 byte count of the corpus — the numerator for throughput.
    total_bytes = sum(len(doc.encode()) for doc in documents)
    print(f"num_threads: {thread_count}, num_bytes: {total_bytes}")

    enc = tiktoken.get_encoding("gpt2")
    # First call pays one-time setup cost; keep it out of the timed region.
    enc.encode("warmup")

    t_start = time.perf_counter_ns()
    enc.encode_ordinary_batch(documents, num_threads=thread_count)
    t_end = time.perf_counter_ns()
    print(f"tiktoken \t{total_bytes / (t_end - t_start) * 1e9} bytes / s")

    # Imported lazily so the tiktoken-only path works without transformers.
    import transformers

    hf_enc = cast(Any, transformers).GPT2TokenizerFast.from_pretrained("gpt2")
    hf_enc.model_max_length = 1e30  # silence!
    hf_enc.encode("warmup")

    t_start = time.perf_counter_ns()
    hf_enc(documents)
    t_end = time.perf_counter_ns()
    print(f"huggingface \t{total_bytes / (t_end - t_start) * 1e9} bytes / s")