diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2790421..a7dce9d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,12 @@
 
 This is the changelog for the open source version of tiktoken.
 
+## [v0.3.1]
+- Build aarch64 wheels
+- Make `blobfile` an optional dependency
+
+Thank you to @messense for the environment variable that makes cargo not OOM under emulation!
+
 ## [v0.3.0]
 - Improve performance by 5-20%; thank you to @nistath!
 - Add `gpt-3.5-turbo` models to `encoding_for_model`
@@ -14,6 +20,8 @@ This is the changelog for the open source version of tiktoken.
 - Add ``tiktoken.encoding_for_model`` to get the encoding for a specific model
 - Improve portability of caching logic
 
+Thank you to @fritzo, @arvid220u, @khanhvu207, and @henriktorget for various small corrections.
+
 ## [v0.1.2]
 - Avoid use of `blobfile` for public files
 - Add support for Python 3.8
diff --git a/Cargo.toml b/Cargo.toml
index 40a72b9..912af00 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tiktoken"
-version = "0.3.0"
+version = "0.3.1"
 edition = "2021"
 rust-version = "1.57.0"
 
diff --git a/pyproject.toml b/pyproject.toml
index 791e3c7..1834ef1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,11 +1,12 @@
 [project]
 name = "tiktoken"
-version = "0.3.0"
+version = "0.3.1"
 description = "tiktoken is a fast BPE tokeniser for use with OpenAI's models"
 readme = "README.md"
 license = {file = "LICENSE"}
 authors = [{name = "Shantanu Jain"}, {email = "shantanu@openai.com"}]
-dependencies = ["blobfile>=2", "regex>=2022.1.18", "requests>=2.26.0"]
+dependencies = ["regex>=2022.1.18", "requests>=2.26.0"]
+optional-dependencies = {blobfile = ["blobfile>=2"]}
 requires-python = ">=3.8"
 
 [project.urls]
diff --git a/src/lib.rs b/src/lib.rs
index b44d9c8..f391005 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -21,32 +21,23 @@ fn _byte_pair_merge<T>(
     // The rank of the last item in the vector is not a valid value.
     let mut parts: Vec<(usize, usize)> = (0..piece.len() + 1).map(|i| (i, usize::MAX)).collect();
 
-    // NOTE: using a macro here because a closure fails to get inlined
-    // according to optimization remarks.
-    // A closure also cannot capture a reference to `piece` without
-    // the borrow checker complaining about the mutable borrows during
-    // the assignments later in this code.
-    macro_rules! get_rank {
-        ($start_idx:expr, $skip:expr) => {{
-            let start_idx: usize = $start_idx;
-            let skip: usize = $skip;
+    let get_rank = {
+        #[inline(always)]
+        |parts: &Vec<(usize, usize)>, start_idx: usize, skip: usize| {
             if (start_idx + skip + 2) < parts.len() {
                 ranks
                     .get(&piece[parts[start_idx].0..parts[start_idx + skip + 2].0])
-                    .map(|r| *r)
+                    .copied()
             } else {
                 None
             }
-        }};
-        ($idx:expr) => {{
-            get_rank!($idx, 0)
-        }};
-    }
+        }
+    };
 
-    // We look up the ranks once in the beggining and iteratively update
+    // We look up the ranks once in the beginning and iteratively update
     // them during each merge, which reduces the number of rank lookups.
     for i in 0..parts.len() - 2 {
-        match get_rank!(i) {
+        match get_rank(&parts, i, 0) {
             Some(rank) => {
                 // usize::MAX is a sentinel value and cannot be a valid rank
                 debug_assert!(rank != usize::MAX);
@@ -89,9 +80,9 @@ fn _byte_pair_merge<T>(
             // parts[i] and parts[i-1] before removing, which could thrash
             // the cache. Thus, we update the rank calculation by skipping over
-            // parts[i + 1], by invoking `get_rank!` with `skip = 1`.
+            // parts[i + 1], by invoking `get_rank` with `skip = 1`.
-            parts[i].1 = get_rank!(i, 1).unwrap_or(usize::MAX);
+            parts[i].1 = get_rank(&parts, i, 1).unwrap_or(usize::MAX);
             if i > 0 {
-                parts[i - 1].1 = get_rank!(i - 1, 1).unwrap_or(usize::MAX);
+                parts[i - 1].1 = get_rank(&parts, i - 1, 1).unwrap_or(usize::MAX);
             }
 
             parts.remove(i + 1);
diff --git a/tests/test_simple_public.py b/tests/test_simple_public.py
index a7d70b5..8458c12 100644
--- a/tests/test_simple_public.py
+++ b/tests/test_simple_public.py
@@ -1,3 +1,6 @@
+import subprocess
+import sys
+
 import tiktoken
 
 
@@ -28,3 +31,12 @@ def test_encoding_for_model():
     assert enc.name == "p50k_edit"
     enc = tiktoken.encoding_for_model("gpt-3.5-turbo-0301")
     assert enc.name == "cl100k_base"
+
+
+def test_optional_blobfile_dependency():  # a bare `import tiktoken` must not load blobfile
+    prog = """
+import tiktoken
+import sys
+assert "blobfile" not in sys.modules
+"""  # run below in a separate interpreter, which has its own sys.modules
+    subprocess.check_call([sys.executable, "-c", prog])  # non-zero exit -> CalledProcessError
diff --git a/tiktoken/load.py b/tiktoken/load.py
index c588106..4a49ae4 100644
--- a/tiktoken/load.py
+++ b/tiktoken/load.py
@@ -7,12 +7,17 @@ import os
 import tempfile
 import uuid
 
-import blobfile
 import requests
 
 
 def read_file(blobpath: str) -> bytes:
     if not blobpath.startswith("http://") and not blobpath.startswith("https://"):
+        try:
+            import blobfile
+        except ImportError:
+            raise ImportError(
+                "blobfile is not installed. Please install it by running `pip install blobfile`."
+            )
         with blobfile.BlobFile(blobpath, "rb") as f:
             return f.read()
     # avoiding blobfile for public files helps avoid auth issues, like MFA prompts
@@ -93,6 +98,12 @@ def data_gym_to_mergeable_bpe_ranks(
 
 
 def dump_tiktoken_bpe(bpe_ranks: dict[bytes, int], tiktoken_bpe_file: str) -> None:
+    try:
+        import blobfile
+    except ImportError:
+        raise ImportError(
+            "blobfile is not installed. Please install it by running `pip install blobfile`."
+        )
     with blobfile.BlobFile(tiktoken_bpe_file, "wb") as f:
         for token, rank in sorted(bpe_ranks.items(), key=lambda x: x[1]):
             f.write(base64.b64encode(token) + b" " + str(rank).encode() + b"\n")