Add unit test for source mixing + Fix naming within tars. #233

Open · wants to merge 8 commits into main
22 changes: 17 additions & 5 deletions open_lm/datapreprocess/ray/tokenize_shuffle.py
@@ -68,7 +68,7 @@ def load_from_yaml(filename):
    # Add get_source and get_sampling_frequency methods to Sources
    def get_source_dynamic(self, key):
        for item in data["sources"]:
-           if any(marker in key for marker in item["markers"]):
+           if any(marker.lower() in key.lower() for marker in item["markers"]):
                return Sources[item["source"]]
        return Sources.UNKNOWN

@@ -234,7 +234,7 @@ def _flush_buffer(self, folder, counter):
                tokens = [int(x) for x in self.buffer[i]["tokens"]]
                token_count += len(tokens)
                json_string = json.dumps(tokens)
-               uid = hashlib.md5(json_string.encode()).hexdigest()
+               uid = f"{hashlib.md5(json_string.encode()).hexdigest()}_{tar_index:{digits}}_{i:0{digits}}"
                sample = {"__key__": uid, "json.gz": json_string}
                sink.write(sample)
        bio.seek(0)
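
As an aside on the naming fix in this hunk, here is a small standalone sketch (not part of the diff) of what the new `__key__` format produces. The values of `digits`, `tar_index`, and `i` below are made up for illustration; in the real code they come from the shard writer.

```python
import hashlib
import json

# Hypothetical values standing in for the ones computed by the shard writer.
tokens = [1, 2, 3]
json_string = json.dumps(tokens)
digits = 8      # assumed key width
tar_index = 7   # index of the tar being written
i = 42          # position of the sample within the tar

# Same format string as the new line above: md5 of the payload, then tar index, then position.
uid = f"{hashlib.md5(json_string.encode()).hexdigest()}_{tar_index:{digits}}_{i:0{digits}}"
print(uid)
# -> "<32-char md5 hex>_       7_00000042"
# Note that tar_index is space-padded ({tar_index:{digits}}) while i is zero-padded ({i:0{digits}}).
```

Presumably the point of the suffix is that two samples with identical token payloads (and hence identical md5 digests) no longer collide on `__key__` within the output tars.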
@@ -304,9 +304,21 @@ def preprocess(
            buffer = buffer[idx:]

        if len(buffer) > 0:
-           if source_counter is not None:
-               ray.get(source_counter.increment_token_count.remote(len(buffer)))
-           yield buffer + [PAD] * (seqlen - len(buffer))
+           if do_sample:
+               local_sample_freq = sample_freq
+               while local_sample_freq > 1:
+                   if source_counter is not None:
+                       ray.get(source_counter.increment_token_count.remote(len(buffer)))
+                   yield buffer + [PAD] * (seqlen - len(buffer))
+                   local_sample_freq -= 1
+               if rng.random() < local_sample_freq:
+                   if source_counter is not None:
+                       ray.get(source_counter.increment_token_count.remote(len(buffer)))
+                   yield buffer + [PAD] * (seqlen - len(buffer))
+           else:
+               if source_counter is not None:
+                   ray.get(source_counter.increment_token_count.remote(len(buffer)))
+               yield buffer + [PAD] * (seqlen - len(buffer))

    except (IncompleteReadError, ReadTimeoutError, ResponseStreamingError) as e:
        logger.error(f"There was an incomplete read error: {e} for key {key}")
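For context on the new `do_sample` branch (an illustration, not part of the PR): each leftover buffer is now emitted floor(`sample_freq`) times, plus one extra time with probability equal to the fractional part, so the expected number of copies equals `sample_freq`. A minimal simulation of that logic, using a hypothetical `sample_copies` helper:

```python
import random

def sample_copies(sample_freq: float, rng: random.Random) -> int:
    """Number of times one sequence is emitted, mirroring the do_sample branch above."""
    copies = 0
    local_sample_freq = sample_freq
    while local_sample_freq > 1:
        copies += 1
        local_sample_freq -= 1
    if rng.random() < local_sample_freq:
        copies += 1
    return copies

rng = random.Random(0)
print(sum(sample_copies(2.5, rng) for _ in range(100_000)) / 100_000)  # ~2.5
print(sum(sample_copies(2.0, rng) for _ in range(100_000)) / 100_000)  # exactly 2.0: random() < 1.0 always holds
print(sum(sample_copies(0.5, rng) for _ in range(100_000)) / 100_000)  # ~0.5
```

This is also why `test_mixing_sampling` below can assert an exact factor of 2 for SOURCE_A (frequency 2.0) but only a statistical bound for SOURCE_B (frequency 0.5).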
13 changes: 13 additions & 0 deletions tests/assets/test_sample.yaml
@@ -0,0 +1,13 @@
sources:
  - source: "SOURCE_A"
    markers: ["source_a"]
  - source: "SOURCE_B"
    markers: ["source_b"]
  - source: "UNKNOWN"
    markers: [] # No specific markers for UNKNOWN

sampling_frequencies:
  SOURCE_A: 2.0
  SOURCE_B: 0.5
  UNKNOWN: 0
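
To make the fixture concrete, here is a small sketch (not part of the PR) of how shard keys are expected to resolve against this YAML with the now case-insensitive marker matching; the `Sources` enum here is a stand-in for the dynamically built one in `tokenize_shuffle.py`.

```python
from enum import Enum

class Sources(Enum):
    SOURCE_A = 0
    SOURCE_B = 1
    UNKNOWN = 2

# Mirrors the "sources" section of tests/assets/test_sample.yaml.
data = {
    "sources": [
        {"source": "SOURCE_A", "markers": ["source_a"]},
        {"source": "SOURCE_B", "markers": ["source_b"]},
        {"source": "UNKNOWN", "markers": []},
    ]
}

def get_source(key: str) -> Sources:
    # Same matching rule as get_source_dynamic: the first source whose marker is a
    # case-insensitive substring of the shard key wins; otherwise UNKNOWN.
    for item in data["sources"]:
        if any(marker.lower() in key.lower() for marker in item["markers"]):
            return Sources[item["source"]]
    return Sources.UNKNOWN

assert get_source("test_input/SOURCE_A/input_00000000.jsonl") is Sources.SOURCE_A
assert get_source("test_input/source_b/input_00000003.jsonl") is Sources.SOURCE_B
assert get_source("s3://bucket/some_other_shard.jsonl") is Sources.UNKNOWN
```

With the old exact-case matching, an upper-cased directory such as SOURCE_A would have fallen through to UNKNOWN and, under this fixture, been sampled with frequency 0.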

143 changes: 140 additions & 3 deletions tests/test_tokenize_shuffle.py
@@ -2,6 +2,7 @@
import os
import pytest
import webdataset as wds
import numpy as np


@pytest.fixture(autouse=True)
@@ -24,7 +25,7 @@ def test_tokenize_shuffle_simple():
    for x in ds:
        assert len(x["json.gz"]) == content_len + 1
        total += len(x["json.gz"])
-   # assert total == NUM_TOKENS
+   assert total == NUM_TOKENS

    with open("test_output/manifest.jsonl", "rb") as f:
        out = f.read()
@@ -56,7 +57,7 @@ def test_tokenize_shuffle_tar(content_key, NUM_TOKENS):

def test_tokenize_shuffle_simple_do_sample():
    content_len = 2048
-   NUM_TOKENS = 32784
+   NUM_TOKENS = 86058
    exit_value = os.system(
        f"python open_lm/datapreprocess/ray/tokenize_shuffle.py --input s3://dcnlp-west-test/tokenize_shuffle_test/C4_V3_tiny/ --content_key content --output test_output/ --seqlen {content_len} --do_sample"
    )
@@ -66,7 +67,12 @@ def test_tokenize_shuffle_simple_do_sample():
    for x in ds:
        assert len(x["json.gz"]) == content_len + 1
        total += len(x["json.gz"])
-   assert total == NUM_TOKENS

+   # The sampling probability for the C4 source is 1.037142857, so every sequence is seen at least once.
+   # For failure probability at most 1e-4, a margin of 13950 tokens suffices (by Chernoff bounds).
+   # TODO(gsmyrnis): Improve this.
+   assert total <= 1.037142857 * NUM_TOKENS + 13950
+   assert total >= 1.037142857 * NUM_TOKENS - 13950


@pytest.mark.s3
@@ -115,3 +121,134 @@ def test_tokenize_shuffle_local_read_local_write():
        total += len(x["json.gz"])
    assert total == NUM_TOKENS
    assert exit_value == 0


@pytest.mark.parametrize("num_sources", [2, 3])
@pytest.mark.parametrize("generation_length", [1024, 2048, 2500])
def test_mixing_no_sampling(num_sources, generation_length):
content_len = 2048
docs_a = 1000
docs_b = 500
docs_c = 2000

# Tokens for gpt-neox tokenizer (default)
token_a = 247
token_b = 270
token_c = 260

# Store some fake sources in ./test_input
os.system("mkdir test_input")
os.system("mkdir test_input/source_a/")
os.system("mkdir test_input/source_b/")
os.system("mkdir test_input/source_c/")
os.system("mkdir test_output")

for i in range(docs_a // 100):
with open(f"test_input/source_a/input_{i:08d}.jsonl", "w") as f:
# This will create 2048 copies of the " a" string
data = {"text": " " + " ".join(["a" for _ in range(generation_length)])}
json_string = json.dumps(data)
for _ in range(100):
f.write(json_string)
f.write("\n")

for i in range(docs_b // 100):
with open(f"test_input/source_b/input_{i:08d}.jsonl", "w") as f:
data = {"text": " " + " ".join(["b" for _ in range(generation_length)])}
json_string = json.dumps(data)
for _ in range(100):
f.write(json_string)
f.write("\n")

if num_sources == 3:
for i in range(docs_c // 100):
with open(f"test_input/source_c/input_{i:08d}.jsonl", "w") as f:
data = {"text": " " + " ".join(["c" for _ in range(generation_length)])}
json_string = json.dumps(data)
for _ in range(100):
f.write(json_string)
f.write("\n")

# run tokenize script
exit_value = os.system(
f"python open_lm/datapreprocess/ray/tokenize_shuffle.py --input ./test_input --content_key text --seqlen {content_len} --output ./test_output/"
)

tars = [os.path.join("test_output", fname) for fname in os.listdir("test_output") if fname.endswith(".tar")]
total_a = total_b = total_c = 0
for tar in tars:
ds = wds.WebDataset(tar).decode()
for x in ds:
assert len(x["json.gz"]) == content_len + 1
tokens = np.array(x["json.gz"])
total_a += np.sum(tokens == token_a)
total_b += np.sum(tokens == token_b)
total_c += np.sum(tokens == token_c)

assert total_a == docs_a * generation_length
assert total_b == docs_b * generation_length
if num_sources == 3:
assert total_c == docs_c * generation_length

assert exit_value == 0


@pytest.mark.parametrize("generation_length", [1024, 2048, 2500])
def test_mixing_sampling(generation_length):
content_len = 2048
docs_a = 10000
docs_b = 10000

# Tokens for gpt-neox tokenizer (default)
token_a = 247
token_b = 270

# Store some fake sources in ./test_input
os.system("mkdir test_input")
os.system("mkdir test_input/source_a/")
os.system("mkdir test_input/source_b/")
os.system("mkdir test_output")

for i in range(docs_a // 100):
with open(f"test_input/source_a/input_{i:08d}.jsonl", "w") as f:
# This will create 2048 copies of the " a" string
data = {"text": " " + " ".join(["a" for _ in range(generation_length)])}
json_string = json.dumps(data)
for _ in range(100):
f.write(json_string)
f.write("\n")

for i in range(docs_b // 100):
with open(f"test_input/source_b/input_{i:08d}.jsonl", "w") as f:
data = {"text": " " + " ".join(["b" for _ in range(generation_length)])}
json_string = json.dumps(data)
for _ in range(100):
f.write(json_string)
f.write("\n")

# run tokenize script
exit_value = os.system(
f"python open_lm/datapreprocess/ray/tokenize_shuffle.py --input ./test_input --content_key text --seqlen {content_len} --output ./test_output/ --do_sample --default_dataset_yaml ./tests/assets/test_sample.yaml"
)
assert exit_value == 0

tars = [os.path.join("test_output", fname) for fname in os.listdir("test_output") if fname.endswith(".tar")]
total_a = total_b = 0
for tar in tars:
ds = wds.WebDataset(tar).decode()
for x in ds:
assert len(x["json.gz"]) == content_len + 1
tokens = np.array(x["json.gz"])
total_a += np.sum(tokens == token_a)
total_b += np.sum(tokens == token_b)

# Sampling for source a should be 2.0, so it should be exactly 2
assert total_a == 2 * docs_a * generation_length

# Source b is sampled with probability 0.5, so the number of documents from source b follows Bin(10000, 0.5).
# Via (multiplicative) Chernoff bounds, for margin delta the error probability is 2 * exp(-delta**2 * mu / 3)
# In this case for error probability <= 1e-4, we need delta * mu = sqrt(-3 * ln(0.5e-4) / mu) * mu ~= 386
# TODO (gsmyrnis): I think you can get a better bound here.
mixing_error = 386
assert total_b <= (0.5 * docs_b + mixing_error) * generation_length
assert total_b >= (0.5 * docs_b - mixing_error) * generation_length
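
As a sanity check on the `mixing_error = 386` constant used above, here is a small computation (not part of the PR) that reproduces it from the comment's Chernoff bound, assuming n = 10,000 source_b documents kept independently with probability 0.5 and a target failure probability of 1e-4:

```python
import math

n, p, failure_prob = 10_000, 0.5, 1e-4
mu = n * p  # expected number of documents kept from source_b

# Multiplicative Chernoff bound: P(|X - mu| >= delta * mu) <= 2 * exp(-delta**2 * mu / 3).
# Solve 2 * exp(-delta**2 * mu / 3) <= failure_prob for delta, then convert to a document count.
delta = math.sqrt(-3 * math.log(failure_prob / 2) / mu)
print(math.ceil(delta * mu))  # 386, the mixing_error used in the assertions
```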