Add unit test for source mixing + Fix naming within tars. #233

Open · wants to merge 8 commits into main
22 changes: 17 additions & 5 deletions open_lm/datapreprocess/ray/tokenize_shuffle.py
@@ -68,7 +68,7 @@ def load_from_yaml(filename):
    # Add get_source and get_sampling_frequency methods to Sources
    def get_source_dynamic(self, key):
        for item in data["sources"]:
-           if any(marker in key for marker in item["markers"]):
+           if any(marker.lower() in key.lower() for marker in item["markers"]):
                return Sources[item["source"]]
        return Sources.UNKNOWN

@@ -234,7 +234,7 @@ def _flush_buffer(self, folder, counter):
                tokens = [int(x) for x in self.buffer[i]["tokens"]]
                token_count += len(tokens)
                json_string = json.dumps(tokens)
-               uid = hashlib.md5(json_string.encode()).hexdigest()
+               uid = f"{hashlib.md5(json_string.encode()).hexdigest()}_{tar_index:{digits}}_{i:0{digits}}"
                sample = {"__key__": uid, "json.gz": json_string}
                sink.write(sample)
        bio.seek(0)
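
As an aside on the naming fix in this hunk, here is a small standalone sketch (not part of the diff) of what the new `__key__` format produces. The values of `digits`, `tar_index`, and `i` below are made up for illustration; in the real code they come from the shard writer.

```python
import hashlib
import json

# Hypothetical values standing in for the ones computed by the shard writer.
tokens = [1, 2, 3]
json_string = json.dumps(tokens)
digits = 8      # assumed key width
tar_index = 7   # index of the tar being written
i = 42          # position of the sample within the tar

# Same format string as the new line above: md5 of the payload, then tar index, then position.
uid = f"{hashlib.md5(json_string.encode()).hexdigest()}_{tar_index:{digits}}_{i:0{digits}}"
print(uid)
# -> "<32-char md5 hex>_       7_00000042"
# Note that tar_index is space-padded ({tar_index:{digits}}) while i is zero-padded ({i:0{digits}}).
```

Presumably the point of the suffix is that two samples with identical token payloads (and hence identical md5 digests) no longer collide on `__key__` within the output tars.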
@@ -304,9 +304,21 @@ def preprocess(
            buffer = buffer[idx:]

        if len(buffer) > 0:
-           if source_counter is not None:
-               ray.get(source_counter.increment_token_count.remote(len(buffer)))
-           yield buffer + [PAD] * (seqlen - len(buffer))
+           if do_sample:
+               local_sample_freq = sample_freq
+               while local_sample_freq > 1:
+                   if source_counter is not None:
+                       ray.get(source_counter.increment_token_count.remote(len(buffer)))
+                   yield buffer + [PAD] * (seqlen - len(buffer))
+                   local_sample_freq -= 1
+               if rng.random() < local_sample_freq:
+                   if source_counter is not None:
+                       ray.get(source_counter.increment_token_count.remote(len(buffer)))
+                   yield buffer + [PAD] * (seqlen - len(buffer))
+           else:
+               if source_counter is not None:
+                   ray.get(source_counter.increment_token_count.remote(len(buffer)))
+               yield buffer + [PAD] * (seqlen - len(buffer))

    except (IncompleteReadError, ReadTimeoutError, ResponseStreamingError) as e:
        logger.error(f"There was an incomplete read error: {e} for key {key}")
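For context on the new `do_sample` branch (an illustration, not part of the PR): each leftover buffer is now emitted floor(`sample_freq`) times, plus one extra time with probability equal to the fractional part, so the expected number of copies equals `sample_freq`. A minimal simulation of that logic, using a hypothetical `sample_copies` helper:

```python
import random

def sample_copies(sample_freq: float, rng: random.Random) -> int:
    """Number of times one sequence is emitted, mirroring the do_sample branch above."""
    copies = 0
    local_sample_freq = sample_freq
    while local_sample_freq > 1:
        copies += 1
        local_sample_freq -= 1
    if rng.random() < local_sample_freq:
        copies += 1
    return copies

rng = random.Random(0)
print(sum(sample_copies(2.5, rng) for _ in range(100_000)) / 100_000)  # ~2.5
print(sum(sample_copies(2.0, rng) for _ in range(100_000)) / 100_000)  # exactly 2.0: random() < 1.0 always holds
print(sum(sample_copies(0.5, rng) for _ in range(100_000)) / 100_000)  # ~0.5
```

This is also why `test_mixing_sampling` below can assert an exact factor of 2 for SOURCE_A (frequency 2.0) but only a statistical bound for SOURCE_B (frequency 0.5).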
13 changes: 13 additions & 0 deletions tests/assets/test_sample.yaml
@@ -0,0 +1,13 @@
sources:
  - source: "SOURCE_A"
    markers: ["source_a"]
  - source: "SOURCE_B"
    markers: ["source_b"]
  - source: "UNKNOWN"
    markers: [] # No specific markers for UNKNOWN

sampling_frequencies:
  SOURCE_A: 2.0
  SOURCE_B: 0.5
  UNKNOWN: 0
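
To make the fixture concrete, here is a small sketch (not part of the PR) of how shard keys are expected to resolve against this YAML with the now case-insensitive marker matching; the `Sources` enum here is a stand-in for the dynamically built one in `tokenize_shuffle.py`.

```python
from enum import Enum

class Sources(Enum):
    SOURCE_A = 0
    SOURCE_B = 1
    UNKNOWN = 2

# Mirrors the "sources" section of tests/assets/test_sample.yaml.
data = {
    "sources": [
        {"source": "SOURCE_A", "markers": ["source_a"]},
        {"source": "SOURCE_B", "markers": ["source_b"]},
        {"source": "UNKNOWN", "markers": []},
    ]
}

def get_source(key: str) -> Sources:
    # Same matching rule as get_source_dynamic: the first source whose marker is a
    # case-insensitive substring of the shard key wins; otherwise UNKNOWN.
    for item in data["sources"]:
        if any(marker.lower() in key.lower() for marker in item["markers"]):
            return Sources[item["source"]]
    return Sources.UNKNOWN

assert get_source("test_input/SOURCE_A/input_00000000.jsonl") is Sources.SOURCE_A
assert get_source("test_input/source_b/input_00000003.jsonl") is Sources.SOURCE_B
assert get_source("s3://bucket/some_other_shard.jsonl") is Sources.UNKNOWN
```

With the old exact-case matching, an upper-cased directory such as SOURCE_A would have fallen through to UNKNOWN and, under this fixture, been sampled with frequency 0.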

143 changes: 140 additions & 3 deletions tests/test_tokenize_shuffle.py
@@ -2,6 +2,7 @@
import os
import pytest
import webdataset as wds
import numpy as np


@pytest.fixture(autouse=True)
@@ -24,7 +25,7 @@ def test_tokenize_shuffle_simple():
    for x in ds:
        assert len(x["json.gz"]) == content_len + 1
        total += len(x["json.gz"])
-   # assert total == NUM_TOKENS
+   assert total == NUM_TOKENS

    with open("test_output/manifest.jsonl", "rb") as f:
        out = f.read()
@@ -56,7 +57,7 @@ def test_tokenize_shuffle_tar(content_key, NUM_TOKENS):

def test_tokenize_shuffle_simple_do_sample():
    content_len = 2048
-   NUM_TOKENS = 32784
+   NUM_TOKENS = 86058
    exit_value = os.system(
        f"python open_lm/datapreprocess/ray/tokenize_shuffle.py --input s3://dcnlp-west-test/tokenize_shuffle_test/C4_V3_tiny/ --content_key content --output test_output/ --seqlen {content_len} --do_sample"
    )
@@ -66,7 +67,12 @@ def test_tokenize_shuffle_simple_do_sample():
    for x in ds:
        assert len(x["json.gz"]) == content_len + 1
        total += len(x["json.gz"])
-   assert total == NUM_TOKENS

+   # The sampling probability for the C4 source is 1.037142857, so every sequence is seen at least once.
+   # For failure probability at most 1e-4, a margin of 13950 tokens suffices (by Chernoff bounds).
+   # TODO(gsmyrnis): Improve this.
+   assert total <= 1.037142857 * NUM_TOKENS + 13950
+   assert total >= 1.037142857 * NUM_TOKENS - 13950


@pytest.mark.s3
@@ -115,3 +121,134 @@ def test_tokenize_shuffle_local_read_local_write():
        total += len(x["json.gz"])
    assert total == NUM_TOKENS
    assert exit_value == 0


@pytest.mark.parametrize("num_sources", [2, 3])
@pytest.mark.parametrize("generation_length", [1024, 2048, 2500])
def test_mixing_no_sampling(num_sources, generation_length):
content_len = 2048
docs_a = 1000
docs_b = 500
docs_c = 2000

# Tokens for gpt-neox tokenizer (default)
token_a = 247
token_b = 270
token_c = 260

# Store some fake sources in ./test_input
os.system("mkdir test_input")
os.system("mkdir test_input/source_a/")
os.system("mkdir test_input/source_b/")
os.system("mkdir test_input/source_c/")
os.system("mkdir test_output")

for i in range(docs_a // 100):
with open(f"test_input/source_a/input_{i:08d}.jsonl", "w") as f:
# This will create 2048 copies of the " a" string
data = {"text": " " + " ".join(["a" for _ in range(generation_length)])}
json_string = json.dumps(data)
for _ in range(100):
f.write(json_string)
f.write("\n")

for i in range(docs_b // 100):
with open(f"test_input/source_b/input_{i:08d}.jsonl", "w") as f:
data = {"text": " " + " ".join(["b" for _ in range(generation_length)])}
json_string = json.dumps(data)
for _ in range(100):
f.write(json_string)
f.write("\n")

if num_sources == 3:
for i in range(docs_c // 100):
with open(f"test_input/source_c/input_{i:08d}.jsonl", "w") as f:
data = {"text": " " + " ".join(["c" for _ in range(generation_length)])}
json_string = json.dumps(data)
for _ in range(100):
f.write(json_string)
f.write("\n")

# run tokenize script
exit_value = os.system(
f"python open_lm/datapreprocess/ray/tokenize_shuffle.py --input ./test_input --content_key text --seqlen {content_len} --output ./test_output/"
)

tars = [os.path.join("test_output", fname) for fname in os.listdir("test_output") if fname.endswith(".tar")]
total_a = total_b = total_c = 0
for tar in tars:
ds = wds.WebDataset(tar).decode()
for x in ds:
assert len(x["json.gz"]) == content_len + 1
tokens = np.array(x["json.gz"])
total_a += np.sum(tokens == token_a)
total_b += np.sum(tokens == token_b)
total_c += np.sum(tokens == token_c)

assert total_a == docs_a * generation_length
assert total_b == docs_b * generation_length
if num_sources == 3:
assert total_c == docs_c * generation_length

assert exit_value == 0


@pytest.mark.parametrize("generation_length", [1024, 2048, 2500])
def test_mixing_sampling(generation_length):
content_len = 2048
docs_a = 10000
docs_b = 10000

# Tokens for gpt-neox tokenizer (default)
token_a = 247
token_b = 270

# Store some fake sources in ./test_input
os.system("mkdir test_input")
os.system("mkdir test_input/source_a/")
os.system("mkdir test_input/source_b/")
os.system("mkdir test_output")

for i in range(docs_a // 100):
with open(f"test_input/source_a/input_{i:08d}.jsonl", "w") as f:
# This will create 2048 copies of the " a" string
data = {"text": " " + " ".join(["a" for _ in range(generation_length)])}
json_string = json.dumps(data)
for _ in range(100):
f.write(json_string)
f.write("\n")

for i in range(docs_b // 100):
with open(f"test_input/source_b/input_{i:08d}.jsonl", "w") as f:
data = {"text": " " + " ".join(["b" for _ in range(generation_length)])}
json_string = json.dumps(data)
for _ in range(100):
f.write(json_string)
f.write("\n")

# run tokenize script
exit_value = os.system(
f"python open_lm/datapreprocess/ray/tokenize_shuffle.py --input ./test_input --content_key text --seqlen {content_len} --output ./test_output/ --do_sample --default_dataset_yaml ./tests/assets/test_sample.yaml"
)
assert exit_value == 0

tars = [os.path.join("test_output", fname) for fname in os.listdir("test_output") if fname.endswith(".tar")]
total_a = total_b = 0
for tar in tars:
ds = wds.WebDataset(tar).decode()
for x in ds:
assert len(x["json.gz"]) == content_len + 1
tokens = np.array(x["json.gz"])
total_a += np.sum(tokens == token_a)
total_b += np.sum(tokens == token_b)

# Sampling for source a should be 2.0, so it should be exactly 2
assert total_a == 2 * docs_a * generation_length

# Source b is sampled with probability 0.5, so the number of documents from source b follows Bin(10000, 0.5).
# Via (multiplicative) Chernoff bounds, for margin delta the error probability is 2 * exp(-delta**2 * mu / 3)
# In this case for error probability <= 1e-4, we need delta * mu = sqrt(-3 * ln(0.5e-4) / mu) * mu ~= 386
# TODO (gsmyrnis): I think you can get a better bound here.
mixing_error = 386
assert total_b <= (0.5 * docs_b + mixing_error) * generation_length
assert total_b >= (0.5 * docs_b - mixing_error) * generation_length
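
As a sanity check on the `mixing_error = 386` constant used above, here is a small computation (not part of the PR) that reproduces it from the comment's Chernoff bound, assuming n = 10,000 source_b documents kept independently with probability 0.5 and a target failure probability of 1e-4:

```python
import math

n, p, failure_prob = 10_000, 0.5, 1e-4
mu = n * p  # expected number of documents kept from source_b

# Multiplicative Chernoff bound: P(|X - mu| >= delta * mu) <= 2 * exp(-delta**2 * mu / 3).
# Solve 2 * exp(-delta**2 * mu / 3) <= failure_prob for delta, then convert to a document count.
delta = math.sqrt(-3 * math.log(failure_prob / 2) / mu)
print(math.ceil(delta * mu))  # 386, the mixing_error used in the assertions
```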