Skip to content

Commit

Permalink
flags
Browse files Browse the repository at this point in the history
  • Loading branch information
soldni committed Jan 1, 2025
1 parent df9fca7 commit 9d38f10
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 25 deletions.
28 changes: 14 additions & 14 deletions configs/peteish-anneal/olmoe_mix.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,43 +2,43 @@ target_size: 200G

sources:
- source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/gpt-neox-olmo-dolma-v1_5/dclm/*.npy
mix_percent: 0.5
mix_percent: 0.4922

- source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/gpt-neox-olmo-dolma-v1_5/pes2o/*.npy
mix_percent: 0.0585
mix_percent: 0.0652

- source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/gpt-neox-olmo-dolma-v1_5/flan/*.npy
mix_percent: 0.1660
mix_percent: 0.1667

- source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/gpt-neox-olmo-dolma-v1_5/math/codesearchnet-owmfilter/*.npy
sample_percent: 1.0
sample_percent: 2.0

- source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/gpt-neox-olmo-dolma-v1_5/math/dolmino_math_synth/basic_math/*.npy
sample_percent: 1.0
sample_percent: 2.0

- source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/gpt-neox-olmo-dolma-v1_5/math/dolmino_math_synth/gsm_mind/*.npy
sample_percent: 1.0
sample_percent: 2.0

- source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/gpt-neox-olmo-dolma-v1_5/math/dolmino_math_synth/gsm8k-synth/resample_v1_6x/*.npy
sample_percent: 1.0
sample_percent: 2.0

- source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/gpt-neox-olmo-dolma-v1_5/math/gsm8k/*.npy
sample_percent: 1.0
sample_percent: 2.0

- source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/gpt-neox-olmo-dolma-v1_5/math/mathcoder2-synthmath/*/*.npy
sample_percent: 1.0
sample_percent: 2.0

- source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/gpt-neox-olmo-dolma-v1_5/math/metamath-owmfilter/*.npy
sample_percent: 1.0
sample_percent: 2.0

- source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/gpt-neox-olmo-dolma-v1_5/math/tinyGSM-MIND/*.npy
sample_percent: 1.0
sample_percent: 2.0

- source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/gpt-neox-olmo-dolma-v1_5/math/tulu_math/*.npy
sample_percent: 1.0
sample_percent: 2.0

- source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/gpt-neox-olmo-dolma-v1_5/stackexchange/*.npy
sample_percent: 1.0
sample_percent: 2.0

- source: s3://ai2-llm/preprocessed/dolmino-mix-1124/allenai/gpt-neox-olmo-dolma-v1_5/wiki/*.npy
sample_percent: 1.0
sample_percent: 2.0
24 changes: 13 additions & 11 deletions scripts/make_npy_mix.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,6 @@ class SourceConfig:
def __post_init__(self):
if self.mix_percent is not None and (self.mix_percent < 0 or self.mix_percent > 1):
raise ValueError("mix_percent must be between 0 and 1")
elif self.sample_percent is not None and (self.sample_percent < 0 or self.sample_percent > 1):
raise ValueError("sample_percent must be between 0 and 1")

@property
def bucket(self) -> str:
Expand Down Expand Up @@ -73,15 +71,19 @@ def sample(self, total_size: int) -> tuple[list[str], int]:
# Randomly sample files
running_size = 0
selected = []
while len(all_paths) > 0:
idx = random.randint(0, len(all_paths) - 1)
path = all_paths.pop(idx)
size = all_sizes.pop(idx)
selected.append(path)

running_size += size
if running_size >= target_size:
break

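# Two nested loops so a source can be sampled past 100% of its size: the inner loop
# draws files without replacement from a copy of the file list; when that copy is
# exhausted before target_size is reached, the outer loop starts over with a fresh
# copy, so files may be selected more than once.
# NOTE(review): if the file list is empty (or every size is 0) while target_size > 0,
# the outer loop never terminates -- confirm this is guarded before this point.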
while running_size < target_size:
all_paths_copy, all_sizes_copy = all_paths[:], all_sizes[:]
while len(all_paths_copy) > 0:
idx = random.randint(0, len(all_paths_copy) - 1)
path = all_paths_copy.pop(idx)
size = all_sizes_copy.pop(idx)
selected.append(path)

running_size += size
if running_size >= target_size:
break

return selected, running_size

Expand Down

0 comments on commit 9d38f10

Please sign in to comment.