diff --git a/LOG.md b/LOG.md
index 84ab52b81..9080b2f07 100644
--- a/LOG.md
+++ b/LOG.md
@@ -9,6 +9,21 @@ That is, within each transformer block we compute `MLP(LN(x)) + Attention(LN(x))`
 This allows us to increase throughput because we can fuse the separate feed-forward and attention input projections into a single linear layer.
 We also experimented with [fusing the output projections](https://github.com/allenai/LLM/pull/79) into a single linear layer, but that didn't help, possibly due to the overhead of concatenating the feed-forward and attention activations together.
 
+
+2023-04-02
+----------
+
+First training run! We trained a 300M model on about 70B tokens from C4.
+The purpose of this model is to give the other LLM teams something in our format that's not completely random,
+so they can test their evaluation and inference code.
+
+This ran on just a single node of AMD's cluster.
+On AMD hardware we're still missing Flash Attention, and we could not get `torch.compile()` to work in time for the run.
+Both are expected to provide significant speedups.
+Even though we could not compile for this run, we used model settings that are optimal for compiled models,
+because we want it to be a representative model for the downstream evaluations.
+
 2023-03-28
 ----------
 
diff --git a/README.md b/README.md
index 33e44b371..9b0a3afa8 100644
--- a/README.md
+++ b/README.md
@@ -59,3 +59,12 @@ gantry run \
 
 This may require a reservation on the Infiniband cluster.
 See the [Beaker documentation](https://beaker-docs.apps.allenai.org/distributed-training.html) for more information on distributed training.
+
+## Finding official runs
+
+We keep all of our runs in WandB under [the "ai2-llm" entity](https://wandb.ai/ai2-llm).
+We don't store model checkpoints in WandB. Those are in GCS under `gs://allennlp-olmo/`.
+
+### Highlighted models
+
+ * 300M parameters, ~70B tokens, a starter model that's not completely random: https://wandb.ai/ai2-llm/LLM-scripts/runs/ed5krfk9
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 90816083c..41b8d0dae 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -16,4 +16,5 @@ wandb
 # triton
 # flash-attn
 logzio-python-handler
-boto3
\ No newline at end of file
+boto3
+google-cloud-storage
\ No newline at end of file
diff --git a/scripts/upload_artifact.py b/scripts/upload_artifact.py
new file mode 100644
index 000000000..ee41f5a6b
--- /dev/null
+++ b/scripts/upload_artifact.py
@@ -0,0 +1,64 @@
+import logging
+from pathlib import Path
+from typing import Tuple
+
+import click
+from google.cloud import storage
+from tqdm import tqdm
+
+from dolma.util import prepare_cli_environment
+
+log = logging.getLogger(__name__)
+
+
+@click.command()
+@click.argument(
+    "wandb_run_path",
+    type=str,
+)
+@click.argument(
+    "files_or_directories",
+    nargs=-1,
+    type=click.Path(exists=True, dir_okay=True, path_type=Path),
+)
+def main(
+    wandb_run_path: str,
+    files_or_directories: Tuple[Path, ...],
+):
+    """
+    Uploads artifacts to GCS. This uploads to a hardcoded bucket in GCS, because that's where we expect to keep all the artifacts for OLMo.
+
+    WANDB_RUN_PATH: The "Weights and Biases" run path. You get this by going to the run in wandb and clicking on the "copy run path" button. We will use this as the prefix for the paths in the GCS bucket.
+    """
+    storage_client = storage.Client()
+    bucket = storage_client.bucket("allennlp-olmo", user_project="ai2-allennlp")
+    prefix = wandb_run_path.strip("/")
+
+    # Work list of (path, GCS key) pairs; directories get expanded as we go.
+    files_or_directories_in_a_special_variable_because_mypy_is_lame = [
+        (file_or_directory, prefix + "/" + file_or_directory.name) for file_or_directory in files_or_directories
+    ]
+    while len(files_or_directories_in_a_special_variable_because_mypy_is_lame) > 0:
+        file_or_directory, key = files_or_directories_in_a_special_variable_because_mypy_is_lame.pop()
+        if file_or_directory.is_file():
+            blob = bucket.blob(key)
+            with file_or_directory.open("rb") as f:
+                with tqdm.wrapattr(
+                    f,
+                    "read",
+                    total=file_or_directory.stat().st_size,
+                    miniters=1,
+                    desc=f"Uploading {file_or_directory} to gs://{bucket.name}/{key}",
+                ) as f:
+                    blob.upload_from_file(f)
+        elif file_or_directory.is_dir():
+            # Push directory contents back onto the work list to recurse into them.
+            for directory_entry in file_or_directory.iterdir():
+                files_or_directories_in_a_special_variable_because_mypy_is_lame.append(
+                    (directory_entry, key + "/" + directory_entry.name)
+                )
+
+
+if __name__ == "__main__":
+    prepare_cli_environment()
+    main()