Split encryption and s3 presigned urls scripts (#1507)

private-attribution · Dec 18, 2024 · d6a244e · d6a244e
1 parent 62653e5
commit d6a244e
Show file tree

Hide file tree

Showing 2 changed files with 100 additions and 0 deletions.
diff --git a/scripts/presigned-s3-urls.sh b/scripts/presigned-s3-urls.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+
+# Set the usage message
+usage="Usage: $0 <dir_path> <s3_uri> <output_file>"
+
+# Example invocation
+# from ipa/input_data_S02/
+# ../scripts/presigned-s3-urls.sh encryptions/1B_cat/30_shards/ s3://stg-ipa-encrypted-reports/testing-sharded-data/1B/30_shards presigned_urls_30_shards.txt
+
+# Check if the correct number of arguments were provided
+if [ $# -ne 3 ]; then
+  echo "$usage"
+  exit 1
+fi
+
+# Set the directory path and S3 URI from the command-line arguments
+dir_path="$1"
+s3_uri="$2"
+output_file="$3"
+
+# Iterate over the files in the directory
+for file in "$dir_path"/*; do
+  # Get the file name without the directory path
+  filename=$(basename "$file")
+  echo "Processing: $(basename "$file")"
+  # Call the aws s3 presign command and append the output to the output file
+  # expires in 14 days (14 * 24 * 60 * 60)
+  aws s3 presign "$s3_uri/$filename" --expires-in 1209600 >> "$output_file"
+done
diff --git a/scripts/split-encrypted-files.py b/scripts/split-encrypted-files.py
@@ -0,0 +1,71 @@
+import argparse
+import binascii
+import os
+
+try:
+    from tqdm import tqdm
+except ImportError:
+    print("tqdm not installed. run `pip install tqdm` to see progress")
+
+    def tqdm(iterable, *args, **kwargs):
+        return iterable
+
+
+def split_hex_file(input_filename, output_stem, num_files):
+    """
+    Reads in a file of hex strings, one per line, splits it up into N files,
+    and writes out each line as length-delimited binary data.
+
+    :param input_filename: The name of the input file containing hex strings.
+    :param num_files: The number of output files to split the input into.
+    """
+    output_files = [
+        open(f"{output_stem}_shard_{i:03d}.bin", "wb") for i in range(num_files)
+    ]
+
+    input_filesize = os.path.getsize(input_filename)
+    # estimation each line is about 250 bits
+    approx_row_count = input_filesize / 250
+    with open(input_filename, "r") as input_file:
+        for i, line in enumerate(
+            tqdm(input_file, desc="Processing lines", total=approx_row_count)
+        ):
+            # Remove any leading or trailing whitespace from the line
+            line = line.strip()
+
+            # Convert the hex string to bytes
+            data = binascii.unhexlify(line)
+
+            # Write the length of the data as a 2-byte integer (big-endian)
+            output_files[i % num_files].write(len(data).to_bytes(2, byteorder="little"))
+
+            # Write the data itself
+            output_files[i % num_files].write(data)
+
+    for f in output_files:
+        f.close()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Splits a file of hex strings into N length-delimited binary files"
+    )
+    parser.add_argument(
+        "-i", "--input_file", required=True, help="Input file containing hex strings"
+    )
+    parser.add_argument(
+        "-o",
+        "--output_stem",
+        required=True,
+        help="Output file stem for generated files",
+    )
+    parser.add_argument(
+        "-n",
+        "--num-files",
+        type=int,
+        required=True,
+        help="Number of output files to split the input into",
+    )
+    args = parser.parse_args()
+
+    split_hex_file(args.input_file, args.output_stem, args.num_files)