Skip to content

Commit

Permalink
Split encryption and s3 presigned urls scripts (#1507)
Browse files Browse the repository at this point in the history
  • Loading branch information
eriktaubeneck authored Dec 18, 2024
1 parent 62653e5 commit d6a244e
Show file tree
Hide file tree
Showing 2 changed files with 100 additions and 0 deletions.
29 changes: 29 additions & 0 deletions scripts/presigned-s3-urls.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#!/bin/bash

# Generate presigned S3 URLs named after the files in a local directory.
#
# For every regular file in <dir_path>, the URL "<s3_uri>/<file name>" is
# presigned and appended to <output_file>, one URL per line. The local files
# are only used for their names.
#
# Exit status: 0 on success, 1 on bad arguments or if any presign call failed.

# Set the usage message
usage="Usage: $0 <dir_path> <s3_uri> <output_file>"

# Example invocation
# from ipa/input_data_S02/
# ../scripts/presigned-s3-urls.sh encryptions/1B_cat/30_shards/ s3://stg-ipa-encrypted-reports/testing-sharded-data/1B/30_shards presigned_urls_30_shards.txt

# Check if the correct number of arguments were provided
if [ $# -ne 3 ]; then
    echo "$usage" >&2
    exit 1
fi

# Set the directory path and S3 URI from the command-line arguments
dir_path="$1"
# Drop one trailing slash so "$s3_uri/$filename" never contains "//",
# which S3 would treat as a different object key.
s3_uri="${2%/}"
output_file="$3"

if [ ! -d "$dir_path" ]; then
    echo "Error: '$dir_path' is not a directory" >&2
    echo "$usage" >&2
    exit 1
fi

# URL lifetime in seconds: 14 days (14 * 24 * 60 * 60)
# NOTE(review): the AWS CLI reference documents a maximum of 604800 seconds
# (7 days) for --expires-in; confirm that 14-day URLs are actually honored.
expires_in=1209600

status=0

# Iterate over the files in the directory
for file in "$dir_path"/*; do
    # Skip anything that is not a regular file. This also covers an empty
    # directory, where the unmatched glob is left as a literal "*".
    [ -f "$file" ] || continue

    # Get the file name without the directory path
    filename=$(basename "$file")
    echo "Processing: $filename"

    # Call the aws s3 presign command and append the output to the output file
    if ! aws s3 presign "$s3_uri/$filename" --expires-in "$expires_in" >> "$output_file"; then
        echo "Error: failed to presign $s3_uri/$filename" >&2
        status=1
    fi
done

exit "$status"
71 changes: 71 additions & 0 deletions scripts/split-encrypted-files.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import argparse
import binascii
import contextlib
import os

# Progress bars are optional: use tqdm when it is installed, otherwise fall
# back to a stand-in with the same call shape that shows nothing.
try:
    from tqdm import tqdm
except ImportError:
    print("tqdm not installed. run `pip install tqdm` to see progress")

    def tqdm(iterable, *_args, **_kwargs):
        """Return ``iterable`` unchanged, ignoring all progress-bar options."""
        return iterable


def split_hex_file(input_filename, output_stem, num_files):
    """
    Split a file of hex strings into N length-delimited binary files.

    The input is read one line at a time. Each non-blank line is decoded from
    hex and written to a shard chosen round-robin by line number: line ``i``
    (0-based) goes to shard ``i % num_files``. A record is written as a 2-byte
    little-endian length followed by the decoded bytes. Blank lines are
    skipped so they do not produce zero-length records.

    :param input_filename: The name of the input file containing hex strings.
    :param output_stem: Prefix of the output paths; shard ``i`` is written to
        ``{output_stem}_shard_{i:03d}.bin``.
    :param num_files: The number of output files to split the input into.
    :raises ValueError: If ``num_files`` is less than 1.
    :raises binascii.Error: If a line is not a valid hex string.
    :raises OverflowError: If a decoded record is longer than 65535 bytes and
        so does not fit the 2-byte length prefix.
    """
    if num_files < 1:
        raise ValueError(f"num_files must be at least 1, got {num_files}")

    # Progress-bar total only: estimate the line count by assuming each line
    # is about 250 characters long. Read the size before creating any shard
    # so a missing input file leaves no empty outputs behind.
    approx_row_count = os.path.getsize(input_filename) // 250

    # ExitStack closes the input and every shard even if decoding or a write
    # fails part-way through.
    with contextlib.ExitStack() as stack:
        input_file = stack.enter_context(open(input_filename, "r"))
        output_files = [
            stack.enter_context(open(f"{output_stem}_shard_{i:03d}.bin", "wb"))
            for i in range(num_files)
        ]

        for i, line in enumerate(
            tqdm(input_file, desc="Processing lines", total=approx_row_count)
        ):
            # Remove any leading or trailing whitespace from the line
            line = line.strip()
            if not line:
                continue

            # Convert the hex string to bytes
            data = binascii.unhexlify(line)

            shard = output_files[i % num_files]
            # Write the length of the data as a 2-byte integer (little-endian)
            shard.write(len(data).to_bytes(2, byteorder="little"))
            # Write the data itself
            shard.write(data)


# Command-line entry point: parse the input path, output stem and shard count,
# then split the input file.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Splits a file of hex strings into N length-delimited binary files"
    )
    parser.add_argument(
        "-i", "--input_file", required=True, help="Input file containing hex strings"
    )
    parser.add_argument(
        "-o",
        "--output_stem",
        required=True,
        help="Output file stem for generated files",
    )
    parser.add_argument(
        "-n",
        "--num-files",
        type=int,
        required=True,
        help="Number of output files to split the input into",
    )
    args = parser.parse_args()

    # Reject a non-positive shard count here so the user gets a usage error
    # instead of a traceback from the modulo/indexing in the split loop.
    if args.num_files < 1:
        parser.error("--num-files must be at least 1")

    split_hex_file(args.input_file, args.output_stem, args.num_files)

0 comments on commit d6a244e

Please sign in to comment.