-
Notifications
You must be signed in to change notification settings - Fork 25
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Split encryption and s3 presigned urls scripts (#1507)
- Loading branch information
1 parent
62653e5
commit d6a244e
Showing
2 changed files
with
100 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
#!/bin/bash
#
# Generate a presigned S3 URL for every regular file in a local directory.
#
# For each file <dir_path>/<name>, a presigned URL for <s3_uri>/<name> is
# appended (one per line) to <output_file>. The local files are only used
# for their names; nothing is uploaded or downloaded.

# Set the usage message
usage="Usage: $0 <dir_path> <s3_uri> <output_file>"

# Example invocation
# from ipa/input_data_S02/
# ../scripts/presigned-s3-urls.sh encryptions/1B_cat/30_shards/ s3://stg-ipa-encrypted-reports/testing-sharded-data/1B/30_shards presigned_urls_30_shards.txt

# Check if the correct number of arguments were provided
if [ $# -ne 3 ]; then
    echo "$usage" >&2
    exit 1
fi

# Set the directory path, S3 URI and output file from the command-line arguments
dir_path="$1"
# Drop one trailing slash so "$s3_uri/$filename" never contains "//",
# which would name a different S3 key.
s3_uri="${2%/}"
output_file="$3"

if [ ! -d "$dir_path" ]; then
    echo "Error: '$dir_path' is not a directory" >&2
    echo "$usage" >&2
    exit 1
fi

# URL lifetime in seconds: 14 days (14 * 24 * 60 * 60).
# NOTE(review): the AWS CLI documents a maximum of 604800 seconds (7 days)
# for SigV4 presigned URLs - confirm that 14 days is honoured in this setup.
expires_in=1209600

status=0

# Iterate over the files in the directory
for file in "$dir_path"/*; do
    # Skip sub-directories, and the literal "*" left behind when the
    # directory is empty and the glob does not expand.
    [ -f "$file" ] || continue

    # Get the file name without the directory path
    filename=$(basename "$file")
    echo "Processing: $filename"

    # Call the aws s3 presign command and append the output to the output file
    if ! aws s3 presign "$s3_uri/$filename" --expires-in "$expires_in" >> "$output_file"; then
        echo "Error: failed to presign $s3_uri/$filename" >&2
        status=1
    fi
done

exit "$status"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
import argparse | ||
import binascii | ||
import os | ||
|
||
try:
    from tqdm import tqdm
except ImportError:
    print("tqdm not installed. run `pip install tqdm` to see progress")

    def tqdm(iterable, *args, **kwargs):
        """Stand-in used when tqdm is missing: return ``iterable`` unchanged."""
        return iterable


def split_hex_file(input_filename, output_stem, num_files):
    """
    Split a file of hex strings into N length-delimited binary shard files.

    Each non-blank input line is hex-decoded and written as one record: a
    2-byte little-endian length followed by the raw bytes. Records are dealt
    round-robin to ``{output_stem}_shard_000.bin``, ``..._001.bin``, and so on.
    Blank lines (for example a trailing newline at end of file) are skipped
    rather than written as zero-length records.

    :param input_filename: The name of the input file containing hex strings.
    :param output_stem: Path prefix used to name the generated shard files.
    :param num_files: The number of output files to split the input into.
    :raises ValueError: If ``num_files`` is less than 1.
    :raises binascii.Error: If a line is not a valid hex string.
    :raises OverflowError: If a decoded record is longer than 65535 bytes and
        so does not fit the 2-byte length prefix.
    """
    if num_files < 1:
        raise ValueError(f"num_files must be at least 1, got {num_files}")

    # Stat the input before creating any output, so a missing input file
    # does not leave empty shard files behind.
    input_filesize = os.path.getsize(input_filename)
    # Rough progress-bar total: assume each line is about 250 bytes.
    approx_row_count = input_filesize // 250

    output_files = []
    try:
        for shard_index in range(num_files):
            output_files.append(
                open(f"{output_stem}_shard_{shard_index:03d}.bin", "wb")
            )

        with open(input_filename, "r") as input_file:
            records_written = 0
            progress = tqdm(
                input_file,
                desc="Processing lines",
                # An estimate of zero is reported as "unknown" instead.
                total=approx_row_count or None,
            )
            for line in progress:
                # Remove any leading or trailing whitespace from the line
                line = line.strip()
                if not line:
                    continue

                # Convert the hex string to bytes
                data = binascii.unhexlify(line)

                shard = output_files[records_written % num_files]
                # Write the length of the data as a 2-byte integer (little-endian)
                shard.write(len(data).to_bytes(2, byteorder="little"))
                # Write the data itself
                shard.write(data)
                records_written += 1
    finally:
        # Close every shard that was opened, even if decoding failed part-way.
        for f in output_files:
            f.close()
|
||
|
||
if __name__ == "__main__":
    # Command-line entry point: parse the three required options and run the split.
    cli = argparse.ArgumentParser(
        description="Splits a file of hex strings into N length-delimited binary files"
    )
    cli.add_argument(
        "-i",
        "--input_file",
        help="Input file containing hex strings",
        required=True,
    )
    cli.add_argument(
        "-o",
        "--output_stem",
        help="Output file stem for generated files",
        required=True,
    )
    cli.add_argument(
        "-n",
        "--num-files",
        help="Number of output files to split the input into",
        required=True,
        type=int,
    )
    options = cli.parse_args()

    split_hex_file(options.input_file, options.output_stem, options.num_files)