diff --git a/python/Dockerfile b/python/Dockerfile
new file mode 100644
index 0000000..6483625
--- /dev/null
+++ b/python/Dockerfile
@@ -0,0 +1,10 @@
+FROM public.ecr.aws/lambda/python:3.9
+
+COPY requirements.txt .
+RUN pip3 install -r requirements.txt --target ${LAMBDA_TASK_ROOT}
+
+COPY lambda.py ${LAMBDA_TASK_ROOT}
+COPY anycostoci.py ${LAMBDA_TASK_ROOT}
+
+CMD ["lambda.anycost"]
+
diff --git a/python/anycostoci.py b/python/anycostoci.py
new file mode 100644
index 0000000..f5e64af
--- /dev/null
+++ b/python/anycostoci.py
@@ -0,0 +1,210 @@
+import calendar
+import gzip
+import json
+import os
+import re
+from datetime import datetime, timedelta
+
+import oci
+import pandas
+from dateutil.relativedelta import relativedelta
+
+
+def __create_or_verify_bdid_folder(start_date: datetime, output_dir: str, drop_id: str):
+    # Snap to the first of the month that start_date is in
+    first_of_the_month = start_date.replace(day=1)
+    # Calculate the last date of that month
+    last_date_of_the_month = calendar.monthrange(start_date.year, start_date.month)[1]
+    # Make a datetime for that last date
+    last_of_the_month = start_date.replace(day=last_date_of_the_month)
+    # Add 1 day to get the first of the next month
+    first_of_next_month = last_of_the_month + timedelta(days=1)
+
+    # Build the billing data ID - "YYYYMMDD-YYYYMMDD", from the 1st of the
+    # data month to the 1st of the following month
+    bdid_segment_1 = first_of_the_month.date().strftime("%Y%m%d")
+    bdid_segment_2 = first_of_next_month.date().strftime("%Y%m%d")
+
+    billing_data_id = bdid_segment_1 + "-" + bdid_segment_2
+    bdid_path = os.path.join(output_dir, billing_data_id)
+    drop_path = os.path.join(bdid_path, drop_id)
+
+    if not os.path.exists(drop_path):
+        print(f"Creating directory {drop_path}")
+        os.makedirs(drop_path, exist_ok=True)
+
+    return drop_path
+
+
+def __months_lookback(lookback_months: int) -> tuple:
+    """Compute the calendar month lookback_months ago. Returns a (start_date, end_date) tuple."""
+
+    start_date = datetime.utcnow().date() - relativedelta(months=lookback_months)
+    start_date = start_date.replace(day=1)
+
+    last_day_of_the_month = calendar.monthrange(start_date.year, start_date.month)[1]
+    end_date = start_date.replace(day=last_day_of_the_month)
+
+    print(f"Eval dates: {start_date} to {end_date}")
+    return start_date, end_date
+
+
+# Lookback 0 covers the current month (1st through the last day of this month)
+# Lookback 1 covers the previous month (1st through the last day of that month)
+# Lookback 2 covers the month before that, and so on
+def download_oci_cost_files(lookback_months: int, oci_config=None, output_dir='/tmp/') -> list:
+    """Download OCI cost reports for the lookback window. Returns a list of downloaded file paths on success."""
+    oci_config = oci_config or {}
+    oci.config.validate_config(oci_config)
+
+    object_storage = oci.object_storage.ObjectStorageClient(oci_config)
+
+    start_date, end_date = __months_lookback(lookback_months)
+    # Extend the end date, since the last day of the month reports in the
+    # following month.
+    report_end_date = end_date + timedelta(days=3)
+
+    # TODO: the point of pagination is to make repeated calls.
+    # Push this paginating into the fetch/comparison loop below.
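+    # A sketch of what that could look like (not wired in; assumes the usual
+    # Object Storage list_objects contract, where each response carries a
+    # next_start_with token that is fed back in via the `start` parameter):
+    #
+    #   next_start = None
+    #   while True:
+    #       page = object_storage.list_objects(
+    #           'bling', oci_config['tenancy'],
+    #           fields="name,timeCreated,size",
+    #           prefix="reports/cost-csv",
+    #           start=next_start)
+    #       # ...filter and download the matching objects in page.data.objects...
+    #       next_start = page.data.next_start_with
+    #       if next_start is None:
+    #           break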
+    report_bucket_objects = oci.pagination.list_call_get_all_results(
+        object_storage.list_objects,
+        'bling',
+        oci_config['tenancy'],
+        fields="name,timeCreated,size",
+        prefix="reports/cost-csv"
+    )
+
+    downloaded_reports = []
+    for o in report_bucket_objects.data.objects:
+        if start_date <= o.time_created.date() <= report_end_date:
+            this_report = object_storage.get_object('bling', oci_config['tenancy'], o.name)
+            filename_path = os.path.join(output_dir, o.time_created.strftime("%Y%m%d%H%M%S%Z") + ".csv.gz")
+            with open(filename_path, "wb") as f:
+                for chunk in this_report.data.raw.stream(1024 * 1024, decode_content=False):
+                    f.write(chunk)
+            downloaded_reports.append(filename_path)
+            print(f"File {filename_path} downloaded - created {o.time_created}")
+
+    return downloaded_reports
+
+
+def build_anycost_drop_from_oci_files(lookback_months: int,
+                                      oci_cost_files_dir='/tmp/',
+                                      output_dir='/tmp/anycost_drop/') -> set:
+    """Take a directory of gzipped OCI cost reports and build an AnyCost drop out of them.
+
+    Evaluates the files in oci_cost_files_dir to see their begin/end dates.
+    Creates a CBF-drop-formatted directory and file structure in output_dir.
+    Creates a CBF manifest.json pointing to the new files.
+
+    Returns a set of paths to the created drop folders under output_dir.
+    """
+    # The CBF drop folder structure is like:
+    #   output_dir/<billing_data_id>/<drop_id>/<data files>
+    # Ex:
+    #   output_dir/20220101-20220201/20220128000000Z/data_file[0...N].csv.gz
+    #   output_dir/20220101-20220201/manifest.json
+
+    start_date, end_date = __months_lookback(lookback_months)
+
+    drop_id = datetime.utcnow().strftime("%Y%m%d%H%M%S%Z")
+    drop_paths = set()
+
+    for root, dirs, cost_files in os.walk(oci_cost_files_dir):
+        # It would be swell if this yielded the files in order.
+        # The filenames are ordered numbers, so we could display progress.
+        for cost_file in cost_files:
+            if not re.match(r".*\.csv\.gz$", cost_file):
+                continue
+
+            with gzip.open(os.path.join(root, cost_file), 'rb') as f:
+                print(f"Processing file {root}/{cost_file}...")
+                try:
+                    oci_cost = pandas.read_csv(f)
+                except pandas.errors.EmptyDataError:
+                    print(f"No rows read from file {root}/{cost_file}")
+                    continue
+
+            # Start building the CBF-formatted frame
+            cbf_frame = pandas.DataFrame([])
+
+            cbf_frame.insert(0, 'lineitem/id', oci_cost.loc[:, 'lineItem/referenceNo'])
+            # AFAICT all cost types in OCI are 'Usage', with the possible
+            # exception of 'Adjustment's for rows with isCorrection=True.
+            # Depending on how corrections are handled, we may not need
+            # to show that.
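+            # If corrections do need to surface, one possible sketch (this
+            # assumes the report exposes a boolean 'lineItem/isCorrection'
+            # column, which is not verified here):
+            #
+            #   line_type = oci_cost['lineItem/isCorrection'].map(
+            #       lambda c: 'Adjustment' if c else 'Usage')
+            #   cbf_frame.insert(1, 'lineitem/type', line_type)
+            #
+            # For now, every row is emitted as 'Usage'.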
+            cbf_frame.insert(1, 'lineitem/type', 'Usage')
+            cbf_frame.insert(2, 'lineitem/description', oci_cost.loc[:, 'product/Description'])
+            cbf_frame.insert(3, 'time/usage_start', oci_cost.loc[:, 'lineItem/intervalUsageStart'])
+            cbf_frame.insert(4, 'time/usage_end', oci_cost.loc[:, 'lineItem/intervalUsageEnd'])
+            cbf_frame.insert(5, 'resource/id', oci_cost.loc[:, 'product/resourceId'])
+            cbf_frame.insert(6, 'resource/service', oci_cost.loc[:, 'product/service'])
+            cbf_frame.insert(7, 'resource/account', oci_cost.loc[:, 'lineItem/tenantId'])
+            cbf_frame.insert(8, 'resource/region', oci_cost.loc[:, 'product/region'])
+            cbf_frame.insert(9, 'action/account', oci_cost.loc[:, 'lineItem/tenantId'])
+            cbf_frame.insert(10, 'usage/amount', oci_cost.loc[:, 'usage/billedQuantity'])
+            cbf_frame.insert(11, 'cost/cost', oci_cost.loc[:, 'cost/myCost'])
+
+            # Tags
+            for c in oci_cost.columns:
+                match = re.match(r'^tags/(?P<tag_key>.*)', c)
+                if match:
+                    oci_tag_key = match.group('tag_key')
+                    oci_tag_key_cleaned = re.sub(r"[^a-zA-Z0-9_.:+@=\-/]+", '', oci_tag_key)
+
+                    if len(oci_tag_key) != len(oci_tag_key_cleaned):
+                        print("Warning: some characters were stripped from an OCI tag column.")
+                        print(f"Column '{oci_tag_key}' contained invalid characters.")
+
+                    tag_column = "resource/tag:" + oci_tag_key_cleaned
+                    cbf_frame.insert(len(cbf_frame.columns), tag_column, oci_cost.loc[:, c])
+
+            # This section prunes the CBF frame to contain only rows with
+            # usage_start dates within the BDID boundary.
+
+            # Parse the usage timestamps so we can compare them
+            cbf_frame['time/usage_start'] = pandas.to_datetime(cbf_frame['time/usage_start'], cache=True)
+            cbf_frame['time/usage_end'] = pandas.to_datetime(cbf_frame['time/usage_end'], cache=True)
+
+            # Create new date-only columns so we can prune against those.
+            # Note the .dt accessor refers to the datetime object inside the column.
+            cbf_frame['time/usage_start_date'] = cbf_frame['time/usage_start'].dt.date
+            cbf_frame['time/usage_end_date'] = cbf_frame['time/usage_end'].dt.date
+
+            # CBF treats all usage with start dates within the BDID window
+            # as valid for that window, so we look at the start date of
+            # every row to see whether it belongs within the window.
+            if start_date:  # conditional here since @start_date is referenced inside the query string
+                cbf_frame.query('`time/usage_start_date` >= @start_date', inplace=True)
+
+            if end_date:
+                cbf_frame.query('`time/usage_start_date` <= @end_date', inplace=True)
+
+            # Finally, drop the _date columns since they don't belong
+            # in the output.
+            cbf_frame.drop(columns=['time/usage_start_date', 'time/usage_end_date'], inplace=True)
+
+            # Dump to disk, assuming we have any rows left
+            if len(cbf_frame) > 0:
+                drop_path = __create_or_verify_bdid_folder(cbf_frame.head(1).iat[0, 3], output_dir, drop_id)
+                cbf_file_path = os.path.join(drop_path, os.path.basename(cost_file))
+                print(f"Writing CBF file to {cbf_file_path}")
+                cbf_frame.to_csv(cbf_file_path, index=False)
+                drop_paths.add(drop_path)
+            else:
+                print(f"No rows remaining after date window prune in file {cost_file}")
+
+    # Emit a manifest pointing at our current drop.
+    # There should only be one entry in drop_paths, despite the for statement.
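+    # The manifest written below is minimal; for a drop ID like
+    # 20230130225601 it would read:
+    #
+    #   {"version": "1.0.0", "current_drop_id": "20230130225601"}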
+    for d in drop_paths:
+        manifest = {
+            "version": "1.0.0",
+            "current_drop_id": drop_id
+        }
+
+        with open(os.path.join(os.path.dirname(d), "manifest.json"), 'w') as f:
+            json.dump(manifest, f)
+
+    return drop_paths
diff --git a/python/cli.py b/python/cli.py
new file mode 100755
index 0000000..41a258a
--- /dev/null
+++ b/python/cli.py
@@ -0,0 +1,76 @@
+#!/usr/bin/env python3.9
+
+import argparse
+import os
+import sys
+
+import oci
+
+import anycostoci
+
+parser = argparse.ArgumentParser()
+
+# AnyCost-side configuration, for writing to S3 (parsed but not yet used here)
+parser.add_argument("-aki", "--access-key-id")
+parser.add_argument("-sak", "--secret-access-key")
+parser.add_argument("-b", "--anycost-s3-bucket")
+
+# OCI-side configuration: where the config file lives.
+# The config should contain the user OCID, tenancy OCID, and keyfile location.
+parser.add_argument("-o", "--oci-config-file")
+
+# The temp dir stores the OCI cost files; the output dir holds the actual drop.
+parser.add_argument("-t", "--temp-dir", default="/tmp/")
+parser.add_argument("-d", "--output-dir", default="/tmp/anycost_drop")
+
+# Months of history to evaluate and store in the AnyCost drop
+parser.add_argument("-m", "--lookback-months", default=1, type=int)
+
+args = parser.parse_args()
+
+print(f"Args given: {args}")
+
+temp_dir = args.temp_dir
+
+# Filesystem sanity check
+try:
+    oci_write_dir = os.path.join(temp_dir, "oci_cost_files")
+    os.makedirs(oci_write_dir, exist_ok=True)
+    anycost_drop_dir = args.output_dir
+    os.makedirs(anycost_drop_dir)
+except FileExistsError:
+    print(f"Path exists: {anycost_drop_dir}")
+    sys.exit(1)
+
+if args.oci_config_file is None:
+    oci_file = oci.config.DEFAULT_LOCATION
+else:
+    oci_file = args.oci_config_file
+
+# Hydrate the OCI config and download --lookback-months worth of cost files
+oci_config = oci.config.from_file(oci_file, oci.config.DEFAULT_PROFILE)
+print(f"OCI Config: {oci_config}")
+
+downloaded_reports = anycostoci.download_oci_cost_files(
+    args.lookback_months,
+    oci_config=oci_config,
+    output_dir=oci_write_dir)
+
+output_paths = anycostoci.build_anycost_drop_from_oci_files(
+    args.lookback_months,
+    oci_cost_files_dir=oci_write_dir,
+    output_dir=anycost_drop_dir
+)
+
+print("Created drops in:")
+print(output_paths)
\ No newline at end of file
diff --git a/python/lambda.py b/python/lambda.py
new file mode 100644
index 0000000..3ca5ad0
--- /dev/null
+++ b/python/lambda.py
@@ -0,0 +1,98 @@
+import os
+import tempfile
+
+import boto3
+import oci
+import s3fs
+
+import anycostoci
+
+# Call with event data to specify the lookback, e.g.:
+#   curl -d '{"lookback_months": 1}'
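+#
+# A sketch of a full local invocation (assumes the container is running the
+# Lambda Runtime Interface Emulator bundled with the base image, published
+# with `docker run -p 9000:8080 <image>`):
+#
+#   curl -d '{"lookback_months": 1}' \
+#     "http://localhost:9000/2015-03-31/functions/function/invocations"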
+
+def __load_oci_config(params_path: str) -> dict:
+    client = boto3.client('ssm')
+    config = {}
+
+    config['user'] = client.get_parameter(
+        Name=params_path + "oci-user")['Parameter']['Value']
+    config['key_content'] = client.get_parameter(
+        Name=params_path + "oci-key-content",
+        WithDecryption=True)['Parameter']['Value']
+    config['fingerprint'] = client.get_parameter(
+        Name=params_path + "oci-key-fingerprint")['Parameter']['Value']
+    config['tenancy'] = client.get_parameter(
+        Name=params_path + "oci-tenancy")['Parameter']['Value']
+    config['region'] = client.get_parameter(
+        Name=params_path + "oci-region")['Parameter']['Value']
+    oci.config.validate_config(config)
+    return config
+
+def anycost(event, context):
+
+    # Hydrate the OCI configuration for downloading
+    params_path = os.environ.get('SSM_PARAMETER_STORE_FOLDER_PATH')
+    oci_config = __load_oci_config(params_path)
+
+    # Hydrate the S3 config
+    ssm = boto3.client('ssm')
+    cbf_s3_bucket = ssm.get_parameter(Name=params_path + 's3-bucket')['Parameter']['Value']
+    cbf_s3_prefix = ssm.get_parameter(Name=params_path + 's3-bucket-prefix')['Parameter']['Value']
+
+    # Check event arguments
+    lookback_months = 1
+    if 'lookback_months' in event:
+        lookback_months = event['lookback_months']
+    print(f"Looking back {lookback_months} month(s)")
+
+    # Temp dir management for Lambda temp storage
+    temp_dir = tempfile.TemporaryDirectory(dir="/tmp/")
+    try:
+        oci_write_dir = os.path.join(temp_dir.name, "oci_cost_files")
+        os.makedirs(oci_write_dir, exist_ok=True)
+        anycost_drop_dir = os.path.join(temp_dir.name, "anycost_drop")
+        os.makedirs(anycost_drop_dir, exist_ok=True)
+
+        anycostoci.download_oci_cost_files(
+            lookback_months=lookback_months,
+            oci_config=oci_config,
+            output_dir=oci_write_dir
+        )
+
+        output_drops = anycostoci.build_anycost_drop_from_oci_files(
+            lookback_months=lookback_months,
+            oci_cost_files_dir=oci_write_dir,
+            output_dir=anycost_drop_dir,
+        )
+
+        # output_drops looks like:
+        #   {'/tmp/tmp425_p9ui/anycost_drop/20230101-20230201/20230130225601'}
+        print(output_drops)
+
+        # Walk the output_drops and put them all in the target S3 bucket
+        s3 = s3fs.S3FileSystem()
+        for drop in output_drops:
+            drop_id = os.path.basename(drop)
+            drop_bdid_dir = os.path.dirname(drop)
+            drop_bdid = os.path.basename(drop_bdid_dir)
+
+            # Should be /bucket/prefix/20230101-20230201/<drop_id>/
+            s3_drop_path = os.path.join(cbf_s3_bucket, cbf_s3_prefix, drop_bdid, drop_id + "/")
+
+            # Note: without the trailing slash on the destination, s3fs put the
+            # files directly under the BDID prefix instead of creating the
+            # drop_id prefix.
+            print(f"Putting dir: {drop} to S3 path {s3_drop_path}")
+            s3.put(drop, s3_drop_path, recursive=True)
+
+            # manifest.json
+            # Should be /tmp/tmp425_p9ui/anycost_drop/20230101-20230201/manifest.json
+            manifest_tmp_path = os.path.join(drop_bdid_dir, "manifest.json")
+            manifest_s3_path = os.path.join(cbf_s3_bucket, cbf_s3_prefix, drop_bdid, 'manifest.json')
+            print(f"Putting {manifest_tmp_path} to S3 path {manifest_s3_path}")
+            s3.put_file(manifest_tmp_path, manifest_s3_path)
+
+    # Cleanup for the next invocation
+    finally:
+        temp_dir.cleanup()
\ No newline at end of file
diff --git a/python/requirements.txt b/python/requirements.txt
new file mode 100644
index 0000000..043be4a
--- /dev/null
+++ b/python/requirements.txt
@@ -0,0 +1,5 @@
+pandas
+oci
+boto3
+s3fs
+python-dateutil
\ No newline at end of file