Initial move from internal tool to CZ repo

Cloudzero · Mar 6, 2023 · b27e09c · b27e09c
1 parent e9e70ae
commit b27e09c
Show file tree

Hide file tree

Showing 5 changed files with 399 additions and 0 deletions.
diff --git a/python/Dockerfile b/python/Dockerfile
@@ -0,0 +1,10 @@
+FROM public.ecr.aws/lambda/python:3.9
+
+COPY requirements.txt . 
+RUN pip3 install -r requirements.txt --target ${LAMBDA_TASK_ROOT}
+
+COPY lambda.py ${LAMBDA_TASK_ROOT}
+COPY anycostoci.py ${LAMBDA_TASK_ROOT}
+
+CMD ["lambda.anycost"]
+
diff --git a/python/anycostoci.py b/python/anycostoci.py
@@ -0,0 +1,210 @@
+import calendar
+import json
+import re
+import oci
+import datetime
+from datetime import datetime
+from datetime import timedelta
+from datetime import date
+import dateutil
+from dateutil import relativedelta
+import sys
+import os
+import gzip
+import pandas
+
+def __create_or_verify_bdid_folder(start_date: datetime, output_dir: str, drop_id: str):
+    # Snap to the first of the month that start_date is in
+    first_of_the_month = start_date.replace(day=1)
+    # Calculate the last date of that month
+    last_date_of_the_month = calendar.monthrange(start_date.year, start_date.month)[1]
+    # Make a datetime for that last date. 
+    last_of_the_month = start_date.replace(day=last_date_of_the_month)
+    # Add 1 day to get the first of the next month
+    first_of_next_month = last_of_the_month + timedelta(days=1)
+
+    # Build billing data ID - "YYYYMMDD-YYYMMDD" of 1st of data month to 1st of following month
+    bdid_segment_1 = first_of_the_month.date().strftime("%Y%m%d")
+    bdid_segment_2 = first_of_next_month.strftime("%Y%m%d")
+
+    billing_data_id = bdid_segment_1 + "-" + bdid_segment_2
+    bdid_path = os.path.join(output_dir, billing_data_id)
+    drop_path = os.path.join(bdid_path, drop_id)
+
+    if not os.path.exists(drop_path):
+        print(f"Creating directory {drop_path}")
+        os.makedirs(drop_path, exist_ok = True)
+
+    return drop_path
+
+
+def __months_lookback(lookback_months: int) -> dict:
+    """How many previous months to look back and stop. Returns tuple of <date>, <date>"""
+
+    start_date = datetime.utcnow().date() - dateutil.relativedelta.relativedelta(months=lookback_months)
+    start_date = start_date.replace(day=1)
+
+    # end_date = datetime.utcnow().date() - dateutil.relativedelta.relativedelta(months=1)
+    last_day_of_the_month = calendar.monthrange(start_date.year, start_date.month)[1]
+    end_date = start_date.replace(day=last_day_of_the_month)
+
+    print(f"Eval dates: {start_date} to {end_date}")
+    return start_date, end_date
+
+
+# 0 lookback starts from first of current month to today
+# 1 lookback starts from 1st of previous month to last day of previous month
+# 2 lookback starts from 1st of next-previous month to last day of that month
+def download_oci_cost_files(lookback_months: int, oci_config = {}, output_dir = '/tmp/' ) -> slice:
+    """Download OCI cost reports between start_date and end_date. Returns slice of downloaded filenames on success."""
+    oci.config.validate_config(oci_config)
+
+    object_storage = oci.object_storage.ObjectStorageClient(oci_config)
+
+    start_date, end_date = __months_lookback(lookback_months)
+    # Extend the end date since the last day of the month reports in the
+    # following month.
+    report_end_date = end_date + timedelta(days=3)
+
+    # TODO: the point of pagination is to make repeated calls. Push this paginating into the fetch/comparison loop function
+    report_bucket_objects = oci.pagination.list_call_get_all_results(
+                            object_storage.list_objects,
+                            'bling',
+                            oci_config['tenancy'],
+                            fields="name,timeCreated,size",
+                            prefix="reports/cost-csv"
+                            )
+
+    downloaded_reports = []
+    for o in report_bucket_objects.data.objects:
+        # print(f"Report Created: {o.time_created.date()} Earliest date: {datetime.strptime('2022-01-01', '%Y-%m-%d').date()}")
+        if (o.time_created.date() >= start_date and
+            o.time_created.date() <= report_end_date ):
+            this_report = object_storage.get_object('bling', oci_config['tenancy'], o.name)
+            filename_path = os.path.join(output_dir, o.time_created.strftime("%Y%m%d%H%M%S%Z") + ".csv.gz")
+            with open(filename_path, "wb", ) as f:
+                for chunk in this_report.data.raw.stream(1024 * 1024, decode_content=False):
+                    f.write(chunk)
+            downloaded_reports.append(filename_path)
+            print(f"File {filename_path} Downloaded - created {o.time_created}")
+
+
+    return downloaded_reports
+
+def build_anycost_drop_from_oci_files(lookback_months: int,
+                                      oci_cost_files_dir = '/tmp/', 
+                                      output_dir = '/tmp/anycost_drop/') -> slice:
+    """Take a directory of gzipped OCI cost reports and build an AnyCost drop out of them.
+
+    Evaluates the files in oci_cost_files_dir to see their begin/end dates.
+    Creates a CBF-drop-formatted directory and file structure in output_dir.
+    Creates a CBF manifest.json pointing to the new files.
+
+    Returns a set of paths to created billing data ID folders under output_dir
+    """
+    # CBF drop folder structure is like:
+    # output_dir/<billing_data_id>/<drop_id>/<data_file>
+    # Ex:
+    # output_dir/20220101-20220201/20220128000000Z/data_file[0...N].csv.gz
+    # output_dir/20220101-20220201/manifest.json
+
+    start_date, end_date = __months_lookback(lookback_months)
+
+    drop_id = datetime.utcnow().strftime("%Y%m%d%H%M%S%Z")
+    drop_paths = set()
+
+    for root, dirs, cost_files in os.walk(oci_cost_files_dir):
+        # It would be swell if this yielded the files in order. 
+        # The filenames are ordered numbers and we could display progress
+        for cost_file in cost_files: 
+            if not re.match(".*\.csv\.gz$", cost_file):
+                continue
+
+            with gzip.open(os.path.join(root, cost_file), 'rb') as f:
+                print(f"Processing file {root}/{cost_file}...")
+                try:
+                    oci_cost = pandas.read_csv(f)
+                except pandas.errors.EmptyDataError:
+                    print(f"No rows read from file {root}/{cost_file}")
+
+                # Start building the CBF formatted frame
+                cbf_frame = pandas.DataFrame([])
+
+                cbf_frame.insert(0, 'lineitem/id', oci_cost.loc[:, 'lineItem/referenceNo'])
+                # AFAICT all cost types in OCI are 'Usage', with the possible
+                # exception of 'Adjustment's for rows with isCorrection=True.
+                # Depending on how corrections are handled we may not need
+                # to show that.
+                cbf_frame.insert(1, 'lineitem/type', 'Usage')
+                cbf_frame.insert(2, 'lineitem/description', oci_cost.loc[:, 'product/Description'])
+                cbf_frame.insert(3, 'time/usage_start', oci_cost.loc[:, 'lineItem/intervalUsageStart'])
+                cbf_frame.insert(4, 'time/usage_end', oci_cost.loc[:, 'lineItem/intervalUsageEnd'])
+                cbf_frame.insert(5, 'resource/id', oci_cost.loc[:, 'product/resourceId'])
+                cbf_frame.insert(6, 'resource/service', oci_cost.loc[:, 'product/service'])
+                cbf_frame.insert(7, 'resource/account', oci_cost.loc[:, 'lineItem/tenantId'])
+                cbf_frame.insert(8, 'resource/region', oci_cost.loc[:, 'product/region'])
+                cbf_frame.insert(9, 'action/account', oci_cost.loc[:, 'lineItem/tenantId'])
+                cbf_frame.insert(10, 'usage/amount', oci_cost.loc[:, 'usage/billedQuantity'])
+                cbf_frame.insert(11, 'cost/cost', oci_cost.loc[:, 'cost/myCost'])
+
+                #Tags
+                for c in oci_cost.columns:
+                    match = re.match('^tags\/(?P<tag_key>.*)', c)
+                    if match:
+                        oci_tag_key = match.group('tag_key')
+                        oci_tag_key_cleaned = re.sub("[^a-zA-Z0-9\_\.\:\+\@\=\-\/]+", '', oci_tag_key)
+
+                        if len(oci_tag_key) != len(oci_tag_key_cleaned):
+                            print("Warning: Some characters were stripped from OCI tag column.")
+                            print(f"Column '{oci_tag_key}' contained invalid characters.")
+
+                        tag_column = "resource/tag:" + oci_tag_key_cleaned
+                        cbf_frame.insert(len(cbf_frame.columns), tag_column, oci_cost.loc[:, c])
+
+                # This section prunes the CBF frames to contain only rows with
+                # usage_start dates within the BDID boundary.
+
+                # Format the usage timestamps so we can parse them
+                cbf_frame['time/usage_start'] = pandas.to_datetime(cbf_frame['time/usage_start'], cache=True)
+                cbf_frame['time/usage_end']   = pandas.to_datetime(cbf_frame['time/usage_end'], cache=True)
+
+                # Create new date-only timestamp columns so we can look at those for pruning
+                # note the .dt property refers to the datetime object inside the column 
+                cbf_frame['time/usage_start_date'] = cbf_frame['time/usage_start'].dt.date
+                cbf_frame['time/usage_end_date']   = cbf_frame['time/usage_end'].dt.date
+
+                # CBF treats all usage with start dates within the BDID window
+                # as valid for that window. So we look at the start date of
+                # every row to see whether it belongs within the window.
+                if start_date: # conditional here since @start_date is inside the string
+                    cbf_frame.query('`time/usage_start_date` >= @start_date', inplace=True)
+
+                if end_date:
+                    cbf_frame.query('`time/usage_start_date` <= @end_date', inplace=True)
+
+                # Finally, let's drop the _date columns since they don't belong
+                # in the output.
+                cbf_frame.drop(columns=['time/usage_start_date', 'time/usage_end_date'], inplace=True)
+
+                # Dump to disk, assuming we have any rows left
+                if len(cbf_frame) > 0:
+                    drop_path = __create_or_verify_bdid_folder(cbf_frame.head(1).iat[0,3], output_dir, drop_id)
+                    cbf_file_path = os.path.join(output_dir, drop_path, os.path.basename(cost_file))
+                    print(f"Writing CBF file to {cbf_file_path}")
+                    cbf_frame.to_csv(cbf_file_path, index=False)
+                    drop_paths.add(drop_path)
+                else:
+                    print(f"No rows remaining after date window prune in file {cost_file}")
+
+    # Emit manifest pointing at our current drop.
+    # There should only be 1, despite the for statement. 
+    for d in drop_paths:
+        manifest = {
+            "version": "1.0.0",
+            "current_drop_id": drop_id
+        }
+
+        with open(os.path.join(os.path.dirname(d), "manifest.json"), 'w') as f:
+            json.dump(manifest, f)
+
+    return drop_paths
diff --git a/python/cli.py b/python/cli.py
@@ -0,0 +1,76 @@
+#!/usr/bin/env python3.9
+
+import argparse
+import json
+import oci
+import boto3
+import sys
+import os
+import gzip
+from datetime import datetime
+from datetime import date
+from datetime import timedelta
+import calendar
+import pandas
+import tempfile
+import anycostoci
+
+parser = argparse.ArgumentParser()
+
+# AnyCost side configuration, for writing to s3
+parser.add_argument("-aki", "--access-key-id")
+parser.add_argument("-sak", "--secret-access-key")
+parser.add_argument("-b", "--anycost-s3-bucket")
+
+# OCI side configuration, where is my config file
+# Config should contain user OCID, tenancy OCID, and keyfile location
+parser.add_argument("-o", "--oci-config-file")
+
+# Temp dir stores the OCI cost files, output is the actual drop.
+parser.add_argument("-t", "--temp-dir", default="/tmp/")
+parser.add_argument("-d", "--output-dir", default="/tmp/anycost_drop")
+
+# Months of history to evaluate and store in AnyCost drop
+parser.add_argument("-m", "--lookback-months", default=1, type=int)
+
+args = parser.parse_args()
+
+print(f"Args given: {args}")
+
+# temp_dir = tempfile.TemporaryDirectory(dir=args.temp_dir)
+temp_dir = '/tmp/'
+
+# Filesystem sanity check
+try:
+  oci_write_dir = os.path.join(temp_dir, "oci_cost_files")
+  os.makedirs(oci_write_dir, exist_ok=True)
+  anycost_drop_dir = args.output_dir
+  os.makedirs(anycost_drop_dir)
+except FileExistsError as fee:
+  print(f"Path exists: {anycost_drop_dir}")
+  exit(1)
+
+oci_file = ""
+if args.oci_config_file == None:
+    oci_file = oci.config.DEFAULT_LOCATION
+else:
+    oci_file = args.oci_config_file
+
+# Hydrate OCI config and download --lookback-months worth of cost files
+
+oci_config = oci.config.from_file(oci_file, oci.config.DEFAULT_PROFILE)
+print(f"OCI Config: {oci_config}")
+
+downloaded_reports = anycostoci.download_oci_cost_files(
+                        args.lookback_months, 
+                        oci_config = oci_config,
+                        output_dir = oci_write_dir)
+
+output_paths = anycostoci.build_anycost_drop_from_oci_files(
+  args.lookback_months, 
+  oci_cost_files_dir = oci_write_dir, 
+  output_dir = anycost_drop_dir
+)
+
+print("Created drops in:")
+print(output_paths)
diff --git a/python/lambda.py b/python/lambda.py
@@ -0,0 +1,98 @@
+import anycostoci
+import sys
+import os
+import datetime
+import boto3
+import s3fs
+import oci
+import tempfile
+from datetime import timedelta
+from datetime import datetime
+
+# Call with event data to specify lookback:
+# curl -d '{"lookback_months": 1 }' for example.
+
+def __load_oci_config(params_path: str) -> dict:
+    client = boto3.client('ssm')
+    config = {}
+
+    config['user'] = client.get_parameter(
+        Name = params_path + "oci-user")['Parameter']['Value']
+    config['key_content'] = client.get_parameter(
+        Name = params_path + "oci-key-content",
+        WithDecryption = True )['Parameter']['Value']
+    config['fingerprint'] = client.get_parameter(
+        Name = params_path + "oci-key-fingerprint")['Parameter']['Value']
+    config['tenancy'] = client.get_parameter(
+        Name = params_path + "oci-tenancy" )['Parameter']['Value']
+    config['region'] = client.get_parameter(
+        Name = params_path + "oci-region" )['Parameter']['Value']
+    oci.config.validate_config(config)
+    return config
+
+def anycost(event, context):
+
+    # hydrate the OCI configuration for downloading 
+    params_path = os.environ.get('SSM_PARAMETER_STORE_FOLDER_PATH')
+    oci_config = __load_oci_config(params_path)
+
+    # hydrate the S3 config
+    ssm = boto3.client('ssm')
+    cbf_s3_bucket = ssm.get_parameter(Name=params_path+'s3-bucket')['Parameter']['Value']
+    cbf_s3_prefix = ssm.get_parameter(Name=params_path+'s3-bucket-prefix')['Parameter']['Value']
+
+    # Check event arguments
+    lookback_months = 1
+    if 'lookback_months' in event:
+        lookback_months = event['lookback_months']
+    print(f"Looking back {lookback_months} months ago")
+
+    # temp dir management for Lambda temp storage
+    temp_dir = tempfile.TemporaryDirectory(dir="/tmp/")
+    try: 
+        oci_write_dir = os.path.join(temp_dir.name, "oci_cost_files")
+        os.makedirs(oci_write_dir, exist_ok=True)
+        anycost_drop_dir = os.path.join(temp_dir.name, "anycost_drop")
+        os.makedirs(anycost_drop_dir, exist_ok=True)
+
+        anycostoci.download_oci_cost_files(
+            lookback_months = lookback_months,
+            oci_config = oci_config,
+            output_dir = oci_write_dir
+        )
+
+        output_drops = anycostoci.build_anycost_drop_from_oci_files(
+            lookback_months = lookback_months,
+            oci_cost_files_dir = oci_write_dir,
+            output_dir = anycost_drop_dir,
+        )
+
+        # output_drops:
+        # {'/tmp/tmp425_p9ui/anycost_drop/20230101-20230201/20230130225601'}
+        print(output_drops)
+
+        # walk the output_drops and plop them all in the target s3 bucket        
+        s3 = s3fs.S3FileSystem()
+        for drop in output_drops:
+            drop_id       = os.path.basename(drop)
+            drop_bdid_dir = os.path.dirname(drop)
+            drop_bdid     = os.path.basename(drop_bdid_dir)
+
+            # should be /bucket/prefix/20230101-20230201/
+            s3_drop_path = os.path.join(cbf_s3_bucket, cbf_s3_prefix, drop_bdid, drop_id + "/")
+
+            # when new month used, didn't create drop_id prefix, it just put the files directly in the bdid prefix
+            print(f"Putting dir: {drop} to S3 path {s3_drop_path}")
+            s3.put(drop, s3_drop_path, recursive=True)
+
+            # manifest.json
+            #should be /tmp/tmp425_p9ui/anycost_drop/20230101-20230201/manifest.json
+            manifest_tmp_path = os.path.join(drop_bdid_dir, "manifest.json")
+            manifest_s3_path = os.path.join(cbf_s3_bucket, cbf_s3_prefix, drop_bdid, 'manifest.json')
+            print(f"Putting {manifest_tmp_path} to S3 path {manifest_s3_path}")
+            s3.put_file(manifest_tmp_path, manifest_s3_path)
+
+
+        # cleanup for next invocation
+    finally:
+        temp_dir.cleanup()