From a1ce5112d05c558ccdea3a01661a35de68e3cd01 Mon Sep 17 00:00:00 2001 From: "Gergely Daroczi (@daroczig)" Date: Tue, 20 Feb 2024 14:08:32 +0100 Subject: [PATCH 01/33] prefix and group private helpers --- src/sc_crawler/schemas.py | 6 +- src/sc_crawler/vendors/aws.py | 508 +++++++++++++++++----------------- 2 files changed, 260 insertions(+), 254 deletions(-) diff --git a/src/sc_crawler/schemas.py b/src/sc_crawler/schemas.py index 3d883a9d..d3207f69 100644 --- a/src/sc_crawler/schemas.py +++ b/src/sc_crawler/schemas.py @@ -229,8 +229,8 @@ def get_zones(self): """Get zones of the vendor from its datacenters.""" return self._methods.get_zones(self) - def get_instance_types(self): - return self._methods.get_instance_types(self) + def get_servers(self): + return self._methods.get_servers(self) def get_prices(self): return self._methods.get_prices(self) @@ -238,7 +238,7 @@ def get_prices(self): def get_all(self): self.get_datacenters() self.get_zones() - self.get_instance_types() + self.get_servers() self.get_prices() return diff --git a/src/sc_crawler/vendors/aws.py b/src/sc_crawler/vendors/aws.py index a86a0b4d..cd863b88 100644 --- a/src/sc_crawler/vendors/aws.py +++ b/src/sc_crawler/vendors/aws.py @@ -15,11 +15,11 @@ set_default_params(caching_enabled=False, stale_after=timedelta(days=1)) # ############################################################################## -# AWS cached helpers +# Cached boto3 wrappers @cachier() -def describe_instance_types(region): +def _boto_describe_instance_types(region): ec2 = boto3.client("ec2", region_name=region) pages = ec2.get_paginator("describe_instance_types") pages = pages.paginate().build_full_result() @@ -27,13 +27,13 @@ def describe_instance_types(region): @cachier() -def describe_regions(): +def _boto_describe_regions(): ec2 = boto3.client("ec2") return ec2.describe_regions().get("Regions", []) @cachier() -def describe_availability_zones(region): +def _boto_describe_availability_zones(region): ec2 = boto3.client("ec2", 
region_name=region) zones = ec2.describe_availability_zones( Filters=[ @@ -45,7 +45,7 @@ def describe_availability_zones(region): @cachier() -def get_price_list(region): +def _boto_price_list(region): """Download published AWS price lists. Currently unused.""" # pricing API is only available in a few regions client = boto3.client("pricing", region_name="us-east-1") @@ -62,7 +62,7 @@ def get_price_list(region): @cachier() -def get_products(): +def _boto_get_products(): # pricing API is only available in a few regions client = boto3.client("pricing", region_name="us-east-1") filters = { @@ -92,6 +92,245 @@ def get_products(): # ############################################################################## +# Internal helpers + +_instance_families = { + "a": "AWS Graviton", + "c": "Compute optimized", + "d": "Dense storage", + "dl": "Deep Learning", + "f": "FPGA", + "g": "Graphics intensive", + "h": "Cost-effective storage optimized with HDD", + "hpc": "High performance computing", + "i": "Storage optimized", + "im": "Storage optimized with a one to four ratio of vCPU to memory", + "is": "Storage optimized with a one to six ratio of vCPU to memory", + "inf": "AWS Inferentia", + "m": "General purpose", + "mac": "macOS", + "p": "GPU accelerated", + "r": "Memory optimized", + "t": "Burstable performance", + "trn": "AWS Trainium", + "u": "High memory", + "vt": "Video transcoding", + "x": "Memory intensive", + "z": "High frequency", +} + +_instance_suffixes = { + # Processor families + "a": "AMD processors", + "g": "AWS Graviton processors", + "i": "Intel processors", + # Additional capabilities + "d": "Instance store volumes", + "n": "Network and EBS optimized", + "e": "Extra storage or memory", + "z": "High performance", + "q": "Qualcomm inference accelerators", + "flex": "Flex instance", +} + + +def _annotate_instance_type(instance_type_id): + """Resolve instance type coding to human-friendly description. 
+ + Source: https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instance-types.html#instance-type-names + """ # noqa: E501 + kind = instance_type_id.split(".")[0] + # drop X TB suffix after instance family + if kind.startswith("u"): + logger.warning(f"Removing X TB reference from instance family: {kind}") + kind = re.sub(r"^u-([0-9]*)tb", "u", kind) + # drop suffixes for now after the dash, e.g. "Mac2-m2", "Mac2-m2pro" + if "-" in kind: + logger.warning(f"Truncating instance type after the dash: {kind}") + kind = kind.split("-")[0] + family, extras = re.split(r"[0-9]", kind) + generation = re.findall(r"[0-9]", kind)[0] + size = instance_type_id.split(".")[1] + + try: + text = _instance_families[family] + except KeyError as exc: + raise KeyError( + "Unknown instance family: " + family + " (e.g. " + instance_type_id + ")" + ) from exc + for k, v in _instance_suffixes.items(): + if k in extras: + text += " [" + v + "]" + text += " Gen" + generation + text += " " + size + + return text + + +def _get_storage_of_instance_type(instance_type, nvme=False): + """Get overall storage size and type (tupple) from instance details.""" + if "InstanceStorageInfo" not in instance_type: + return (0, None) + info = instance_type["InstanceStorageInfo"] + storage_size = info["TotalSizeInGB"] + storage_type = info["Disks"][0].get("Type").lower() + if storage_type == "ssd" and info.get("NvmeSupport", False): + storage_type = "nvme ssd" + return (storage_size, storage_type) + + +def _array_expand_by_count(array): + """Expand an array with its items Count field.""" + array = [[a] * a["Count"] for a in array] + return list(chain(*array)) + + +def _get_storages_of_instance_type(instance_type): + """Get individual storages as an array.""" + if "InstanceStorageInfo" not in instance_type: + return [] + info = instance_type["InstanceStorageInfo"] + + def to_storage(disk, nvme=False): + kind = disk.get("Type").lower() + if kind == "ssd" and nvme: + kind = "nvme ssd" + return 
Disk(size=disk["SizeInGB"], storage_type=kind) + + # replicate number of disks + disks = info["Disks"] + disks = _array_expand_by_count(disks) + return [to_storage(disk, nvme=info.get("NvmeSupport", False)) for disk in disks] + + +def _get_gpu_of_instance_type(instance_type): + """Get overall GPU count, memory and manufacturer/name.""" + if "GpuInfo" not in instance_type: + return (0, None, None) + info = instance_type["GpuInfo"] + memory = info["TotalGpuMemoryInMiB"] + + def mn(gpu): + return gpu["Manufacturer"] + " " + gpu["Name"] + + # iterate over each GPU + count = sum([gpu["Count"] for gpu in info["Gpus"]]) + names = ", ".join([mn(gpu) for gpu in info["Gpus"]]) + return (count, memory, names) + + +def _get_gpus_of_instance_type(instance_type): + """Get individual GPUs as an array.""" + if "GpuInfo" not in instance_type: + return [] + info = instance_type["GpuInfo"] + + def to_gpu(gpu): + return Gpu( + manufacturer=gpu["Manufacturer"], + name=gpu["Name"], + memory=gpu["MemoryInfo"]["SizeInMiB"], + ) + + # replicate number of disks + gpus = info["Gpus"] + gpus = _array_expand_by_count(gpus) + return [to_gpu(gpu) for gpu in gpus] + + +def _make_server_from_instance_type(instance_type, vendor): + """Create a SQLModel Server instance from AWS raw API response.""" + it = instance_type["InstanceType"] + vcpu_info = instance_type["VCpuInfo"] + cpu_info = instance_type["ProcessorInfo"] + gpu_info = _get_gpu_of_instance_type(instance_type) + storage_info = _get_storage_of_instance_type(instance_type) + network_card = instance_type["NetworkInfo"]["NetworkCards"][0] + # avoid duplicates + if it not in [s.id for s in vendor.servers]: + return Server( + id=it, + vendor=vendor, + name=it, + description=_annotate_instance_type(it), + vcpus=vcpu_info["DefaultVCpus"], + cpu_cores=vcpu_info["DefaultCores"], + cpu_speed=cpu_info.get("SustainedClockSpeedInGhz", None), + cpu_architecture=cpu_info["SupportedArchitectures"][0], + cpu_manufacturer=cpu_info.get("Manufacturer", None), 
+ memory=instance_type["MemoryInfo"]["SizeInMiB"], + gpu_count=gpu_info[0], + gpu_memory=gpu_info[1], + gpu_name=gpu_info[2], + gpus=_get_gpus_of_instance_type(instance_type), + storage_size=storage_info[0], + storage_type=storage_info[1], + storages=_get_storages_of_instance_type(instance_type), + network_speed=network_card["BaselineBandwidthInGbps"], + billable_unit="hour", + ) + + +def _list_instance_types_of_region(region, vendor): + """List all available instance types of an AWS region.""" + logger.debug(f"Looking up instance types in region {region}") + instance_types = _boto_describe_instance_types(region) + return [ + _make_server_from_instance_type(instance_type, vendor) + for instance_type in instance_types + ] + + +def _extract_ondemand_price(terms): + """Extract ondmand price and the currency from AWS Terms object.""" + ondemand_term = list(terms["OnDemand"].values())[0] + ondemand_pricing = list(ondemand_term["priceDimensions"].values())[0] + ondemand_pricing = ondemand_pricing["pricePerUnit"] + if "USD" in ondemand_pricing.keys(): + return (float(ondemand_pricing["USD"]), "USD") + # get the first currency if USD not found + return (float(list(ondemand_pricing.values())[0]), list(ondemand_pricing)[0]) + + +def _make_price_from_product(product, vendor): + attributes = product["product"]["attributes"] + location = attributes["location"] + location_type = attributes["locationType"] + instance_type = attributes["instanceType"] + try: + datacenter = [ + d for d in vendor.datacenters if location == d.name or location in d.aliases + ][0] + except IndexError: + logger.debug(f"No AWS region found for location: {location} [{location_type}]") + return + except Exception as exc: + raise exc + try: + server = [ + d for d in vendor.servers if d.vendor == vendor and d.id == instance_type + ][0] + except IndexError: + logger.debug(f"No server definition found for {instance_type} @ {location}") + return + except Exception as exc: + raise exc + price = 
_extract_ondemand_price(product["terms"]) + return ServerPrice( + vendor=vendor, + datacenter=datacenter, + server=server, + # TODO ingest other OSs + operating_system="Linux", + allocation="ondemand", + price=price[0], + currency=price[1], + duration=Duration.HOUR, + ) + + +# ############################################################################## +# Public methods to fetch data def get_datacenters(vendor, *args, **kwargs): @@ -395,7 +634,7 @@ def get_datacenters(vendor, *args, **kwargs): # look for undocumented (new) regions in AWS supported_regions = [d.id for d in datacenters] - regions = describe_regions() + regions = _boto_describe_regions() for region in regions: region_name = region["RegionName"] if "gov" in region_name: @@ -430,7 +669,7 @@ def get_zones(vendor, *args, **kwargs): datacenter=datacenter, vendor=vendor, ) - for zone in describe_availability_zones(datacenter.id) + for zone in _boto_describe_availability_zones(datacenter.id) ] for datacenter in vendor.datacenters if datacenter.status == "active" @@ -439,194 +678,7 @@ def get_zones(vendor, *args, **kwargs): return ChainMap(*zones) -instance_families = { - "a": "AWS Graviton", - "c": "Compute optimized", - "d": "Dense storage", - "dl": "Deep Learning", - "f": "FPGA", - "g": "Graphics intensive", - "h": "Cost-effective storage optimized with HDD", - "hpc": "High performance computing", - "i": "Storage optimized", - "im": "Storage optimized with a one to four ratio of vCPU to memory", - "is": "Storage optimized with a one to six ratio of vCPU to memory", - "inf": "AWS Inferentia", - "m": "General purpose", - "mac": "macOS", - "p": "GPU accelerated", - "r": "Memory optimized", - "t": "Burstable performance", - "trn": "AWS Trainium", - "u": "High memory", - "vt": "Video transcoding", - "x": "Memory intensive", - "z": "High frequency", -} - -instance_suffixes = { - # Processor families - "a": "AMD processors", - "g": "AWS Graviton processors", - "i": "Intel processors", - # Additional
capabilities - "d": "Instance store volumes", - "n": "Network and EBS optimized", - "e": "Extra storage or memory", - "z": "High performance", - "q": "Qualcomm inference accelerators", - "flex": "Flex instance", -} - - -def annotate_instance_type(instance_type_id): - """Resolve instance type coding to human-friendly description. - - Source: https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instance-types.html#instance-type-names - """ # noqa: E501 - kind = instance_type_id.split(".")[0] - # drop X TB suffix after instance family - if kind.startswith("u"): - logger.warning(f"Removing X TB reference from instance family: {kind}") - kind = re.sub(r"^u-([0-9]*)tb", "u", kind) - # drop suffixes for now after the dash, e.g. "Mac2-m2", "Mac2-m2pro" - if "-" in kind: - logger.warning(f"Truncating instance type after the dash: {kind}") - kind = kind.split("-")[0] - family, extras = re.split(r"[0-9]", kind) - generation = re.findall(r"[0-9]", kind)[0] - size = instance_type_id.split(".")[1] - - try: - text = instance_families[family] - except KeyError as exc: - raise KeyError( - "Unknown instance family: " + family + " (e.g. 
" + instance_type_id + ")" - ) from exc - for k, v in instance_suffixes.items(): - if k in extras: - text += " [" + v + "]" - text += " Gen" + generation - text += " " + size - - return text - - -def get_storage(instance_type, nvme=False): - """Get overall storage size and type (tupple) from instance details.""" - if "InstanceStorageInfo" not in instance_type: - return (0, None) - info = instance_type["InstanceStorageInfo"] - storage_size = info["TotalSizeInGB"] - storage_type = info["Disks"][0].get("Type").lower() - if storage_type == "ssd" and info.get("NvmeSupport", False): - storage_type = "nvme ssd" - return (storage_size, storage_type) - - -def array_expand_by_count(array): - """Expand an array with its items Count field.""" - array = [[a] * a["Count"] for a in array] - return list(chain(*array)) - - -def get_storages(instance_type): - """Get individual storages as an array.""" - if "InstanceStorageInfo" not in instance_type: - return [] - info = instance_type["InstanceStorageInfo"] - - def to_storage(disk, nvme=False): - kind = disk.get("Type").lower() - if kind == "ssd" and nvme: - kind = "nvme ssd" - return Disk(size=disk["SizeInGB"], storage_type=kind) - - # replicate number of disks - disks = info["Disks"] - disks = array_expand_by_count(disks) - return [to_storage(disk, nvme=info.get("NvmeSupport", False)) for disk in disks] - - -def get_gpu(instance_type): - """Get overall GPU count, memory and manufacturer/name.""" - if "GpuInfo" not in instance_type: - return (0, None, None) - info = instance_type["GpuInfo"] - memory = info["TotalGpuMemoryInMiB"] - - def mn(gpu): - return gpu["Manufacturer"] + " " + gpu["Name"] - - # iterate over each GPU - count = sum([gpu["Count"] for gpu in info["Gpus"]]) - names = ", ".join([mn(gpu) for gpu in info["Gpus"]]) - return (count, memory, names) - - -def get_gpus(instance_type): - """Get individual GPUs as an array.""" - if "GpuInfo" not in instance_type: - return [] - info = instance_type["GpuInfo"] - - def 
to_gpu(gpu): - return Gpu( - manufacturer=gpu["Manufacturer"], - name=gpu["Name"], - memory=gpu["MemoryInfo"]["SizeInMiB"], - ) - - # replicate number of disks - gpus = info["Gpus"] - gpus = array_expand_by_count(gpus) - return [to_gpu(gpu) for gpu in gpus] - - -def server_from_instance_type(instance_type, vendor): - """Create a SQLModel Server instance from AWS raw API response.""" - it = instance_type["InstanceType"] - vcpu_info = instance_type["VCpuInfo"] - cpu_info = instance_type["ProcessorInfo"] - gpu_info = get_gpu(instance_type) - storage_info = get_storage(instance_type) - network_card = instance_type["NetworkInfo"]["NetworkCards"][0] - # avoid duplicates - if it not in [s.id for s in vendor.servers]: - return Server( - id=it, - vendor=vendor, - name=it, - description=annotate_instance_type(it), - vcpus=vcpu_info["DefaultVCpus"], - cpu_cores=vcpu_info["DefaultCores"], - cpu_speed=cpu_info.get("SustainedClockSpeedInGhz", None), - cpu_architecture=cpu_info["SupportedArchitectures"][0], - cpu_manufacturer=cpu_info.get("Manufacturer", None), - memory=instance_type["MemoryInfo"]["SizeInMiB"], - gpu_count=gpu_info[0], - gpu_memory=gpu_info[1], - gpu_name=gpu_info[2], - gpus=get_gpus(instance_type), - storage_size=storage_info[0], - storage_type=storage_info[1], - storages=get_storages(instance_type), - network_speed=network_card["BaselineBandwidthInGbps"], - billable_unit="hour", - ) - - -def instance_types_of_region(region, vendor): - """List all available instance types of an AWS region.""" - logger.debug(f"Looking up instance types in region {region}") - instance_types = describe_instance_types(region) - return [ - server_from_instance_type(instance_type, vendor) - for instance_type in instance_types - ] - - -def get_instance_types(vendor, *args, **kwargs): +def get_servers(vendor): # TODO drop this in favor of pricing.get_products, as it has info e.g. on instanceFamily # although other fields are messier (e.g. 
extract memory from string) regions = [ @@ -635,67 +687,21 @@ def get_instance_types(vendor, *args, **kwargs): if datacenter.status == "active" ] # might be instance types specific to a few or even a single region - instance_types = [instance_types_of_region(region, vendor) for region in regions] + instance_types = [ + _list_instance_types_of_region(region, vendor) for region in regions + ] return list(chain(*instance_types)) -def extract_ondemand_price(terms): - """Extract ondmand price and the currency from AWS Terms object.""" - ondemand_term = list(terms["OnDemand"].values())[0] - ondemand_pricing = list(ondemand_term["priceDimensions"].values())[0] - ondemand_pricing = ondemand_pricing["pricePerUnit"] - if "USD" in ondemand_pricing.keys(): - return (float(ondemand_pricing["USD"]), "USD") - # get the first currency if USD not found - return (float(list(ondemand_pricing.values())[0]), list(ondemand_pricing)[0]) - - -def price_from_product(product, vendor): - attributes = product["product"]["attributes"] - location = attributes["location"] - location_type = attributes["locationType"] - instance_type = attributes["instanceType"] - try: - datacenter = [ - d for d in vendor.datacenters if location == d.name or location in d.aliases - ][0] - except IndexError: - logger.debug(f"No AWS region found for location: {location} [{location_type}]") - return - except Exception as exc: - raise exc - try: - server = [ - d for d in vendor.servers if d.vendor == vendor and d.id == instance_type - ][0] - except IndexError: - logger.debug(f"No server definition found for {instance_type} @ {location}") - return - except Exception as exc: - raise exc - price = extract_ondemand_price(product["terms"]) - return ServerPrice( - vendor=vendor, - datacenter=datacenter, - server=server, - # TODO ingest other OSs - operating_system="Linux", - allocation="ondemand", - price=price[0], - currency=price[1], - duration=Duration.HOUR, - ) - - def get_prices(vendor, *args, **kwargs): - products = 
get_products() + products = _boto_get_products() logger.debug(f"Found {len(products)} products") - # return [price_from_product(product, vendor) for product in products] for product in products: # drop Gov regions if "GovCloud" not in product["product"]["attributes"]["location"]: - price_from_product(product, vendor) + _make_price_from_product(product, vendor) + - # TODO store raw response - # TODO reserved pricing options - might decide not to, as not in scope? - # TODO spot prices +# TODO store raw response +# TODO reserved pricing options - might decide not to, as not in scope? +# TODO spot prices From a949e1a4670072ddf2303cfcf398bfe7995662d3 Mon Sep 17 00:00:00 2001 From: "Gergely Daroczi (@daroczig)" Date: Tue, 20 Feb 2024 15:02:24 +0100 Subject: [PATCH 02/33] doc how to add a new vendor --- src/sc_crawler/vendors/README.md | 64 ++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 src/sc_crawler/vendors/README.md diff --git a/src/sc_crawler/vendors/README.md b/src/sc_crawler/vendors/README.md new file mode 100644 index 00000000..5fc1c110 --- /dev/null +++ b/src/sc_crawler/vendors/README.md @@ -0,0 +1,64 @@ +## Vendor-specific crawler tools + +Each file in this folder provides the required helpers for a given vendor, named as the identifier of the vendor. +For example, `aws.py` provides functions to be used by its `Vendor` instance, called `aws`. 
+ +Each file should provide the below functions: + +- `get_compliance_frameworks`: define `VendorComplianceLink` instances +- `get_datacenters`: define `Datacenter` instances +- `get_zones`: define `Zone` instances +- `get_servers`: define `Server` instances +- `get_server_prices`: define `ServerPrice` instances +- `get_storage_prices`: define `StoragePrice` instances +- `get_traffic_prices`: define `TrafficPrice` instances +- `get_ipv4_prices`: define `Ipv4Price` instances + +Each function will be picked up as `Vendor` instance methods, so each function should take a single argument, that is the `Vendor` instance. No need to return the objects -- it's enough to define the above-mentioned instances. + +There are also a `get_prices` and `get_all` instance method defined for each `Vendor`, which wrappers call the pricing-related or all the above helpers in the above-listed order. + +If a helper is not needed (e.g. another helper already provides its output, or there are no spot prices), it is still required, but can return early, e.g. if `Zone` objects were populated by `get_datacenters` already, do something like: + +```python +def get_zones(self): + """Zones were already provided in get_datacenters.""" + pass +``` + +Other functions and variables must be prefixed with an underscore to suggest those are internal tools. 
+ +## Template file for new vendors + +```python +def get_compliance_frameworks(vendor): + pass + + +def get_datacenters(vendor): + pass + + +def get_zones(vendor): + pass + + +def get_servers(vendor): + pass + + +def get_server_prices(vendor): + pass + + +def get_storage_prices(vendor): + pass + + +def get_traffic_prices(vendor): + pass + + +def get_ipv4_prices(vendor): + pass +``` From 2da612453da077b57c49f7763de500b1fd67643c Mon Sep 17 00:00:00 2001 From: "Gergely Daroczi (@daroczig)" Date: Tue, 20 Feb 2024 15:04:19 +0100 Subject: [PATCH 03/33] move compliance framework <> vendor constant mapping under vendor methods --- src/sc_crawler/schemas.py | 7 ++++++- src/sc_crawler/vendors/aws.py | 20 ++++++++++++++++++-- src/sc_crawler/vendors/vendors.py | 15 ++------------- 3 files changed, 26 insertions(+), 16 deletions(-) diff --git a/src/sc_crawler/schemas.py b/src/sc_crawler/schemas.py index d3207f69..0cad9c6c 100644 --- a/src/sc_crawler/schemas.py +++ b/src/sc_crawler/schemas.py @@ -221,12 +221,16 @@ def __init__(self, **kwargs): except Exception as exc: raise NotImplementedError("Unsupported vendor") from exc + def get_compliance_frameworks(self): + """Get compliance frameworks of the vendor.""" + return self._methods.get_compliance_frameworks(self) + def get_datacenters(self): """Get datacenters of the vendor.""" return self._methods.get_datacenters(self) def get_zones(self): - """Get zones of the vendor from its datacenters.""" + """Get zones of the vendor in its datacenters.""" return self._methods.get_zones(self) def get_servers(self): @@ -236,6 +240,7 @@ def get_prices(self): return self._methods.get_prices(self) def get_all(self): + self.get_compliance_frameworks() self.get_datacenters() self.get_zones() self.get_servers() diff --git a/src/sc_crawler/vendors/aws.py b/src/sc_crawler/vendors/aws.py index cd863b88..3a25581b 100644 --- a/src/sc_crawler/vendors/aws.py +++ b/src/sc_crawler/vendors/aws.py @@ -8,8 +8,17 @@ from cachier import cachier, 
set_default_params from ..logger import logger -from ..lookup import countries -from ..schemas import Datacenter, Disk, Duration, Gpu, Server, ServerPrice, Zone +from ..lookup import countries, compliance_frameworks +from ..schemas import ( + VendorComplianceLink, + Datacenter, + Disk, + Duration, + Gpu, + Server, + ServerPrice, + Zone, +) # disable caching by default set_default_params(caching_enabled=False, stale_after=timedelta(days=1)) @@ -333,6 +342,13 @@ def _make_price_from_product(product, vendor): # Public methods to fetch data +def get_compliance_frameworks(vendor): + for cf in ["hipaa", "soc2t2"]: + VendorComplianceLink( + vendor=vendor, compliance_framework=compliance_frameworks[cf] + ) + + def get_datacenters(vendor, *args, **kwargs): """List all available AWS datacenters. diff --git a/src/sc_crawler/vendors/vendors.py b/src/sc_crawler/vendors/vendors.py index 80285a0b..45cc794f 100644 --- a/src/sc_crawler/vendors/vendors.py +++ b/src/sc_crawler/vendors/vendors.py @@ -1,10 +1,5 @@ -from ..lookup import compliance_frameworks, countries -from ..schemas import Vendor, VendorComplianceLink - - -def get_compliance_frameworks(framework_ids): - return [v for k, v in compliance_frameworks.items() if k in framework_ids] - +from ..lookup import countries +from ..schemas import Vendor aws = Vendor( id="aws", @@ -19,9 +14,6 @@ def get_compliance_frameworks(framework_ids): status_page="https://health.aws.amazon.com/health/status", ) -for cf in ["hipaa", "soc2t2"]: - VendorComplianceLink(vendor=aws, compliance_framework=compliance_frameworks[cf]) - gcp = Vendor( id="gcp", name="Google Cloud Platform", @@ -34,6 +26,3 @@ def get_compliance_frameworks(framework_ids): founding_year=2008, status_page="https://status.cloud.google.com/", ) - -for cf in ["hipaa", "soc2t2"]: - VendorComplianceLink(vendor=gcp, compliance_framework=compliance_frameworks[cf]) From d7b3b5153a6caff4827dd730db582a8249762a0b Mon Sep 17 00:00:00 2001 From: "Gergely Daroczi (@daroczig)" Date: Tue, 20 
Feb 2024 15:07:55 +0100 Subject: [PATCH 04/33] def all price fns/methods --- src/sc_crawler/schemas.py | 17 ++++++++++++++++- src/sc_crawler/vendors/aws.py | 14 +++++++++++++- 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/src/sc_crawler/schemas.py b/src/sc_crawler/schemas.py index 0cad9c6c..e39f2587 100644 --- a/src/sc_crawler/schemas.py +++ b/src/sc_crawler/schemas.py @@ -236,8 +236,23 @@ def get_zones(self): def get_servers(self): return self._methods.get_servers(self) + def get_server_prices(self): + return self._methods.get_server_prices(self) + + def get_storage_prices(self): + return self._methods.get_storage_prices(self) + + def get_traffic_prices(self): + return self._methods.get_traffic_prices(self) + + def get_ipv4_prices(self): + return self._methods.get_ipv4_prices(self) + def get_prices(self): - return self._methods.get_prices(self) + self.get_server_prices() + self.get_storage_prices() + self.get_traffic_prices() + self.get_ipv4_prices() def get_all(self): self.get_compliance_frameworks() diff --git a/src/sc_crawler/vendors/aws.py b/src/sc_crawler/vendors/aws.py index 3a25581b..b6fde09b 100644 --- a/src/sc_crawler/vendors/aws.py +++ b/src/sc_crawler/vendors/aws.py @@ -709,7 +709,7 @@ def get_servers(vendor): return list(chain(*instance_types)) -def get_prices(vendor, *args, **kwargs): +def get_server_prices(vendor): products = _boto_get_products() logger.debug(f"Found {len(products)} products") for product in products: @@ -718,6 +718,18 @@ def get_prices(vendor, *args, **kwargs): _make_price_from_product(product, vendor) +def get_storage_prices(vendor): + pass + + +def get_traffic_prices(vendor): + pass + + +def get_ipv4_prices(vendor): + pass + + # TODO store raw response # TODO reserved pricing options - might decide not to, as not in scope? 
# TODO spot prices From 70a035637f10d62dddcb0ea7339414994695778c Mon Sep 17 00:00:00 2001 From: "Gergely Daroczi (@daroczig)" Date: Tue, 20 Feb 2024 15:09:12 +0100 Subject: [PATCH 05/33] standardize fn/method input and return value --- src/sc_crawler/schemas.py | 17 ++++++++--------- src/sc_crawler/vendors/aws.py | 4 ++-- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/src/sc_crawler/schemas.py b/src/sc_crawler/schemas.py index e39f2587..d272b017 100644 --- a/src/sc_crawler/schemas.py +++ b/src/sc_crawler/schemas.py @@ -223,30 +223,30 @@ def __init__(self, **kwargs): def get_compliance_frameworks(self): """Get compliance frameworks of the vendor.""" - return self._methods.get_compliance_frameworks(self) + self._methods.get_compliance_frameworks(self) def get_datacenters(self): """Get datacenters of the vendor.""" - return self._methods.get_datacenters(self) + self._methods.get_datacenters(self) def get_zones(self): """Get zones of the vendor in its datacenters.""" - return self._methods.get_zones(self) + self._methods.get_zones(self) def get_servers(self): - return self._methods.get_servers(self) + self._methods.get_servers(self) def get_server_prices(self): - return self._methods.get_server_prices(self) + self._methods.get_server_prices(self) def get_storage_prices(self): - return self._methods.get_storage_prices(self) + self._methods.get_storage_prices(self) def get_traffic_prices(self): - return self._methods.get_traffic_prices(self) + self._methods.get_traffic_prices(self) def get_ipv4_prices(self): - return self._methods.get_ipv4_prices(self) + self._methods.get_ipv4_prices(self) def get_prices(self): self.get_server_prices() @@ -260,7 +260,6 @@ def get_all(self): self.get_zones() self.get_servers() self.get_prices() - return class Datacenter(ScModel, table=True): diff --git a/src/sc_crawler/vendors/aws.py b/src/sc_crawler/vendors/aws.py index b6fde09b..3487c2d7 100644 --- a/src/sc_crawler/vendors/aws.py +++ b/src/sc_crawler/vendors/aws.py @@ 
-349,7 +349,7 @@ def get_compliance_frameworks(vendor): ) -def get_datacenters(vendor, *args, **kwargs): +def get_datacenters(vendor): """List all available AWS datacenters. Some data sources are not available from APIs, and were collected manually: @@ -675,7 +675,7 @@ def get_datacenters(vendor): return datacenters -def get_zones(vendor, *args, **kwargs): +def get_zones(vendor): """List all available AWS availability zones.""" zones = [ [ From 539d8c54e0ac1e609ada670980ecdf4d08b16e1b Mon Sep 17 00:00:00 2001 From: "Gergely Daroczi (@daroczig)" Date: Tue, 20 Feb 2024 15:26:23 +0100 Subject: [PATCH 06/33] new helper for spot server prices --- src/sc_crawler/schemas.py | 13 ++++++++++--- src/sc_crawler/vendors/README.md | 25 +++++++++++++++---------- src/sc_crawler/vendors/aws.py | 4 ++++ 3 files changed, 29 insertions(+), 13 deletions(-) diff --git a/src/sc_crawler/schemas.py b/src/sc_crawler/schemas.py index d272b017..85e3d889 100644 --- a/src/sc_crawler/schemas.py +++ b/src/sc_crawler/schemas.py @@ -222,23 +222,29 @@ def __init__(self, **kwargs): raise NotImplementedError("Unsupported vendor") from exc def get_compliance_frameworks(self): - """Get compliance frameworks of the vendor.""" + """Get the vendor's all compliance frameworks.""" self._methods.get_compliance_frameworks(self) def get_datacenters(self): - """Get datacenters of the vendor.""" + """Get the vendor's all datacenters.""" self._methods.get_datacenters(self) def get_zones(self): - """Get zones of the vendor in its datacenters.""" + """Get all the zones in the vendor's datacenters.""" self._methods.get_zones(self) def get_servers(self): + """Get the vendor's all server types.""" self._methods.get_servers(self) def get_server_prices(self): + """Get the current standard/ondemand/reserved prices of all server types.""" self._methods.get_server_prices(self) + def get_server_prices_spot(self): + """Get the current spot prices of all server types.""" +
self._methods.get_server_prices_spot(self) + def get_storage_prices(self): self._methods.get_storage_prices(self) @@ -250,6 +256,7 @@ def get_ipv4_prices(self): def get_prices(self): self.get_server_prices() + self.get_server_prices_spot() self.get_storage_prices() self.get_traffic_prices() self.get_ipv4_prices() diff --git a/src/sc_crawler/vendors/README.md b/src/sc_crawler/vendors/README.md index 5fc1c110..0b779786 100644 --- a/src/sc_crawler/vendors/README.md +++ b/src/sc_crawler/vendors/README.md @@ -5,16 +5,17 @@ For example, `aws.py` provides functions to be used by its `Vendor` instance, ca Each file should provide the below functions: -- `get_compliance_frameworks`: define `VendorComplianceLink` instances -- `get_datacenters`: define `Datacenter` instances -- `get_zones`: define `Zone` instances -- `get_servers`: define `Server` instances -- `get_server_prices`: define `ServerPrice` instances -- `get_storage_prices`: define `StoragePrice` instances -- `get_traffic_prices`: define `TrafficPrice` instances -- `get_ipv4_prices`: define `Ipv4Price` instances - -Each function will be picked up as `Vendor` instance methods, so each function should take a single argument, that is the `Vendor` instance. No need to return the objects -- it's enough to define the above-mentioned instances. +- `get_compliance_frameworks`: Define `VendorComplianceLink` instances to describe which frameworks the vendor complies with. Optionally include references in the `comment` field. +- `get_datacenters`: Define `Datacenter` instances with location, energy source etc for each region/datacenter the vendor has. +- `get_zones`: Define a `Zone` instance for each availability zone of the vendor in each datacenter. +- `get_servers`: Define `Server` instances for the vendor's server/instance types. +- `get_server_prices`: Define `ServerPrice` instances for the standard/ondemand and optionally also for the reserved pricing of the instance types, optionally per datacenter and zone. 
If a price is the same in multiple zones/datacenters, fill those columns with an empty string (these primary keys cannot be `NULL`). +- `get_server_prices_spot`: Similar to the above, define `ServerPrice` instances but the `allocation` field set to `Allocation.SPOT`. Very likely to see different spot prices per datacenter/zone. +- `get_storage_prices`: Define `StoragePrice` instances to describe the available storage options that can be attached to the servers. +- `get_traffic_prices`: Define `TrafficPrice` instances to describe the pricing of ingress/egress traffic. +- `get_ipv4_prices`: Define `Ipv4Price` instances on the price of an IPv4 address. + +Each function will be picked up as the related `Vendor` instance's instance methods, so each function should take a single argument, that is the `Vendor` instance. No need to return the objects -- it's enough to define the above-mentioned instances. There are also a `get_prices` and `get_all` instance method defined for each `Vendor`, which wrappers call the pricing-related or all the above helpers in the above-listed order. 
@@ -51,6 +52,10 @@ def get_server_prices(vendor): pass +def get_server_prices_spot(vendor): + pass + + def get_storage_prices(vendor): pass diff --git a/src/sc_crawler/vendors/aws.py b/src/sc_crawler/vendors/aws.py index 3487c2d7..abe90548 100644 --- a/src/sc_crawler/vendors/aws.py +++ b/src/sc_crawler/vendors/aws.py @@ -718,6 +718,10 @@ def get_server_prices(vendor): _make_price_from_product(product, vendor) +def get_server_prices_spot(vendor): + pass + + def get_storage_prices(vendor): pass From 0853a94e4cd7cfdd0fe73658a5825dc7eb874088 Mon Sep 17 00:00:00 2001 From: "Gergely Daroczi (@daroczig)" Date: Tue, 20 Feb 2024 15:29:40 +0100 Subject: [PATCH 07/33] hardcode ipv4 price --- src/sc_crawler/vendors/aws.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/sc_crawler/vendors/aws.py b/src/sc_crawler/vendors/aws.py index abe90548..b1c388a4 100644 --- a/src/sc_crawler/vendors/aws.py +++ b/src/sc_crawler/vendors/aws.py @@ -15,6 +15,7 @@ Disk, Duration, Gpu, + Ipv4Price, Server, ServerPrice, Zone, @@ -731,7 +732,7 @@ def get_traffic_prices(vendor): def get_ipv4_prices(vendor): - pass + Ipv4Price(vendor=vendor, price=0.005, duration=Duration.HOUR) # TODO store raw response From 3e5843dfb7938bbc3f9453b27f88fbc5f5b0db4c Mon Sep 17 00:00:00 2001 From: "Gergely Daroczi (@daroczig)" Date: Tue, 20 Feb 2024 16:48:51 +0100 Subject: [PATCH 08/33] drop zone from storage/traffic/ipv4 prices, but require datacenter --- src/sc_crawler/schemas.py | 32 ++++++++++++++++---------------- src/sc_crawler/vendors/README.md | 2 +- src/sc_crawler/vendors/aws.py | 3 ++- 3 files changed, 19 insertions(+), 18 deletions(-) diff --git a/src/sc_crawler/schemas.py b/src/sc_crawler/schemas.py index 85e3d889..e45034e3 100644 --- a/src/sc_crawler/schemas.py +++ b/src/sc_crawler/schemas.py @@ -308,9 +308,6 @@ class Zone(ScModel, table=True): datacenter: Datacenter = Relationship(back_populates="zones") vendor: Vendor = Relationship(back_populates="zones") server_prices: 
List["ServerPrice"] = Relationship(back_populates="zone") - traffic_prices: List["TrafficPrice"] = Relationship(back_populates="zone") - ipv4_prices: List["Ipv4Price"] = Relationship(back_populates="zone") - storage_prices: List["StoragePrice"] = Relationship(back_populates="zone") class StorageType(str, Enum): @@ -500,15 +497,16 @@ class PriceTier(Json): # helper classes to inherit for most commonly used fields -class HasVendor(ScModel): +class HasVendorPK(ScModel): vendor_id: str = Field(foreign_key="vendor.id", primary_key=True) -class HasVendorOptionalDatacenterZone(HasVendor): - datacenter_id: str = Field( - default="", foreign_key="datacenter.id", primary_key=True - ) - zone_id: str = Field(default="", foreign_key="zone.id", primary_key=True) +class HasDatacenterPK(ScModel): + datacenter_id: str = Field(foreign_key="datacenter.id", primary_key=True) + + +class HasZonePK(ScModel): + zone_id: str = Field(foreign_key="zone.id", primary_key=True) class HasServer(ScModel): @@ -540,7 +538,12 @@ class ServerPriceExtraFields(ScModel): class ServerPriceBase( - HasPriceFields, ServerPriceExtraFields, HasServer, HasVendorOptionalDatacenterZone + HasPriceFields, + ServerPriceExtraFields, + HasServer, + HasZonePK, + HasDatacenterPK, + HasVendorPK, ): pass @@ -552,36 +555,33 @@ class ServerPrice(ServerPriceBase, table=True): server: Server = Relationship(back_populates="prices") -class StoragePriceBase(HasPriceFields, HasStorage, HasVendorOptionalDatacenterZone): +class StoragePriceBase(HasPriceFields, HasStorage, HasDatacenterPK, HasVendorPK): pass class StoragePrice(StoragePriceBase, table=True): vendor: Vendor = Relationship(back_populates="storage_prices") datacenter: Datacenter = Relationship(back_populates="storage_prices") - zone: Zone = Relationship(back_populates="storage_prices") storage: Storage = Relationship(back_populates="prices") -class TrafficPriceBase(HasPriceFields, HasTraffic, HasVendorOptionalDatacenterZone): +class TrafficPriceBase(HasPriceFields, 
HasTraffic, HasDatacenterPK, HasVendorPK): pass class TrafficPrice(TrafficPriceBase, table=True): vendor: Vendor = Relationship(back_populates="traffic_prices") datacenter: Datacenter = Relationship(back_populates="traffic_prices") - zone: Zone = Relationship(back_populates="traffic_prices") traffic: Traffic = Relationship(back_populates="prices") -class Ipv4PriceBase(HasPriceFields, HasVendorOptionalDatacenterZone): +class Ipv4PriceBase(HasPriceFields, HasDatacenterPK, HasVendorPK): pass class Ipv4Price(Ipv4PriceBase, table=True): vendor: Vendor = Relationship(back_populates="ipv4_prices") datacenter: Datacenter = Relationship(back_populates="ipv4_prices") - zone: Zone = Relationship(back_populates="ipv4_prices") VendorComplianceLink.model_rebuild() diff --git a/src/sc_crawler/vendors/README.md b/src/sc_crawler/vendors/README.md index 0b779786..1e9da025 100644 --- a/src/sc_crawler/vendors/README.md +++ b/src/sc_crawler/vendors/README.md @@ -9,7 +9,7 @@ Each file should provide the below functions: - `get_datacenters`: Define `Datacenter` instances with location, energy source etc for each region/datacenter the vendor has. - `get_zones`: Define a `Zone` instance for each availability zone of the vendor in each datacenter. - `get_servers`: Define `Server` instances for the vendor's server/instance types. -- `get_server_prices`: Define `ServerPrice` instances for the standard/ondemand and optionally also for the reserved pricing of the instance types, optionally per datacenter and zone. If a price is the same in multiple zones/datacenters, fill those columns with an empty string (these primary keys cannot be `NULL`). +- `get_server_prices`: Define the `ServerPrice` instances for the standard/ondemand (or optionally also for the reserved) pricing of the instance types per datacenter and zone. - `get_server_prices_spot`: Similar to the above, define `ServerPrice` instances but the `allocation` field set to `Allocation.SPOT`. 
Very likely to see different spot prices per datacenter/zone. - `get_storage_prices`: Define `StoragePrice` instances to describe the available storage options that can be attached to the servers. - `get_traffic_prices`: Define `TrafficPrice` instances to describe the pricing of ingress/egress traffic. diff --git a/src/sc_crawler/vendors/aws.py b/src/sc_crawler/vendors/aws.py index b1c388a4..82586cab 100644 --- a/src/sc_crawler/vendors/aws.py +++ b/src/sc_crawler/vendors/aws.py @@ -732,7 +732,8 @@ def get_traffic_prices(vendor): def get_ipv4_prices(vendor): - Ipv4Price(vendor=vendor, price=0.005, duration=Duration.HOUR) + for zone in vendor.zones: + Ipv4Price(vendor=vendor, price=0.005, duration=Duration.HOUR, zone=zone) # TODO store raw response From 5aa3299e795066097c7ef92ff081c2b6ac51c5c7 Mon Sep 17 00:00:00 2001 From: "Gergely Daroczi (@daroczig)" Date: Tue, 20 Feb 2024 16:49:51 +0100 Subject: [PATCH 09/33] include example imports --- src/sc_crawler/vendors/README.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/sc_crawler/vendors/README.md b/src/sc_crawler/vendors/README.md index 1e9da025..f0a92d52 100644 --- a/src/sc_crawler/vendors/README.md +++ b/src/sc_crawler/vendors/README.md @@ -32,6 +32,19 @@ Other functions and variables must be prefixed with an underscore to suggest tho ## Template file for new vendors ```python +from ..schemas import ( + VendorComplianceLink, + Datacenter, + Disk, + Duration, + Gpu, + Ipv4Price, + Server, + ServerPrice, + Zone, +) + + def get_compliance_frameworks(vendor): pass From 9d6c87f859029940b91b16133bfc14fb80347e1e Mon Sep 17 00:00:00 2001 From: "Gergely Daroczi (@daroczig)" Date: Tue, 20 Feb 2024 16:50:15 +0100 Subject: [PATCH 10/33] fix set zone for server prices, datacenter for ipv4 prices --- src/sc_crawler/vendors/aws.py | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/src/sc_crawler/vendors/aws.py b/src/sc_crawler/vendors/aws.py index 
82586cab..3fe20187 100644 --- a/src/sc_crawler/vendors/aws.py +++ b/src/sc_crawler/vendors/aws.py @@ -326,17 +326,19 @@ def _make_price_from_product(product, vendor): except Exception as exc: raise exc price = _extract_ondemand_price(product["terms"]) - return ServerPrice( - vendor=vendor, - datacenter=datacenter, - server=server, - # TODO ingest other OSs - operating_system="Linux", - allocation="ondemand", - price=price[0] * 100, - currency=price[1], - duration=Duration.HOUR, - ) + for zone in datacenter.zones: + ServerPrice( + vendor=vendor, + datacenter=datacenter, + zone=zone, + server=server, + # TODO ingest other OSs + operating_system="Linux", + allocation="ondemand", + price=price[0] * 100, + currency=price[1], + duration=Duration.HOUR, + ) # ############################################################################## @@ -732,8 +734,10 @@ def get_traffic_prices(vendor): def get_ipv4_prices(vendor): - for zone in vendor.zones: - Ipv4Price(vendor=vendor, price=0.005, duration=Duration.HOUR, zone=zone) + for datacenter in vendor.datacenters: + Ipv4Price( + vendor=vendor, price=0.005, duration=Duration.HOUR, datacenter=datacenter + ) # TODO store raw response From b0ecb40175b0257b3b5abb762871de07ea7d32c0 Mon Sep 17 00:00:00 2001 From: "Gergely Daroczi (@daroczig)" Date: Tue, 20 Feb 2024 20:35:51 +0100 Subject: [PATCH 11/33] generalize _boto_get_products --- src/sc_crawler/schemas.py | 2 +- src/sc_crawler/utils.py | 23 ++++++++++++++----- src/sc_crawler/vendors/aws.py | 42 ++++++++++++++++++++++------------- 3 files changed, 45 insertions(+), 22 deletions(-) diff --git a/src/sc_crawler/schemas.py b/src/sc_crawler/schemas.py index e45034e3..cb689610 100644 --- a/src/sc_crawler/schemas.py +++ b/src/sc_crawler/schemas.py @@ -242,7 +242,7 @@ def get_server_prices(self): self._methods.get_server_prices(self) def get_server_prices_spot(self): - """Get the current sport prices of all server types.""" + """Get the current spot prices of all server types.""" 
self._methods.get_server_prices_spot(self) def get_storage_prices(self): diff --git a/src/sc_crawler/utils.py b/src/sc_crawler/utils.py index 4a69b6d9..9949a0fd 100644 --- a/src/sc_crawler/utils.py +++ b/src/sc_crawler/utils.py @@ -8,6 +8,22 @@ from .schemas import tables +def jsoned_hash(*args, **kwargs): + """Hash the JSON-dump of all positional and keyword arguments. + + Examples: + >>> jsoned_hash(42) + '0211c62419aece235ba19582d3cf7fd8e25f837c' + >>> jsoned_hash(everything=42) + '8f8a7fcade8cb632b856f46fc64c1725ee387617' + >>> jsoned_hash(42, 42, everything=42) + 'f04a77f000d85929b13de04b436c60a1272dfbf5' + """ + return sha1( + dumps({"args": args, "kwargs": kwargs}, sort_keys=True).encode() + ).hexdigest() + + class HashLevels(Enum): DATABASE = "database" TABLE = "table" @@ -38,12 +54,9 @@ def hash_database( } if level == HashLevels.TABLE: - hashes = { - k: sha1(dumps(v, sort_keys=True).encode()).hexdigest() - for k, v in hashes.items() - } + hashes = {k: jsoned_hash(v) for k, v in hashes.items()} if level == HashLevels.DATABASE: - hashes = sha1(dumps(hashes, sort_keys=True).encode()).hexdigest() + hashes = jsoned_hash(hashes) return hashes diff --git a/src/sc_crawler/vendors/aws.py b/src/sc_crawler/vendors/aws.py index 3fe20187..2cc931d2 100644 --- a/src/sc_crawler/vendors/aws.py +++ b/src/sc_crawler/vendors/aws.py @@ -20,6 +20,7 @@ ServerPrice, Zone, ) +from ..utils import jsoned_hash # disable caching by default set_default_params(caching_enabled=False, stale_after=timedelta(days=1)) @@ -71,29 +72,25 @@ def _boto_price_list(region): return price_list_url -@cachier() -def _boto_get_products(): +@cachier(hash_func=jsoned_hash) +def _boto_get_products(service_code: str, filters: dict): + """Get products from AWS with auto-paging. + + Args: + service_code: AWS ServiceCode, e.g. 
`AmazonEC2` + filters: `dict` of key/value pairs for `TERM_MATCH` filters + """ # pricing API is only available in a few regions client = boto3.client("pricing", region_name="us-east-1") - filters = { - # TODO ingest win, mac etc others - "operatingSystem": "Linux", - "preInstalledSw": "NA", - "licenseModel": "No License required", - "locationType": "AWS Region", - "capacitystatus": "Used", - "marketoption": "OnDemand", - # TODO dedicated options? - "tenancy": "Shared", - } - filters = [ + + matched_filters = [ {"Type": "TERM_MATCH", "Field": k, "Value": v} for k, v in filters.items() ] paginator = client.get_paginator("get_products") # return actual list instead of an iterator to be able to cache on disk products = [] - for page in paginator.paginate(ServiceCode="AmazonEC2", Filters=filters): + for page in paginator.paginate(ServiceCode=service_code, Filters=matched_filters): for product_json in page["PriceList"]: product = json.loads(product_json) products.append(product) @@ -713,7 +710,20 @@ def get_servers(vendor): def get_server_prices(vendor): - products = _boto_get_products() + products = _boto_get_products( + service_code="AmazonEC2", + filters={ + # TODO ingest win, mac etc others + "operatingSystem": "Linux", + "preInstalledSw": "NA", + "licenseModel": "No License required", + "locationType": "AWS Region", + "capacitystatus": "Used", + "marketoption": "OnDemand", + # TODO dedicated options? 
+ "tenancy": "Shared", + }, + ) logger.debug(f"Found {len(products)} products") for product in products: # drop Gov regions From 5e936103d8c23ea9913007b485aba7e9b79b9f3b Mon Sep 17 00:00:00 2001 From: "Gergely Daroczi (@daroczig)" Date: Tue, 20 Feb 2024 22:21:31 +0100 Subject: [PATCH 12/33] get IPv4 pricing from AWS API --- src/sc_crawler/schemas.py | 10 +++---- src/sc_crawler/vendors/aws.py | 55 ++++++++++++++++++++++++++--------- 2 files changed, 47 insertions(+), 18 deletions(-) diff --git a/src/sc_crawler/schemas.py b/src/sc_crawler/schemas.py index cb689610..7cd34500 100644 --- a/src/sc_crawler/schemas.py +++ b/src/sc_crawler/schemas.py @@ -322,13 +322,12 @@ class Storage(ScModel, table=True): vendor_id: str = Field(foreign_key="vendor.id", primary_key=True) name: str description: Optional[str] - size: int = 0 + size: int = 0 # GiB storage_type: StorageType max_iops: Optional[int] = None max_throughput: Optional[int] = None # MiB/s min_size: Optional[int] = None # GiB max_size: Optional[int] = None # GiB - billable_unit: str = "GiB" status: Status = Status.ACTIVE vendor: Vendor = Relationship(back_populates="storages") @@ -346,7 +345,6 @@ class Traffic(ScModel, table=True): name: str description: Optional[str] direction: TrafficDirection - billable_unit: str = "GB" status: Status = Status.ACTIVE vendor: Vendor = Relationship(back_populates="traffics") @@ -482,10 +480,12 @@ class Allocation(str, Enum): SPOT = "spot" -class Duration(str, Enum): +class PriceUnit(str, Enum): YEAR = "year" MONTH = "month" HOUR = "hour" + GIB = "GiB" + GB = "GB" class PriceTier(Json): @@ -522,6 +522,7 @@ class HasTraffic(ScModel): class HasPriceFields(ScModel): + unit: PriceUnit # set to max price if tiered price: float # e.g. 
setup fee for dedicated servers, @@ -529,7 +530,6 @@ class HasPriceFields(ScModel): price_upfront: float = 0 price_tiered: List[PriceTier] = Field(default=[], sa_type=JSON) currency: str = "USD" - duration: Duration class ServerPriceExtraFields(ScModel): diff --git a/src/sc_crawler/vendors/aws.py b/src/sc_crawler/vendors/aws.py index 2cc931d2..2b077fe2 100644 --- a/src/sc_crawler/vendors/aws.py +++ b/src/sc_crawler/vendors/aws.py @@ -13,7 +13,7 @@ VendorComplianceLink, Datacenter, Disk, - Duration, + PriceUnit, Gpu, Ipv4Price, Server, @@ -95,6 +95,7 @@ def _boto_get_products(service_code: str, filters: dict): product = json.loads(product_json) products.append(product) + logger.debug(f"Found {len(products)} {service_code} products") return products @@ -299,20 +300,33 @@ def _extract_ondemand_price(terms): return (float(list(ondemand_pricing.values())[0]), list(ondemand_pricing)[0]) -def _make_price_from_product(product, vendor): +def _get_product_datacenter(product, vendor): attributes = product["product"]["attributes"] location = attributes["location"] location_type = attributes["locationType"] - instance_type = attributes["instanceType"] try: datacenter = [ d for d in vendor.datacenters if location == d.name or location in d.aliases ][0] except IndexError: - logger.debug(f"No AWS region found for location: {location} [{location_type}]") + raise IndexError( + f"No AWS region found for location: {location} [{location_type}]" + ) + return datacenter + + +def _make_price_from_product(product, vendor): + attributes = product["product"]["attributes"] + location = attributes["location"] + location_type = attributes["locationType"] + instance_type = attributes["instanceType"] + + try: + datacenter = _get_product_datacenter(product, vendor) + except IndexError as e: + logger.debug(str(e)) return - except Exception as exc: - raise exc + try: server = [ d for d in vendor.servers if d.vendor == vendor and d.id == instance_type @@ -320,8 +334,7 @@ def 
_make_price_from_product(product, vendor): except IndexError: logger.debug(f"No server definition found for {instance_type} @ {location}") return - except Exception as exc: - raise exc + price = _extract_ondemand_price(product["terms"]) for zone in datacenter.zones: ServerPrice( @@ -332,9 +345,9 @@ def _make_price_from_product(product, vendor): # TODO ingest other OSs operating_system="Linux", allocation="ondemand", - price=price[0] * 100, + price=price[0], currency=price[1], - duration=Duration.HOUR, + unit=PriceUnit.HOUR, ) @@ -724,7 +737,6 @@ def get_server_prices(vendor): "tenancy": "Shared", }, ) - logger.debug(f"Found {len(products)} products") for product in products: # drop Gov regions if "GovCloud" not in product["product"]["attributes"]["location"]: @@ -744,9 +756,26 @@ def get_traffic_prices(vendor): def get_ipv4_prices(vendor): - for datacenter in vendor.datacenters: + products = _boto_get_products( + service_code="AmazonVPC", + filters={ + "group": "VPCPublicIPv4Address", + "usagetype": "EUS1-PublicIPv4:InUseAddress", + }, + ) + for product in products: + try: + datacenter = _get_product_datacenter(product, vendor) + except IndexError as e: + logger.debug(str(e)) + continue + price = _extract_ondemand_price(product["terms"]) Ipv4Price( - vendor=vendor, price=0.005, duration=Duration.HOUR, datacenter=datacenter + vendor=vendor, + price=price[0], + currency=price[1], + unit=PriceUnit.HOUR, + datacenter=datacenter, ) From 7a2183a0a9c256353320fc6e2fbb546f18fa6b5f Mon Sep 17 00:00:00 2001 From: "Gergely Daroczi (@daroczig)" Date: Tue, 20 Feb 2024 22:35:39 +0100 Subject: [PATCH 13/33] log module/fn name --- src/sc_crawler/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sc_crawler/cli.py b/src/sc_crawler/cli.py index 1039106f..2209bfc4 100644 --- a/src/sc_crawler/cli.py +++ b/src/sc_crawler/cli.py @@ -108,7 +108,7 @@ def custom_serializer(x): # enable logging channel = logging.StreamHandler() formatter = logging.Formatter( - 
"%(asctime)s - %(name)s - %(levelname)s - %(message)s" + "%(asctime)s - %(name)s/%(module)s:%(funcName)s - %(levelname)s - %(message)s" ) channel.setFormatter(formatter) logger.setLevel(log_level.value) From c77a1062b826322a2bda444a162651fc9333a0c6 Mon Sep 17 00:00:00 2001 From: "Gergely Daroczi (@daroczig)" Date: Tue, 20 Feb 2024 22:35:51 +0100 Subject: [PATCH 14/33] need to filter by decription that is not tied to region --- src/sc_crawler/vendors/aws.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sc_crawler/vendors/aws.py b/src/sc_crawler/vendors/aws.py index 2b077fe2..0bbfcfb3 100644 --- a/src/sc_crawler/vendors/aws.py +++ b/src/sc_crawler/vendors/aws.py @@ -760,7 +760,7 @@ def get_ipv4_prices(vendor): service_code="AmazonVPC", filters={ "group": "VPCPublicIPv4Address", - "usagetype": "EUS1-PublicIPv4:InUseAddress", + "groupDescription": "Hourly charge for In-use Public IPv4 Addresses", }, ) for product in products: From 529a45a30bb5af9978a944d7e4c4f2eafa196e80 Mon Sep 17 00:00:00 2001 From: "Gergely Daroczi (@daroczig)" Date: Tue, 20 Feb 2024 22:39:26 +0100 Subject: [PATCH 15/33] lint --- src/sc_crawler/vendors/aws.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/sc_crawler/vendors/aws.py b/src/sc_crawler/vendors/aws.py index 0bbfcfb3..1ff265d0 100644 --- a/src/sc_crawler/vendors/aws.py +++ b/src/sc_crawler/vendors/aws.py @@ -8,16 +8,16 @@ from cachier import cachier, set_default_params from ..logger import logger -from ..lookup import countries, compliance_frameworks +from ..lookup import compliance_frameworks, countries from ..schemas import ( - VendorComplianceLink, Datacenter, Disk, - PriceUnit, Gpu, Ipv4Price, + PriceUnit, Server, ServerPrice, + VendorComplianceLink, Zone, ) from ..utils import jsoned_hash @@ -318,7 +318,6 @@ def _get_product_datacenter(product, vendor): def _make_price_from_product(product, vendor): attributes = product["product"]["attributes"] location = attributes["location"] - 
location_type = attributes["locationType"] instance_type = attributes["instanceType"] try: From 5cb165ecd3bce276af237b3a1f029b4773793e6b Mon Sep 17 00:00:00 2001 From: "Gergely Daroczi (@daroczig)" Date: Tue, 20 Feb 2024 22:46:42 +0100 Subject: [PATCH 16/33] add only if vendor was not added yet, merge otherwise --- src/sc_crawler/cli.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/sc_crawler/cli.py b/src/sc_crawler/cli.py index 2209bfc4..43b5dffa 100644 --- a/src/sc_crawler/cli.py +++ b/src/sc_crawler/cli.py @@ -6,7 +6,7 @@ import typer from cachier import set_default_params -from sqlmodel import Session, SQLModel, create_engine +from sqlmodel import Session, SQLModel, create_engine, select from typing_extensions import Annotated from . import vendors as vendors_module @@ -131,7 +131,11 @@ def custom_serializer(x): for vendor in vendors: logger.info("Starting to collect data from vendor: " + vendor.id) vendor.get_all() - session.add(vendor) + # check if vendor is already present in the database and add or merge + if session.exec(select(Vendor).where(id == vendor.id)).all(): + session.merge(vendor) + else: + session.add(vendor) session.commit() From 0a939da3c8518d01bb1193924eff78b1f001aab0 Mon Sep 17 00:00:00 2001 From: "Gergely Daroczi (@daroczig)" Date: Wed, 21 Feb 2024 21:34:28 +0100 Subject: [PATCH 17/33] rename inserted_at to observed_at and update TS on update --- src/sc_crawler/schemas.py | 17 ++++++++++++----- src/sc_crawler/utils.py | 2 +- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/src/sc_crawler/schemas.py b/src/sc_crawler/schemas.py index 7cd34500..db5a8f83 100644 --- a/src/sc_crawler/schemas.py +++ b/src/sc_crawler/schemas.py @@ -32,7 +32,7 @@ class ScMetaModel(SQLModel.__class__): reuse the optional table and field descriptions. Table docstrings are truncated to first line. - - Append inserted_at column. + - Append observed_at column. 
""" def __init__(subclass, *args, **kwargs): @@ -49,8 +49,15 @@ def __init__(subclass, *args, **kwargs): comment = satable.columns[k].comment if v.description and comment is None: satable.columns[k].comment = v.description - # append inserted_at as last column - satable.append_column(Column("inserted_at", DateTime, default=datetime.utcnow)) + # append observed_at as last column + satable.append_column( + Column( + "observed_at", + DateTime, + default=datetime.utcnow, + onupdate=datetime.utcnow, + ) + ) class ScModel(SQLModel, metaclass=ScMetaModel): @@ -60,7 +67,7 @@ class ScModel(SQLModel, metaclass=ScMetaModel): - auto-generated table names using snake_case, - support for hashing table rows, - reuse description field of tables/columns as SQL comment, - - automatically append inserted_at column. + - automatically append observed_at column. """ @declared_attr # type: ignore @@ -74,7 +81,7 @@ def get_table_name(cls) -> str: return str(cls.__tablename__) @classmethod - def hash(cls, session, ignored: List[str] = ["inserted_at"]) -> dict: + def hash(cls, session, ignored: List[str] = ["observed_at"]) -> dict: pks = sorted([key.name for key in inspect(cls).primary_key]) rows = session.exec(statement=select(cls)) # no use of a generator as will need to serialize to JSON anyway diff --git a/src/sc_crawler/utils.py b/src/sc_crawler/utils.py index 9949a0fd..395960eb 100644 --- a/src/sc_crawler/utils.py +++ b/src/sc_crawler/utils.py @@ -33,7 +33,7 @@ class HashLevels(Enum): def hash_database( connection_string: str, level: HashLevels = HashLevels.DATABASE, - ignored: List[str] = ["inserted_at"], + ignored: List[str] = ["observed_at"], ) -> Union[str, dict]: """Hash the content of a database. 
From 07b6700dcce5034f5eea765a2f99c6e7c810e07f Mon Sep 17 00:00:00 2001 From: "Gergely Daroczi (@daroczig)" Date: Wed, 21 Feb 2024 23:02:06 +0100 Subject: [PATCH 18/33] _get_methods instead of private attr as that latter is not always initialized --- src/sc_crawler/schemas.py | 83 +++++++++++++++++++++++++++------------ 1 file changed, 57 insertions(+), 26 deletions(-) diff --git a/src/sc_crawler/schemas.py b/src/sc_crawler/schemas.py index db5a8f83..1454b02d 100644 --- a/src/sc_crawler/schemas.py +++ b/src/sc_crawler/schemas.py @@ -194,7 +194,8 @@ class Vendor(ScModel, table=True): status: Status = Status.ACTIVE # private attributes - _methods: ImportString[ModuleType] = PrivateAttr() + _methods: Optional[ImportString[ModuleType]] = PrivateAttr(default=None) + _session: Optional[Session] = PrivateAttr() # relations country: Country = Relationship(back_populates="vendors") @@ -210,56 +211,86 @@ class Vendor(ScModel, table=True): def __init__(self, **kwargs): super().__init__(**kwargs) + # SQLModel does not validates pydantic typing, + # only when writing to DB (much later in the process) + if not self.id: + raise ValueError("No vendor id provided") + if not self.name: + raise ValueError("No vendor name provided") + if not self.homepage: + raise ValueError("No vendor homepage provided") + if not self.country: + raise ValueError("No vendor country provided") + # make sure methods are provided + methods = self._get_methods().__dir__() + for method in [ + "get_compliance_frameworks", + "get_datacenters", + "get_zones", + "get_servers", + "get_server_prices", + "get_server_prices_spot", + "get_storage_prices", + "get_traffic_prices", + "get_ipv4_prices", + ]: + if method not in methods: + raise NotImplementedError( + f"Unsupported '{self.id}' vendor: missing '{method}' method." + ) + + def _get_methods(self): + # private attributes are not (always) initialized correctly by SQLmodel + # e.g. 
the attribute is missing altogether when loaded from DB + # https://github.com/tiangolo/sqlmodel/issues/149 try: - # SQLModel does not validates pydantic typing, - # only when writing to DB (much later in the process) - if not self.id: - raise ValueError("No vendor id provided") - if not self.name: - raise ValueError("No vendor name provided") - if not self.homepage: - raise ValueError("No vendor homepage provided") - if not self.country: - raise ValueError("No vendor country provided") - vendor_module = __name__.split(".")[0] + ".vendors." + self.id - self._methods = import_module(vendor_module) - except ValueError as exc: - raise exc - except Exception as exc: - raise NotImplementedError("Unsupported vendor") from exc + hasattr(self, "_methods") + except Exception: + self._methods = None + if not self._methods: + try: + vendor_module = ".".join( + [__name__.split(".", maxsplit=1)[0], "vendors", self.id] + ) + self._methods = import_module(vendor_module) + except Exception as exc: + raise NotImplementedError( + f"Unsupported '{self.id}' vendor: no methods defined."
+ ) from exc + return self._methods def get_compliance_frameworks(self): """Get the vendor's all compliance frameworks.""" - self._methods.get_compliance_frameworks(self) + self._get_methods().get_compliance_frameworks(self) def get_datacenters(self): """Get the vendor's all datacenters.""" - self._methods.get_datacenters(self) + self._get_methods().get_datacenters(self) def get_zones(self): """Get all the zones in the vendor's datacenters.""" - self._methods.get_zones(self) + self._get_methods().get_zones(self) def get_servers(self): """Get the vendor's all server types.""" - self._methods.get_servers(self) + self._get_methods().get_servers(self) def get_server_prices(self): """Get the current standard/ondemand/reserved prices of all server types.""" - self._methods.get_server_prices(self) + self._get_methods().get_server_prices(self) def get_server_prices_spot(self): """Get the current spot prices of all server types.""" - self._methods.get_server_prices_spot(self) + self._get_methods().get_server_prices_spot(self) def get_storage_prices(self): - self._methods.get_storage_prices(self) + self._get_methods().get_storage_prices(self) def get_traffic_prices(self): - self._methods.get_traffic_prices(self) + self._get_methods().get_traffic_prices(self) def get_ipv4_prices(self): - self._methods.get_ipv4_prices(self) + self._get_methods().get_ipv4_prices(self) def get_prices(self): self.get_server_prices() From 10ba4bd43d3b9d6ce8c50c3979679b95ee1679cf Mon Sep 17 00:00:00 2001 From: "Gergely Daroczi (@daroczig)" Date: Wed, 21 Feb 2024 23:24:06 +0100 Subject: [PATCH 19/33] log start/end of getter fns --- src/sc_crawler/logger.py | 12 ++++++++++++ src/sc_crawler/schemas.py | 10 ++++++++++ 2 files changed, 22 insertions(+) diff --git a/src/sc_crawler/logger.py b/src/sc_crawler/logger.py index 853127ca..5d08acf8 100644 --- a/src/sc_crawler/logger.py +++ b/src/sc_crawler/logger.py @@ -2,3 +2,15 @@ logger = logging.getLogger("sc_crawler") 
logger.addHandler(logging.NullHandler()) + + +def log_start_end(func): + """Log the start and end of the decorated function.""" + + def wrap(*args, **kwargs): + logger.debug(f"Starting {func.__name__}") + result = func(*args, **kwargs) + logger.debug(f"Finished {func.__name__}") + return result + + return wrap diff --git a/src/sc_crawler/schemas.py b/src/sc_crawler/schemas.py index 1454b02d..b8956113 100644 --- a/src/sc_crawler/schemas.py +++ b/src/sc_crawler/schemas.py @@ -19,6 +19,7 @@ from sqlalchemy.orm import declared_attr from sqlmodel import JSON, Column, Field, Relationship, SQLModel, select +from .logger import logger, log_start_end from .str import snake_case @@ -259,36 +260,45 @@ def _get_methods(self): ) from exc return self._methods + @log_start_end def get_compliance_frameworks(self): """Get the vendor's all compliance frameworks.""" self._get_methods().get_compliance_frameworks(self) + @log_start_end def get_datacenters(self): """Get the vendor's all datacenters.""" self._get_methods().get_datacenters(self) + @log_start_end def get_zones(self): """Get all the zones in the vendor's datacenters.""" self._get_methods().get_zones(self) + @log_start_end def get_servers(self): """Get the vendor's all server types.""" self._get_methods().get_servers(self) + @log_start_end def get_server_prices(self): """Get the current standard/ondemand/reserved prices of all server types.""" self._get_methods().get_server_prices(self) + @log_start_end def get_server_prices_spot(self): """Get the current spot prices of all server types.""" self._get_methods().get_server_prices_spot(self) + @log_start_end def get_storage_prices(self): self._get_methods().get_storage_prices(self) + @log_start_end def get_traffic_prices(self): self._get_methods().get_traffic_prices(self) + @log_start_end def get_ipv4_prices(self): self._get_methods().get_ipv4_prices(self) From 05b1919effc084c5cf9a008938a1a1bc662a4e2e Mon Sep 17 00:00:00 2001 From: "Gergely Daroczi (@daroczig)" Date: Wed, 21 
Feb 2024 23:25:45 +0100 Subject: [PATCH 20/33] auto-merge all ScModel instances to the database --- src/sc_crawler/schemas.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/src/sc_crawler/schemas.py b/src/sc_crawler/schemas.py index b8956113..50ed2dbf 100644 --- a/src/sc_crawler/schemas.py +++ b/src/sc_crawler/schemas.py @@ -17,7 +17,7 @@ from sqlalchemy import DateTime from sqlalchemy.inspection import inspect from sqlalchemy.orm import declared_attr -from sqlmodel import JSON, Column, Field, Relationship, SQLModel, select +from sqlmodel import JSON, Column, Field, Relationship, Session, SQLModel, select from .logger import logger, log_start_end from .str import snake_case @@ -98,6 +98,17 @@ def hash(cls, session, ignored: List[str] = ["observed_at"]) -> dict: hashes[rowkeys] = rowhash return hashes + def __init__(self, *args, **kwargs): + """Merge instance with the database if present. + + Checking if there's a parent vendor, and then try to sync the + object using the parent's session private attribute.
+ """ + super().__init__(*args, **kwargs) + if hasattr(self, "vendor"): + if hasattr(self.vendor, "_session"): + self.vendor.merge_dependent(self) + class Json(BaseModel): """Custom base SQLModel class that supports dumping as JSON.""" @@ -316,6 +327,13 @@ def get_all(self): self.get_servers() self.get_prices() + def set_session(self, session): + self._session = session + + def merge_dependent(self, obj): + if self._session: + self._session.merge(obj) + class Datacenter(ScModel, table=True): id: str = Field(primary_key=True) From 79c06c94e51cc596b9c4096891c05656279406a9 Mon Sep 17 00:00:00 2001 From: "Gergely Daroczi (@daroczig)" Date: Wed, 21 Feb 2024 23:38:27 +0100 Subject: [PATCH 21/33] log vendor name when available --- src/sc_crawler/logger.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/sc_crawler/logger.py b/src/sc_crawler/logger.py index 5d08acf8..236abe76 100644 --- a/src/sc_crawler/logger.py +++ b/src/sc_crawler/logger.py @@ -8,9 +8,14 @@ def log_start_end(func): """Log the start and end of the decorated function.""" def wrap(*args, **kwargs): - logger.debug(f"Starting {func.__name__}") + try: + self = args[0] + fname = f"{self.id}/{func.__name__}" + except Exception: + fname = func.__name__ + logger.debug("Starting %s", fname) result = func(*args, **kwargs) - logger.debug(f"Finished {func.__name__}") + logger.debug("Finished %s", fname) return result return wrap From a26cb6e728b531db4f2e07a857cbca534667d137 Mon Sep 17 00:00:00 2001 From: "Gergely Daroczi (@daroczig)" Date: Wed, 21 Feb 2024 23:39:46 +0100 Subject: [PATCH 22/33] simplify logic: do not return --- src/sc_crawler/vendors/aws.py | 55 ++++++++++------------------------- 1 file changed, 16 insertions(+), 39 deletions(-) diff --git a/src/sc_crawler/vendors/aws.py b/src/sc_crawler/vendors/aws.py index 1ff265d0..0c8116ef 100644 --- a/src/sc_crawler/vendors/aws.py +++ b/src/sc_crawler/vendors/aws.py @@ -1,6 +1,5 @@ import json import re -from collections import 
ChainMap from datetime import datetime, timedelta from itertools import chain @@ -355,9 +354,10 @@ def _make_price_from_product(product, vendor): def get_compliance_frameworks(vendor): - for cf in ["hipaa", "soc2t2"]: + for compliance_framework in ["hipaa", "soc2t2"]: VendorComplianceLink( - vendor=vendor, compliance_framework=compliance_frameworks[cf] + vendor=vendor, + compliance_framework_id=compliance_framework, ) @@ -676,49 +676,26 @@ def get_datacenters(vendor): if datacenter.id not in active_regions: datacenter.status = "inactive" - # filter for datacenters enabled for the account - datacenters = [ - datacenter - for datacenter in datacenters - if datacenter.id in [region["RegionName"] for region in regions] - ] - - # TODO do we really need to return enything? standardize! - return datacenters - def get_zones(vendor): """List all available AWS availability zones.""" - zones = [ - [ - Zone( - id=zone["ZoneId"], - name=zone["ZoneName"], - datacenter=datacenter, - vendor=vendor, - ) - for zone in _boto_describe_availability_zones(datacenter.id) - ] - for datacenter in vendor.datacenters - if datacenter.status == "active" - ] - # TODO check if zone is active - return ChainMap(*zones) + for datacenter in vendor.datacenters: + if datacenter.status == "active": + for zone in _boto_describe_availability_zones(datacenter.id): + Zone( + id=zone["ZoneId"], + name=zone["ZoneName"], + datacenter=datacenter, + vendor=vendor, + ) def get_servers(vendor): # TODO drop this in favor of pricing.get_products, as it has info e.g. on instanceFamily # although other fields are messier (e.g. 
extract memory from string) - regions = [ - datacenter.id - for datacenter in vendor.datacenters - if datacenter.status == "active" - ] - # might be instance types specific to a few or even a single region - instance_types = [ - _list_instance_types_of_region(region, vendor) for region in regions - ] - return list(chain(*instance_types)) + for datacenter in vendor.datacenters: + if datacenter.status == "active": + _list_instance_types_of_region(datacenter.id, vendor) def get_server_prices(vendor): @@ -771,10 +748,10 @@ def get_ipv4_prices(vendor): price = _extract_ondemand_price(product["terms"]) Ipv4Price( vendor=vendor, + datacenter=datacenter, price=price[0], currency=price[1], unit=PriceUnit.HOUR, - datacenter=datacenter, ) From 49beed1d503d0804df6c534dd17822cb1378fa09 Mon Sep 17 00:00:00 2001 From: "Gergely Daroczi (@daroczig)" Date: Wed, 21 Feb 2024 23:39:59 +0100 Subject: [PATCH 23/33] need to merge after status change --- src/sc_crawler/vendors/aws.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/sc_crawler/vendors/aws.py b/src/sc_crawler/vendors/aws.py index 0c8116ef..c3e6906e 100644 --- a/src/sc_crawler/vendors/aws.py +++ b/src/sc_crawler/vendors/aws.py @@ -675,6 +675,8 @@ def get_datacenters(vendor): for datacenter in datacenters: if datacenter.id not in active_regions: datacenter.status = "inactive" + # note the change of status in the session + datacenter.vendor.merge_dependent(datacenter) def get_zones(vendor): From 6f17e5ea68022e7fffd6227691627bdc12b19835 Mon Sep 17 00:00:00 2001 From: "Gergely Daroczi (@daroczig)" Date: Wed, 21 Feb 2024 23:43:30 +0100 Subject: [PATCH 24/33] provide all required getters as dummy fns --- src/sc_crawler/vendors/gcp.py | 36 +++++++++++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/src/sc_crawler/vendors/gcp.py b/src/sc_crawler/vendors/gcp.py index 8b96d270..2341b4fd 100644 --- a/src/sc_crawler/vendors/gcp.py +++ b/src/sc_crawler/vendors/gcp.py @@ -1,2 +1,34 @@ -def 
get_instance_types(*args, **kwargs): - return [] +def get_compliance_frameworks(vendor): + pass + + +def get_datacenters(vendor): + pass + + +def get_zones(vendor): + pass + + +def get_servers(vendor): + pass + + +def get_server_prices(vendor): + pass + + +def get_server_prices_spot(vendor): + pass + + +def get_storage_prices(vendor): + pass + + +def get_traffic_prices(vendor): + pass + + +def get_ipv4_prices(vendor): + pass From 7b48e8af8a4a1ca1b487bc67c4c64c4e7228b429 Mon Sep 17 00:00:00 2001 From: "Gergely Daroczi (@daroczig)" Date: Wed, 21 Feb 2024 23:43:49 +0100 Subject: [PATCH 25/33] note use id instead of relationship --- src/sc_crawler/vendors/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sc_crawler/vendors/README.md b/src/sc_crawler/vendors/README.md index f0a92d52..a085db79 100644 --- a/src/sc_crawler/vendors/README.md +++ b/src/sc_crawler/vendors/README.md @@ -5,7 +5,7 @@ For example, `aws.py` provides functions to be used by its `Vendor` instance, ca Each file should provide the below functions: -- `get_compliance_frameworks`: Define `VendorComplianceLink` instances to describe which frameworks the vendor complies with. Optionally include references in the `comment` field. +- `get_compliance_frameworks`: Define `VendorComplianceLink` instances to describe which frameworks the vendor complies with. Optionally include references in the `comment` field. To avoid duplicating `ComplianceFramework` instances, easiest is to use the `compliance_framework_id` field instead of the `compliance_framework` relationship. - `get_datacenters`: Define `Datacenter` instances with location, energy source etc for each region/datacenter the vendor has. - `get_zones`: Define a `Zone` instance for each availability zone of the vendor in each datacenter. - `get_servers`: Define `Server` instances for the vendor's server/instance types. 
From 324237d835f3834cf04ff2147314658df96d3896 Mon Sep 17 00:00:00 2001 From: "Gergely Daroczi (@daroczig)" Date: Wed, 21 Feb 2024 23:47:29 +0100 Subject: [PATCH 26/33] add/merge static objects to database --- src/sc_crawler/cli.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/sc_crawler/cli.py b/src/sc_crawler/cli.py index 43b5dffa..68732d52 100644 --- a/src/sc_crawler/cli.py +++ b/src/sc_crawler/cli.py @@ -11,7 +11,8 @@ from . import vendors as vendors_module from .logger import logger -from .schemas import Vendor +from .lookup import compliance_frameworks, countries +from .schemas import ComplianceFramework, Country, Vendor from .utils import hash_database supported_vendors = [ @@ -128,6 +129,13 @@ def custom_serializer(x): engine = create_engine(connection_string, json_serializer=custom_serializer) SQLModel.metadata.create_all(engine) with Session(engine) as session: + # add/merge static objects to database + for compliance_framework in compliance_frameworks.values(): + session.merge(compliance_framework) + for country in countries.values(): + session.merge(country) + + # get data for each vendor and then add/merge to database for vendor in vendors: logger.info("Starting to collect data from vendor: " + vendor.id) vendor.get_all() From 2872fae25d218b30ef6dccec5ba1df64c915a6ae Mon Sep 17 00:00:00 2001 From: "Gergely Daroczi (@daroczig)" Date: Wed, 21 Feb 2024 23:48:08 +0100 Subject: [PATCH 27/33] merge vendor --- src/sc_crawler/cli.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/sc_crawler/cli.py b/src/sc_crawler/cli.py index 68732d52..39a701ae 100644 --- a/src/sc_crawler/cli.py +++ b/src/sc_crawler/cli.py @@ -134,16 +134,13 @@ def custom_serializer(x): session.merge(compliance_framework) for country in countries.values(): session.merge(country) - # get data for each vendor and then add/merge to database for vendor in vendors: logger.info("Starting to collect data from vendor: " + vendor.id) + 
vendor = session.merge(vendor) + vendor.set_session(session) vendor.get_all() - # check if vendor is already present in the database and add or merge - if session.exec(select(Vendor).where(id == vendor.id)).all(): - session.merge(vendor) - else: - session.add(vendor) + session.merge(vendor) session.commit() From 721b459c8f1af2100e52c80387927efb7bbd5567 Mon Sep 17 00:00:00 2001 From: "Gergely Daroczi (@daroczig)" Date: Thu, 22 Feb 2024 00:07:02 +0100 Subject: [PATCH 28/33] optionally filter which table is to be updated --- src/sc_crawler/cli.py | 30 +++++++++++++++++++++++++++--- src/sc_crawler/schemas.py | 14 -------------- src/sc_crawler/vendors/README.md | 2 -- 3 files changed, 27 insertions(+), 19 deletions(-) diff --git a/src/sc_crawler/cli.py b/src/sc_crawler/cli.py index 39a701ae..fa1c2237 100644 --- a/src/sc_crawler/cli.py +++ b/src/sc_crawler/cli.py @@ -6,13 +6,13 @@ import typer from cachier import set_default_params -from sqlmodel import Session, SQLModel, create_engine, select +from sqlmodel import Session, SQLModel, create_engine from typing_extensions import Annotated from . import vendors as vendors_module from .logger import logger from .lookup import compliance_frameworks, countries -from .schemas import ComplianceFramework, Country, Vendor +from .schemas import Vendor from .utils import hash_database supported_vendors = [ @@ -37,6 +37,9 @@ log_levels = list(logging._nameToLevel.keys()) LogLevels = Enum("LOGLEVELS", {k: k for k in log_levels}) +supported_tables = [m[4:] for m in dir(Vendor) if m.startswith("get_")] +Tables = Enum("TABLES", {k: k for k in supported_tables}) + @cli.command() def schema(dialect: Engines): @@ -77,6 +80,10 @@ def pull( List[Vendors], typer.Option(help="Exclude specific vendor. Can be specified multiple times."), ] = [], + update_table: Annotated[ + List[Tables], + typer.Option(help="Tables to be updated. 
Can be specified multiple times."), + ] = supported_tables, log_level: Annotated[ LogLevels, typer.Option(help="Log level threshold.") ] = LogLevels.INFO.value, # TODO drop .value after updating Enum to StrEnum in Python3.11 @@ -139,7 +146,24 @@ def custom_serializer(x): logger.info("Starting to collect data from vendor: " + vendor.id) vendor = session.merge(vendor) vendor.set_session(session) - vendor.get_all() + if Tables.compliance_frameworks in update_table: + vendor.get_compliance_frameworks() + if Tables.datacenters in update_table: + vendor.get_datacenters() + if Tables.zones in update_table: + vendor.get_zones() + if Tables.servers in update_table: + vendor.get_servers() + if Tables.server_prices in update_table: + vendor.get_server_prices() + if Tables.server_prices_spot in update_table: + vendor.get_server_prices_spot() + if Tables.storage_prices in update_table: + vendor.get_storage_prices() + if Tables.traffic_prices in update_table: + vendor.get_traffic_prices() + if Tables.ipv4_prices in update_table: + vendor.get_ipv4_prices() session.merge(vendor) session.commit() diff --git a/src/sc_crawler/schemas.py b/src/sc_crawler/schemas.py index 50ed2dbf..fde249e5 100644 --- a/src/sc_crawler/schemas.py +++ b/src/sc_crawler/schemas.py @@ -313,20 +313,6 @@ def get_traffic_prices(self): def get_ipv4_prices(self): self._get_methods().get_ipv4_prices(self) - def get_prices(self): - self.get_server_prices() - self.get_server_prices_spot() - self.get_storage_prices() - self.get_traffic_prices() - self.get_ipv4_prices() - - def get_all(self): - self.get_compliance_frameworks() - self.get_datacenters() - self.get_zones() - self.get_servers() - self.get_prices() - def set_session(self, session): self._session = session diff --git a/src/sc_crawler/vendors/README.md b/src/sc_crawler/vendors/README.md index a085db79..4ad53809 100644 --- a/src/sc_crawler/vendors/README.md +++ b/src/sc_crawler/vendors/README.md @@ -17,8 +17,6 @@ Each file should provide the below functions: 
Each function will be picked up as the related `Vendor` instance's instance methods, so each function should take a single argument, that is the `Vendor` instance. No need to return the objects -- it's enough to define the above-mentioned instances. -There are also a `get_prices` and `get_all` instance method defined for each `Vendor`, which wrappers call the pricing-related or all the above helpers in the above-listed order. - If a helper is not needed (e.g. another helper already provides its output, or there are no spot prices), it is still required, but can return early, e.g. if `Zone` objects were populated by `get_datacenters` already, do something like: ```python From c8b142f14f12ddded7a3b78fa7af8e911d426d69 Mon Sep 17 00:00:00 2001 From: "Gergely Daroczi (@daroczig)" Date: Thu, 22 Feb 2024 00:08:34 +0100 Subject: [PATCH 29/33] drop unused imports --- src/sc_crawler/schemas.py | 2 +- src/sc_crawler/vendors/aws.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sc_crawler/schemas.py b/src/sc_crawler/schemas.py index fde249e5..2f706ae5 100644 --- a/src/sc_crawler/schemas.py +++ b/src/sc_crawler/schemas.py @@ -19,7 +19,7 @@ from sqlalchemy.orm import declared_attr from sqlmodel import JSON, Column, Field, Relationship, Session, SQLModel, select -from .logger import logger, log_start_end +from .logger import log_start_end from .str import snake_case diff --git a/src/sc_crawler/vendors/aws.py b/src/sc_crawler/vendors/aws.py index c3e6906e..a74d674a 100644 --- a/src/sc_crawler/vendors/aws.py +++ b/src/sc_crawler/vendors/aws.py @@ -7,7 +7,7 @@ from cachier import cachier, set_default_params from ..logger import logger -from ..lookup import compliance_frameworks, countries +from ..lookup import countries from ..schemas import ( Datacenter, Disk, From 98439f9bb971499e16a1344bc554f8dd544d25ce Mon Sep 17 00:00:00 2001 From: "Gergely Daroczi (@daroczig)" Date: Thu, 22 Feb 2024 22:23:23 +0100 Subject: [PATCH 30/33] make sure all fetched records 
have STATUS and reset to INACTIVE before pull --- src/sc_crawler/schemas.py | 74 +++++++++++++++++++++++++++++++-------- 1 file changed, 59 insertions(+), 15 deletions(-) diff --git a/src/sc_crawler/schemas.py b/src/sc_crawler/schemas.py index 2f706ae5..ad56d8a0 100644 --- a/src/sc_crawler/schemas.py +++ b/src/sc_crawler/schemas.py @@ -14,7 +14,7 @@ ImportString, PrivateAttr, ) -from sqlalchemy import DateTime +from sqlalchemy import DateTime, update from sqlalchemy.inspection import inspect from sqlalchemy.orm import declared_attr from sqlmodel import JSON, Column, Field, Relationship, Session, SQLModel, select @@ -110,6 +110,18 @@ def __init__(self, *args, **kwargs): self.vendor.merge_dependent(self) +class Status(str, Enum): + ACTIVE = "active" + INACTIVE = "inactive" + + +class HasStatus(ScModel): + status: Status = Field( + default=Status.ACTIVE, + description="Status of the resource (active or inactive).", + ) + + class Json(BaseModel): """Custom base SQLModel class that supports dumping as JSON.""" @@ -117,11 +129,6 @@ def __json__(self): return self.model_dump() -class Status(str, Enum): - ACTIVE = "active" - INACTIVE = "inactive" - - class Country(ScModel, table=True): """Country and continent mapping.""" @@ -136,13 +143,15 @@ class Country(ScModel, table=True): datacenters: List["Datacenter"] = Relationship(back_populates="country") -class VendorComplianceLink(ScModel, table=True): +class VendorComplianceLinkBase(ScModel): vendor_id: str = Field(foreign_key="vendor.id", primary_key=True) compliance_framework_id: str = Field( foreign_key="compliance_framework.id", primary_key=True ) comment: Optional[str] = None + +class VendorComplianceLink(HasStatus, VendorComplianceLinkBase, table=True): vendor: "Vendor" = Relationship(back_populates="compliance_framework_links") compliance_framework: "ComplianceFramework" = Relationship( back_populates="vendor_links" @@ -271,55 +280,85 @@ def _get_methods(self): ) from exc return self._methods + def set_session(self, 
session): + """Attach a SQLModel session to use for merging dependent objects into the database.""" + self._session = session + + def merge_dependent(self, obj): + """Merge an object into the Vendor's SQLModel session (when available).""" + if self._session: + self._session.merge(obj) + + def set_table_rows_inactive(self, model: str, *args) -> None: + """Set this vendor's records to INACTIVE in a table + + Positional arguments can be used to pass further filters + (besides the default model.vendor_id filter) referencing the + model object with SQLModel syntax, e.g. + + >>> aws.set_table_rows_inactive(ServerPrice, ServerPrice.price < 10) + """ + if self._session: + query = update(model).where(model.vendor_id == self.id) + for arg in args: + query = query.where(arg) + self._session.execute(query.values(status=Status.INACTIVE)) + @log_start_end def get_compliance_frameworks(self): """Get the vendor's all compliance frameworks.""" + self.set_table_rows_inactive(VendorComplianceLink) self._get_methods().get_compliance_frameworks(self) @log_start_end def get_datacenters(self): """Get the vendor's all datacenters.""" + self.set_table_rows_inactive(Datacenter) self._get_methods().get_datacenters(self) @log_start_end def get_zones(self): """Get all the zones in the vendor's datacenters.""" + self.set_table_rows_inactive(Zone) self._get_methods().get_zones(self) @log_start_end def get_servers(self): """Get the vendor's all server types.""" + self.set_table_rows_inactive(Server) self._get_methods().get_servers(self) @log_start_end def get_server_prices(self): """Get the current standard/ondemand/reserved prices of all server types.""" + self.set_table_rows_inactive( + ServerPrice, ServerPrice.allocation != Allocation.SPOT + ) self._get_methods().get_server_prices(self) @log_start_end def get_server_prices_spot(self): """Get the current spot prices of all server types.""" + self.set_table_rows_inactive( + ServerPrice, ServerPrice.allocation == Allocation.SPOT + ) 
self._get_methods().get_server_prices_spot(self) @log_start_end def get_storage_prices(self): + self.set_table_rows_inactive(StoragePrice) self._get_methods().get_storage_prices(self) @log_start_end def get_traffic_prices(self): + self.set_table_rows_inactive(TrafficPrice) self._get_methods().get_traffic_prices(self) @log_start_end def get_ipv4_prices(self): + self.set_table_rows_inactive(Ipv4Price) self._get_methods().get_ipv4_prices(self) - def set_session(self, session): - self._session = session - - def merge_dependent(self, obj): - if self._session: - self._session.merge(obj) - class Datacenter(ScModel, table=True): id: str = Field(primary_key=True) @@ -547,6 +586,7 @@ class PriceTier(Json): # helper classes to inherit for most commonly used fields +# TODO rewrite above classes using helper classes as well class HasVendorPK(ScModel): @@ -573,7 +613,7 @@ class HasTraffic(ScModel): traffic_id: str = Field(foreign_key="traffic.id", primary_key=True) -class HasPriceFields(ScModel): +class HasPriceFieldsBase(ScModel): unit: PriceUnit # set to max price if tiered price: float @@ -584,6 +624,10 @@ class HasPriceFields(ScModel): currency: str = "USD" +class HasPriceFields(HasStatus, HasPriceFieldsBase): + pass + + class ServerPriceExtraFields(ScModel): operating_system: str allocation: Allocation = Allocation.ONDEMAND From f241d75abb9cc4759b7e9b9ec6384cbad1f87839 Mon Sep 17 00:00:00 2001 From: "Gergely Daroczi (@daroczig)" Date: Thu, 22 Feb 2024 22:32:13 +0100 Subject: [PATCH 31/33] skip doctest without session --- src/sc_crawler/schemas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sc_crawler/schemas.py b/src/sc_crawler/schemas.py index ad56d8a0..aa4507c3 100644 --- a/src/sc_crawler/schemas.py +++ b/src/sc_crawler/schemas.py @@ -296,7 +296,7 @@ def set_table_rows_inactive(self, model: str, *args) -> None: (besides the default model.vendor_id filter) referencing the model object with SQLModel syntax, e.g. 
- >>> aws.set_table_rows_inactive(ServerPrice, ServerPrice.price < 10) + >>> aws.set_table_rows_inactive(ServerPrice, ServerPrice.price < 10) # doctest: +SKIP """ if self._session: query = update(model).where(model.vendor_id == self.id) From 9a58792b75d916ed62fd178792feb03f4e67fee7 Mon Sep 17 00:00:00 2001 From: "Gergely Daroczi (@daroczig)" Date: Fri, 23 Feb 2024 10:09:29 +0100 Subject: [PATCH 32/33] note SAWarning for future investigation --- src/sc_crawler/schemas.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/sc_crawler/schemas.py b/src/sc_crawler/schemas.py index aa4507c3..d3ca8fe4 100644 --- a/src/sc_crawler/schemas.py +++ b/src/sc_crawler/schemas.py @@ -287,6 +287,8 @@ def set_session(self, session): def merge_dependent(self, obj): """Merge an object into the Vendor's SQLModel session (when available).""" if self._session: + # TODO investigate SAWarning + # on obj associated with vendor before added to session? self._session.merge(obj) def set_table_rows_inactive(self, model: str, *args) -> None: From 8a2efa9d86b72a2a733db1b9d01a045785dbff4a Mon Sep 17 00:00:00 2001 From: "Gergely Daroczi (@daroczig)" Date: Fri, 23 Feb 2024 10:13:41 +0100 Subject: [PATCH 33/33] get prefix -> inventory to reduce chance of conflict with other helpers --- README.md | 33 ------------------- src/sc_crawler/cli.py | 20 ++++++------ src/sc_crawler/schemas.py | 54 ++++++++++++++++---------------- src/sc_crawler/vendors/README.md | 42 ++++++++++++------------- src/sc_crawler/vendors/aws.py | 18 +++++------ src/sc_crawler/vendors/gcp.py | 18 +++++------ 6 files changed, 76 insertions(+), 109 deletions(-) diff --git a/README.md b/README.md index 7839b514..803679d0 100644 --- a/README.md +++ b/README.md @@ -92,36 +92,3 @@ server = session.exec(select(Server).where(Server.id == 'trn1.32xlarge')).one() pp(server) pp(server.vendor) ``` - -Lower level access examples: - -```py -from sc_crawler.vendors import aws - -# enable persistent caching of AWS queries -from cachier import 
set_default_params -set_default_params(caching_enabled=True) - -# fetch data -aws.get_all() # slow to query all instance types in all regions - -# look around -aws.datacenters -aws.zones - -# pretty printed objects -from rich import print as pp -pp(aws) -pp(aws.datacenters) -pp(aws.servers[0]) -``` - -Debug raw AWS responses: - -```py -products = aws._methods.get_products() -pp(products[1]["product"]) - -instance_types = aws._methods.describe_instance_types(region="us-west-2") -pp(instance_types[1]) -``` diff --git a/src/sc_crawler/cli.py b/src/sc_crawler/cli.py index fa1c2237..3c685f46 100644 --- a/src/sc_crawler/cli.py +++ b/src/sc_crawler/cli.py @@ -37,7 +37,7 @@ log_levels = list(logging._nameToLevel.keys()) LogLevels = Enum("LOGLEVELS", {k: k for k in log_levels}) -supported_tables = [m[4:] for m in dir(Vendor) if m.startswith("get_")] +supported_tables = [m[10:] for m in dir(Vendor) if m.startswith("inventory_")] Tables = Enum("TABLES", {k: k for k in supported_tables}) @@ -147,23 +147,23 @@ def custom_serializer(x): vendor = session.merge(vendor) vendor.set_session(session) if Tables.compliance_frameworks in update_table: - vendor.get_compliance_frameworks() + vendor.inventory_compliance_frameworks() if Tables.datacenters in update_table: - vendor.get_datacenters() + vendor.inventory_datacenters() if Tables.zones in update_table: - vendor.get_zones() + vendor.inventory_zones() if Tables.servers in update_table: - vendor.get_servers() + vendor.inventory_servers() if Tables.server_prices in update_table: - vendor.get_server_prices() + vendor.inventory_server_prices() if Tables.server_prices_spot in update_table: - vendor.get_server_prices_spot() + vendor.inventory_server_prices_spot() if Tables.storage_prices in update_table: - vendor.get_storage_prices() + vendor.inventory_storage_prices() if Tables.traffic_prices in update_table: - vendor.get_traffic_prices() + vendor.inventory_traffic_prices() if Tables.ipv4_prices in update_table: - 
vendor.get_ipv4_prices() + vendor.inventory_ipv4_prices() session.merge(vendor) session.commit() diff --git a/src/sc_crawler/schemas.py b/src/sc_crawler/schemas.py index d3ca8fe4..7452e3aa 100644 --- a/src/sc_crawler/schemas.py +++ b/src/sc_crawler/schemas.py @@ -245,15 +245,15 @@ def __init__(self, **kwargs): # make sure methods are provided methods = self._get_methods().__dir__() for method in [ - "get_compliance_frameworks", - "get_datacenters", - "get_zones", - "get_servers", - "get_server_prices", - "get_server_prices_spot", - "get_storage_prices", - "get_traffic_prices", - "get_ipv4_prices", + "inventory_compliance_frameworks", + "inventory_datacenters", + "inventory_zones", + "inventory_servers", + "inventory_server_prices", + "inventory_server_prices_spot", + "inventory_storage_prices", + "inventory_traffic_prices", + "inventory_ipv4_prices", ]: if method not in methods: raise NotImplementedError( @@ -307,59 +307,59 @@ def set_table_rows_inactive(self, model: str, *args) -> None: self._session.execute(query.values(status=Status.INACTIVE)) @log_start_end - def get_compliance_frameworks(self): + def inventory_compliance_frameworks(self): """Get the vendor's all compliance frameworks.""" self.set_table_rows_inactive(VendorComplianceLink) - self._get_methods().get_compliance_frameworks(self) + self._get_methods().inventory_compliance_frameworks(self) @log_start_end - def get_datacenters(self): + def inventory_datacenters(self): """Get the vendor's all datacenters.""" self.set_table_rows_inactive(Datacenter) - self._get_methods().get_datacenters(self) + self._get_methods().inventory_datacenters(self) @log_start_end - def get_zones(self): + def inventory_zones(self): """Get all the zones in the vendor's datacenters.""" self.set_table_rows_inactive(Zone) - self._get_methods().get_zones(self) + self._get_methods().inventory_zones(self) @log_start_end - def get_servers(self): + def inventory_servers(self): """Get the vendor's all server types.""" 
self.set_table_rows_inactive(Server) - self._get_methods().get_servers(self) + self._get_methods().inventory_servers(self) @log_start_end - def get_server_prices(self): + def inventory_server_prices(self): """Get the current standard/ondemand/reserved prices of all server types.""" self.set_table_rows_inactive( ServerPrice, ServerPrice.allocation != Allocation.SPOT ) - self._get_methods().get_server_prices(self) + self._get_methods().inventory_server_prices(self) @log_start_end - def get_server_prices_spot(self): + def inventory_server_prices_spot(self): """Get the current spot prices of all server types.""" self.set_table_rows_inactive( ServerPrice, ServerPrice.allocation == Allocation.SPOT ) - self._get_methods().get_server_prices_spot(self) + self._get_methods().inventory_server_prices_spot(self) @log_start_end - def get_storage_prices(self): + def inventory_storage_prices(self): self.set_table_rows_inactive(StoragePrice) - self._get_methods().get_storage_prices(self) + self._get_methods().inventory_storage_prices(self) @log_start_end - def get_traffic_prices(self): + def inventory_traffic_prices(self): self.set_table_rows_inactive(TrafficPrice) - self._get_methods().get_traffic_prices(self) + self._get_methods().inventory_traffic_prices(self) @log_start_end - def get_ipv4_prices(self): + def inventory_ipv4_prices(self): self.set_table_rows_inactive(Ipv4Price) - self._get_methods().get_ipv4_prices(self) + self._get_methods().inventory_ipv4_prices(self) class Datacenter(ScModel, table=True): diff --git a/src/sc_crawler/vendors/README.md b/src/sc_crawler/vendors/README.md index 4ad53809..5dc2f518 100644 --- a/src/sc_crawler/vendors/README.md +++ b/src/sc_crawler/vendors/README.md @@ -5,23 +5,23 @@ For example, `aws.py` provides functions to be used by its `Vendor` instance, ca Each file should provide the below functions: -- `get_compliance_frameworks`: Define `VendorComplianceLink` instances to describe which frameworks the vendor complies with. 
Optionally include references in the `comment` field. To avoid duplicating `ComplianceFramework` instances, easiest is to use the `compliance_framework_id` field instead of the `compliance_framework` relationship. -- `get_datacenters`: Define `Datacenter` instances with location, energy source etc for each region/datacenter the vendor has. -- `get_zones`: Define a `Zone` instance for each availability zone of the vendor in each datacenter. -- `get_servers`: Define `Server` instances for the vendor's server/instance types. -- `get_server_prices`: Define the `ServerPrice` instances for the standard/ondemand (or optionally also for the reserved) pricing of the instance types per datacenter and zone. -- `get_server_prices_spot`: Similar to the above, define `ServerPrice` instances but the `allocation` field set to `Allocation.SPOT`. Very likely to see different spot prices per datacenter/zone. -- `get_storage_prices`: Define `StoragePrice` instances to describe the available storage options that can be attached to the servers. -- `get_traffic_prices`: Define `TrafficPrice` instances to describe the pricing of ingress/egress traffic. -- `get_ipv4_prices`: Define `Ipv4Price` instances on the price of an IPv4 address. +- `inventory_compliance_frameworks`: Define `VendorComplianceLink` instances to describe which frameworks the vendor complies with. Optionally include references in the `comment` field. To avoid duplicating `ComplianceFramework` instances, easiest is to use the `compliance_framework_id` field instead of the `compliance_framework` relationship. +- `inventory_datacenters`: Define `Datacenter` instances with location, energy source etc for each region/datacenter the vendor has. +- `inventory_zones`: Define a `Zone` instance for each availability zone of the vendor in each datacenter. +- `inventory_servers`: Define `Server` instances for the vendor's server/instance types. 
+- `inventory_server_prices`: Define the `ServerPrice` instances for the standard/ondemand (or optionally also for the reserved) pricing of the instance types per datacenter and zone. +- `inventory_server_prices_spot`: Similar to the above, define `ServerPrice` instances but the `allocation` field set to `Allocation.SPOT`. Very likely to see different spot prices per datacenter/zone. +- `inventory_storage_prices`: Define `StoragePrice` instances to describe the available storage options that can be attached to the servers. +- `inventory_traffic_prices`: Define `TrafficPrice` instances to describe the pricing of ingress/egress traffic. +- `inventory_ipv4_prices`: Define `Ipv4Price` instances on the price of an IPv4 address. Each function will be picked up as the related `Vendor` instance's instance methods, so each function should take a single argument, that is the `Vendor` instance. No need to return the objects -- it's enough to define the above-mentioned instances. -If a helper is not needed (e.g. another helper already provides its output, or there are no spot prices), it is still required, but can return early, e.g. if `Zone` objects were populated by `get_datacenters` already, do something like: +If a helper is not needed (e.g. another helper already provides its output, or there are no spot prices), it is still required, but can return early, e.g. 
if `Zone` objects were populated by `inventory_datacenters` already, do something like: ```python -def get_zones(self): - """Zones were already provided in get_datacenters.""" +def inventory_zones(self): + """Zones were already provided in inventory_datacenters.""" pass ``` @@ -43,38 +43,38 @@ from ..schemas import ( ) -def get_compliance_frameworks(vendor): +def inventory_compliance_frameworks(vendor): pass -def get_datacenters(vendor): +def inventory_datacenters(vendor): pass -def get_zones(vendor): +def inventory_zones(vendor): pass -def get_servers(vendor): +def inventory_servers(vendor): pass -def get_server_prices(vendor): +def inventory_server_prices(vendor): pass -def get_server_prices_spot(vendor): +def inventory_server_prices_spot(vendor): pass -def get_storage_prices(vendor): +def inventory_storage_prices(vendor): pass -def get_traffic_prices(vendor): +def inventory_traffic_prices(vendor): pass -def get_ipv4_prices(vendor): +def inventory_ipv4_prices(vendor): pass ``` diff --git a/src/sc_crawler/vendors/aws.py b/src/sc_crawler/vendors/aws.py index a74d674a..3c84f599 100644 --- a/src/sc_crawler/vendors/aws.py +++ b/src/sc_crawler/vendors/aws.py @@ -353,7 +353,7 @@ def _make_price_from_product(product, vendor): # Public methods to fetch data -def get_compliance_frameworks(vendor): +def inventory_compliance_frameworks(vendor): for compliance_framework in ["hipaa", "soc2t2"]: VendorComplianceLink( vendor=vendor, @@ -361,7 +361,7 @@ def get_compliance_frameworks(vendor): ) -def get_datacenters(vendor): +def inventory_datacenters(vendor): """List all available AWS datacenters. 
Some data sources are not available from APIs, and were collected manually: @@ -679,7 +679,7 @@ def get_datacenters(vendor): datacenter.vendor.merge_dependent(datacenter) -def get_zones(vendor): +def inventory_zones(vendor): """List all available AWS availability zones.""" for datacenter in vendor.datacenters: if datacenter.status == "active": @@ -692,7 +692,7 @@ def get_zones(vendor): ) -def get_servers(vendor): +def inventory_servers(vendor): # TODO drop this in favor of pricing.get_products, as it has info e.g. on instanceFamily # although other fields are messier (e.g. extract memory from string) for datacenter in vendor.datacenters: @@ -700,7 +700,7 @@ def get_servers(vendor): _list_instance_types_of_region(datacenter.id, vendor) -def get_server_prices(vendor): +def inventory_server_prices(vendor): products = _boto_get_products( service_code="AmazonEC2", filters={ @@ -721,19 +721,19 @@ def get_server_prices(vendor): _make_price_from_product(product, vendor) -def get_server_prices_spot(vendor): +def inventory_server_prices_spot(vendor): pass -def get_storage_prices(vendor): +def inventory_storage_prices(vendor): pass -def get_traffic_prices(vendor): +def inventory_traffic_prices(vendor): pass -def get_ipv4_prices(vendor): +def inventory_ipv4_prices(vendor): products = _boto_get_products( service_code="AmazonVPC", filters={ diff --git a/src/sc_crawler/vendors/gcp.py b/src/sc_crawler/vendors/gcp.py index 2341b4fd..34a5afb5 100644 --- a/src/sc_crawler/vendors/gcp.py +++ b/src/sc_crawler/vendors/gcp.py @@ -1,34 +1,34 @@ -def get_compliance_frameworks(vendor): +def inventory_compliance_frameworks(vendor): pass -def get_datacenters(vendor): +def inventory_datacenters(vendor): pass -def get_zones(vendor): +def inventory_zones(vendor): pass -def get_servers(vendor): +def inventory_servers(vendor): pass -def get_server_prices(vendor): +def inventory_server_prices(vendor): pass -def get_server_prices_spot(vendor): +def inventory_server_prices_spot(vendor): pass -def 
get_storage_prices(vendor): +def inventory_storage_prices(vendor): pass -def get_traffic_prices(vendor): +def inventory_traffic_prices(vendor): pass -def get_ipv4_prices(vendor): +def inventory_ipv4_prices(vendor): pass