Merge remote-tracking branch 'origin/main'
bra-fsn committed Dec 29, 2023
2 parents 1e49b9b + 61075c4 commit a499d9b
Showing 4 changed files with 163 additions and 39 deletions.
12 changes: 11 additions & 1 deletion README.md
@@ -4,13 +4,23 @@ Examples:

```py
from sc_crawler.vendors import aws

# enable persistent caching of AWS queries
from cachier import set_default_params
set_default_params(caching_enabled=True)

# fetch data
aws.get_all() # slow to query all instance types in all regions

# look around
aws.datacenters
aws.zones

# pretty printed objects
from rich import print as pp
pp(aws)
pp(aws._datacenters[1]._zones)
pp(aws._servers.get("t3a.2xlarge"))
pp(aws._servers.get("c5d.large"))
pp(aws._servers.get("i3en.12xlarge"))
pp(aws._servers.get("g4dn.metal"))
```
1 change: 1 addition & 0 deletions pyproject.toml
@@ -7,6 +7,7 @@ name = "sc-crawler"
version = "0.0.1"
requires-python = ">= 3.7"
dependencies = [
"cachier",
"pydantic",
"pydantic_extra_types",
"pycountry",
54 changes: 39 additions & 15 deletions sc_crawler/schemas.py
@@ -138,37 +138,61 @@ def vendor(self) -> Vendor:
return self.datacenter.vendor


resource_types = Literal["compute", "traffic", "storage"]


class Resource(BaseModel):
# vendor-specific resources (e.g. instance types) should be
# prefixed with the vendor id, e.g. "aws:m5.xlarge"
identifier: str
name: str
description: Optional[str]
kind: Literal["compute", "traffic", "storage"]
resource_type: resource_types
billable_unit: str # e.g. GB, GiB, TB, runtime hours


storage_types = Literal["hdd", "ssd", "nvme ssd", "network"]


class Storage(BaseModel):
size: int = 0 # GB
storage_type: storage_types


class NetworkStorage(Resource, Storage):
resource_type: resource_types = "storage"
storage_type: storage_types = "network"
max_iops: Optional[int] = None
max_throughput: Optional[int] = None # MiB/s
min_size: Optional[int] = None # GiB
max_size: Optional[int] = None # GiB
billable_unit: str = "GiB"


class Gpu(BaseModel):
manufacturer: str
name: str
memory: int # MiB
firmware: Optional[str] = None


class Server(Resource):
kind: str = "compute"
resource_type: resource_types = "compute"
vcpus: int
cores: int
memory: int
gpu_count: int = 0
gpu_memory: Optional[int] = None # MiB
gpu_name: Optional[str] = None
gpus: List[Gpu] = []
storage_size: int = 0 # GB
storage_type: Optional[Literal["ssd", "hdd"]]
storage_type: Optional[storage_types]
storages: List[Storage] = []
network_speed: Optional[str]


class Storage(Resource):
kind: str = "storage"
max_iops: Optional[int]
max_throughput: Optional[int] # MiB/s
min_size: Optional[int] # GiB
max_size: Optional[int] # GiB
billable_unit: str = "GiB"


class Traffic(Resource):
kind: str = "traffic"
resource_type: resource_types = "traffic"
direction: Literal["inbound", "outbound"]
billable_unit: str = "GB"

@@ -194,5 +218,5 @@ class ComplianceFramework(BaseModel):
homepage: Optional[HttpUrl] = None


Vendor.update_forward_refs()
Datacenter.update_forward_refs()
Vendor.model_rebuild()
Datacenter.model_rebuild()
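
For reference, a minimal sketch of how the reworked models compose (assuming the classes import cleanly from sc_crawler.schemas; all field values below are hypothetical):

```py
from rich import print as pp
from sc_crawler.schemas import Gpu, Server, Storage

# hypothetical values for a GPU-backed instance type
server = Server(
    identifier="aws:g4dn.xlarge",
    name="g4dn.xlarge",
    description=None,
    billable_unit="runtime hours",
    vcpus=4,
    cores=2,
    memory=16384,  # MiB
    gpu_count=1,
    gpu_memory=16384,  # MiB
    gpu_name="NVIDIA T4",
    gpus=[Gpu(manufacturer="NVIDIA", name="T4", memory=16384)],
    storage_size=125,  # GB
    storage_type="nvme ssd",
    storages=[Storage(size=125, storage_type="nvme ssd")],
    network_speed="Up to 25 Gigabit",
)
pp(server)
```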
135 changes: 112 additions & 23 deletions sc_crawler/vendors/aws.py
@@ -1,13 +1,48 @@
import boto3
from functools import cache
from cachier import cachier, set_default_params
from datetime import timedelta
from itertools import chain
import logging
import re

from .. import Location
from ..schemas import Datacenter, Zone, Server
from ..schemas import Datacenter, Zone, Server, Storage, Gpu

logger = logging.getLogger(__name__)

# disable caching by default
set_default_params(caching_enabled=False)

# ##############################################################################
# AWS cached helpers


@cachier(stale_after=timedelta(days=3))
def describe_instance_types(region):
ec2 = boto3.client("ec2", region_name=region)
return ec2.describe_instance_types().get("InstanceTypes")


@cachier(stale_after=timedelta(days=3))
def describe_regions():
ec2 = boto3.client("ec2")
return ec2.describe_regions().get("Regions", [])


@cachier(stale_after=timedelta(days=3))
def describe_availability_zones(region):
ec2 = boto3.client("ec2", region_name=region)
zones = ec2.describe_availability_zones(
Filters=[
{"Name": "zone-type", "Values": ["availability-zone"]},
],
AllAvailabilityZones=True,
).get("AvailabilityZones")
return zones


# ##############################################################################
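
Because these helpers are wrapped with @cachier(stale_after=timedelta(days=3)), repeated calls should be served from cachier's on-disk cache instead of re-querying the EC2 API until the 3-day window expires. A rough sketch of the intended flow (assumes AWS credentials are configured; caching is off by default as set above):

```py
from cachier import set_default_params

set_default_params(caching_enabled=True)  # opt in, e.g. from user code

regions = describe_regions()  # first call queries the EC2 API
regions = describe_regions()  # later calls read the local cache
```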


def get_datacenters(vendor, *args, **kwargs):
"""List all available AWS datacenters.
@@ -271,8 +306,7 @@ def get_datacenters(vendor, *args, **kwargs):

# look for undocumented (new) datacenters in AWS
supported_regions = [d.identifier for d in datacenters]
ec2 = boto3.client("ec2")
regions = ec2.describe_regions().get("Regions", [])
regions = describe_regions()
for region in regions:
region_name = region.get("RegionName")
if "gov" in region_name:
@@ -292,14 +326,7 @@

# add zones
for datacenter in datacenters:
        # need to create a new client in each AWS region
ec2 = boto3.client("ec2", region_name=datacenter.identifier)
zones = ec2.describe_availability_zones(
Filters=[
{"Name": "zone-type", "Values": ["availability-zone"]},
],
AllAvailabilityZones=True,
).get("AvailabilityZones")
zones = describe_availability_zones(datacenter.identifier)
datacenter._zones = {
zone.get("ZoneId"): Zone(
identifier=zone.get("ZoneId"),
@@ -312,12 +339,6 @@
return datacenters


@cache
def describe_instance_types(region):
ec2 = boto3.client("ec2", region_name=region)
return ec2.describe_instance_types().get("InstanceTypes")


instance_families = {
"c": "Compute optimized",
"d": "Dense storage",
@@ -379,16 +400,77 @@ def annotate_instance_type(instance_type_id):
return text


def get_storage(instance_type):
"""Get storage size and type (tupple) from instance details."""
def get_storage(instance_type, nvme=False):
"""Get overall storage size and type (tupple) from instance details."""
if "InstanceStorageInfo" not in instance_type:
return (0, None)
info = instance_type.get("InstanceStorageInfo")
storage_size = info.get("TotalSizeInGB", 0) * 1024 * 1024
storage_size = info.get("TotalSizeInGB", 0)
storage_type = info.get("Disks")[0].get("Type").lower()
if storage_type == "ssd" and info.get("NvmeSupport", False):
storage_type = "nvme ssd"
return (storage_size, storage_type)
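
As an illustration, feeding get_storage() a trimmed, hypothetical entry from EC2's DescribeInstanceTypes response:

```py
instance_type = {
    "InstanceStorageInfo": {
        "TotalSizeInGB": 1900,
        "NvmeSupport": "required",
        "Disks": [{"SizeInGB": 950, "Count": 2, "Type": "ssd"}],
    }
}
get_storage(instance_type)  # (1900, 'nvme ssd')
```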


def array_expand_by_count(array):
"""Expand an array with its items Count field."""
array = [[a] * a.get("Count") for a in array]
return list(chain(*array))
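
For example, a record with a Count of 2 is simply repeated twice in the flattened result:

```py
array_expand_by_count([{"SizeInGB": 950, "Count": 2, "Type": "ssd"}])
# [{'SizeInGB': 950, 'Count': 2, 'Type': 'ssd'},
#  {'SizeInGB': 950, 'Count': 2, 'Type': 'ssd'}]
```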


def get_storages(instance_type):
"""Get individual storages as an array."""
if "InstanceStorageInfo" not in instance_type:
return []
info = instance_type.get("InstanceStorageInfo")

def to_storage(disk, nvme=False):
kind = disk.get("Type").lower()
if kind == "ssd" and nvme:
kind = "nvme ssd"
return Storage(size=disk.get("SizeInGB"), storage_type=kind)

# replicate number of disks
disks = info.get("Disks")
disks = array_expand_by_count(disks)
return [to_storage(disk, nvme=info.get("NvmeSupport", False)) for disk in disks]


def get_gpu(instance_type):
"""Get overall GPU count, memory and manufacturer/name."""
if "GpuInfo" not in instance_type:
return (0, None, None)
info = instance_type.get("GpuInfo")
memory = info.get("TotalGpuMemoryInMiB", 0)

def mn(gpu):
return gpu.get("Manufacturer") + " " + gpu.get("Name")

# iterate over each GPU
count = sum([gpu.get("Count") for gpu in info.get("Gpus")])
names = ", ".join([mn(gpu) for gpu in info.get("Gpus")])
return (count, memory, names)


def get_gpus(instance_type):
"""Get individual GPUs as an array."""
if "GpuInfo" not in instance_type:
return []
info = instance_type.get("GpuInfo")

def to_gpu(gpu):
return Gpu(
manufacturer=gpu.get("Manufacturer"),
name=gpu.get("Name"),
memory=gpu.get("MemoryInfo").get("SizeInMiB"),
)

    # replicate number of GPUs
gpus = info.get("Gpus")
gpus = array_expand_by_count(gpus)
return [to_gpu(gpu) for gpu in gpus]
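
A sketch of the two GPU helpers on a trimmed, hypothetical GpuInfo record:

```py
instance_type = {
    "GpuInfo": {
        "TotalGpuMemoryInMiB": 32768,
        "Gpus": [
            {
                "Manufacturer": "NVIDIA",
                "Name": "T4",
                "Count": 2,
                "MemoryInfo": {"SizeInMiB": 16384},
            }
        ],
    }
}
get_gpu(instance_type)   # (2, 32768, 'NVIDIA T4')
get_gpus(instance_type)  # two Gpu objects, one per physical card
```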


def get_instance_types(vendor, *args, **kwargs):
if not hasattr(vendor, "_datacenters"):
raise AttributeError("Datacenters not defined, run get_datacenters()")
@@ -401,6 +483,8 @@ def get_instance_types(vendor, *args, **kwargs):
for instance_type in local_instance_types:
it = instance_type.get("InstanceType")
if it not in list(instance_types.keys()):
gpu_info = get_gpu(instance_type)
storage_info = get_storage(instance_type)
instance_types.update(
{
it: Server(
@@ -410,8 +494,13 @@
vcpus=instance_type.get("VCpuInfo").get("DefaultVCpus"),
cores=instance_type.get("VCpuInfo").get("DefaultCores"),
memory=instance_type.get("MemoryInfo").get("SizeInMiB"),
storage_size=get_storage(instance_type)[0],
storage_type=get_storage(instance_type)[1],
gpu_count=gpu_info[0],
gpu_memory=gpu_info[1],
gpu_name=gpu_info[2],
gpus=get_gpus(instance_type),
storage_size=storage_info[0],
storage_type=storage_info[1],
storages=get_storages(instance_type),
network_speed=instance_type.get("NetworkInfo").get(
"NetworkPerformance"
),
