From af2b346990758a8bf314f4ad2aa8f51861639498 Mon Sep 17 00:00:00 2001 From: "Gergely Daroczi (@daroczig)" Date: Sun, 18 Feb 2024 23:00:12 +0100 Subject: [PATCH 01/12] standardize table names using snake_case --- src/sc_crawler/schemas.py | 23 ++++++++++++++++------- src/sc_crawler/str.py | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 7 deletions(-) create mode 100644 src/sc_crawler/str.py diff --git a/src/sc_crawler/schemas.py b/src/sc_crawler/schemas.py index ea0014a2..3a669cea 100644 --- a/src/sc_crawler/schemas.py +++ b/src/sc_crawler/schemas.py @@ -15,14 +15,28 @@ model_validator, ) from sqlalchemy.inspection import inspect +from sqlalchemy.orm import declared_attr # TODO SQLModel does NOT actually do pydantic validations # https://github.com/tiangolo/sqlmodel/issues/52 from sqlmodel import JSON, Column, Field, Relationship, SQLModel, select +from .str import snake_case + + class ScModel(SQLModel): - """Custom extension to SQLModel to support hashing tables.""" + """Custom extension to SQLModel. + + Extra features: + - support for hashing table rows, + - auto-generated table names using snake_case. 
+ """ + + @declared_attr # type: ignore + def __tablename__(cls) -> str: + """Generate tables names using all-lowercase snake_case.""" + return snake_case(cls.__name__) @classmethod def get_table_name(cls) -> str: @@ -72,10 +86,9 @@ class Country(ScModel, table=True): class VendorComplianceLink(ScModel, table=True): - __tablename__: str = "vendor_compliance_link" # type: ignore vendor_id: str = Field(foreign_key="vendor.id", primary_key=True) compliance_framework_id: str = Field( - foreign_key="complianceframework.id", primary_key=True + foreign_key="compliance_framework.id", primary_key=True ) comment: Optional[str] = None @@ -240,8 +253,6 @@ class StorageType(str, Enum): class AddonStorage(ScModel, table=True): - __tablename__: str = "addon_storage" # type: ignore - id: str = Field(primary_key=True) vendor_id: str = Field(foreign_key="vendor.id", primary_key=True) name: str @@ -265,8 +276,6 @@ class TrafficDirection(str, Enum): class AddonTraffic(ScModel, table=True): - __tablename__: str = "addon_traffic" # type: ignore - id: str = Field(primary_key=True) vendor_id: str = Field(foreign_key="vendor.id", primary_key=True) name: str diff --git a/src/sc_crawler/str.py b/src/sc_crawler/str.py new file mode 100644 index 00000000..375e760d --- /dev/null +++ b/src/sc_crawler/str.py @@ -0,0 +1,32 @@ +from re import search, sub + + +# https://www.w3resource.com/python-exercises/string/python-data-type-string-exercise-97.php +def snake_case(text): + """Convert CamelCase to snake_case. + + Examples: + >>> snake_case('DescriptionToComment') + 'description_to_comment' + """ + return "_".join(sub("([A-Z][a-z]+)", r" \1", text).split()).lower() + + +# https://www.tutorialspoint.com/python-program-to-convert-singular-to-plural +def plural(text): + """Super basic implementation of pluralizing an English word. + + Note that grammar exceptions are not handled, so better to use a + proper NLP method for real use-cases. 
+ + Examples: + >>> plural('dog') + 'dogs' + >>> plural('boy') # :facepalm: + 'boies' + """ + if search("[sxz]$", text) or search("[^aeioudgkprt]h$", text): + return sub("$", "es", text) + if search("[aeiou]y$", text): + return sub("y$", "ies", text) + return text + "s" From 129e1eeb23a742356bebb5bc348bbc0609a63971 Mon Sep 17 00:00:00 2001 From: "Gergely Daroczi (@daroczig)" Date: Sun, 18 Feb 2024 23:06:03 +0100 Subject: [PATCH 02/12] return str --- src/sc_crawler/schemas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sc_crawler/schemas.py b/src/sc_crawler/schemas.py index 3a669cea..246edfbb 100644 --- a/src/sc_crawler/schemas.py +++ b/src/sc_crawler/schemas.py @@ -41,7 +41,7 @@ def __tablename__(cls) -> str: @classmethod def get_table_name(cls) -> str: """Return the SQLModel object's table name.""" - return cls.__tablename__ + return str(cls.__tablename__) @classmethod def hash(cls, session, ignored: List[str] = ["inserted_at"]) -> dict: From ed575d5550caef9918c5aeabe71878ae6ec2ad55 Mon Sep 17 00:00:00 2001 From: "Gergely Daroczi (@daroczig)" Date: Mon, 19 Feb 2024 00:01:44 +0100 Subject: [PATCH 03/12] reuse pydantic descriptions for sqlalchemy comments --- src/sc_crawler/schemas.py | 118 ++++++++++++++++++++------------------ 1 file changed, 63 insertions(+), 55 deletions(-) diff --git a/src/sc_crawler/schemas.py b/src/sc_crawler/schemas.py index 246edfbb..09ed1bc0 100644 --- a/src/sc_crawler/schemas.py +++ b/src/sc_crawler/schemas.py @@ -24,13 +24,38 @@ from .str import snake_case +class ReuseDescriptions(SQLModel.__class__): + """Reuse description of the table and its fields as SQL comment. -class ScModel(SQLModel): - """Custom extension to SQLModel. + Checking if the table and its fields have explicit comment set to + be shown in the `CREATE TABLE` statements, and if not, reuse the + optional table and field descriptions. Table docstrings are + truncated to first line. 
+ """ + + def __init__(subclass, *args, **kwargs): + super().__init__(*args, **kwargs) + # early return for non-tables + if subclass.model_config.get("table") is None: + return + # table comment + satable = subclass.metadata.tables[subclass.__tablename__] + if subclass.__doc__ and satable.comment is None: + satable.comment = subclass.__doc__.splitlines()[0] + # column comments + for k, v in subclass.__fields__.items(): + comment = satable.columns[k].comment + if v.description and comment is None: + satable.columns[k].comment = v.description + + +class ScModel(SQLModel, metaclass=ReuseDescriptions): + """Custom extensions to SQLModel objects and tables. Extra features: + - auto-generated table names using snake_case, - support for hashing table rows, - - auto-generated table names using snake_case. + - reuse description field of tables/columns as SQL comment. """ @declared_attr # type: ignore @@ -73,13 +98,14 @@ class Status(str, Enum): class Country(ScModel, table=True): - __table_args__ = {"comment": "Country and continent mapping."} + """Country and continent mapping.""" + id: str = Field( default=None, primary_key=True, - sa_column_kwargs={"comment": "Country code by ISO 3166 alpha-2."}, + description="Country code by ISO 3166 alpha-2.", ) - continent: str = Field(sa_column_kwargs={"comment": "Continent name."}) + continent: str = Field(description="Continent name.") vendors: List["Vendor"] = Relationship(back_populates="country") datacenters: List["Datacenter"] = Relationship(back_populates="country") @@ -115,7 +141,7 @@ class ComplianceFramework(ScModel, table=True): class Vendor(ScModel, table=True): - """Base class for cloud compute resource vendors. + """Compute resource vendors, such as cloud and server providers. 
Examples: >>> from sc_crawler.schemas import Vendor @@ -309,51 +335,43 @@ class CpuArchitecture(str, Enum): class Server(ScModel, table=True): - __table_args__ = {"comment": "Server types."} + """Server types.""" + id: str = Field( primary_key=True, - sa_column_kwargs={"comment": "Server identifier, as called at the vendor."}, + description="Server identifier, as called at the vendor.", ) vendor_id: str = Field( foreign_key="vendor.id", primary_key=True, - sa_column_kwargs={"comment": "Vendor reference."}, + description="Vendor reference.", ) name: str = Field( default=None, - sa_column_kwargs={ - "comment": "Human-friendly name or short description of the server." - }, + description="Human-friendly name or short description of the server.", ) vcpus: int = Field( default=None, - sa_column_kwargs={ - "comment": "Default number of virtual CPUs (vCPU) of the server." - }, + description="Default number of virtual CPUs (vCPU) of the server.", ) # TODO join all below cpu fields into a Cpu object? cpu_cores: int = Field( default=None, - sa_column_kwargs={ - "comment": ( - "Default number of CPU cores of the server. " - "Equals to vCPUs when HyperThreading is disabled." - ) - }, + description=( + "Default number of CPU cores of the server. " + "Equals to vCPUs when HyperThreading is disabled." + ), ) cpu_speed: Optional[float] = Field( - default=None, - sa_column_kwargs={"comment": "CPU clock speed (GHz)."}, + default=None, description="CPU clock speed (GHz)." ) cpu_architecture: CpuArchitecture = Field( default=None, - sa_column_kwargs={ - "comment": "CPU Architecture (arm64, arm64_mac, i386, or x86_64)." - }, + description="CPU Architecture (arm64, arm64_mac, i386, or x86_64).", ) cpu_manufacturer: Optional[str] = Field( default=None, - sa_column_kwargs={"comment": "The manufacturer of the processor."}, + description="The manufacturer of the processor.", ) # TODO add the below extra fields # cpu_features: # e.g. 
AVX; AVX2; AMD Turbo @@ -361,67 +379,57 @@ class Server(ScModel, table=True): # cpu_name: str # e.g. EPYC 7571 memory: int = Field( default=None, - sa_column_kwargs={"comment": "RAM amount (MiB)."}, + description="RAM amount (MiB).", ) gpu_count: int = Field( default=0, - sa_column_kwargs={"comment": "Number of GPU accelerator(s)."}, + description="Number of GPU accelerator(s).", ) # TODO sum and avg/each memory gpu_memory: Optional[int] = Field( default=None, - sa_column_kwargs={ - "comment": "Overall memory (MiB) available to all the GPU accelerator(s)." - }, + description="Overall memory (MiB) available to all the GPU accelerator(s).", ) gpu_name: Optional[str] = Field( default=None, - sa_column_kwargs={ - "comment": "The manufacturer and the name of the GPU accelerator(s)." - }, + description="The manufacturer and the name of the GPU accelerator(s).", ) gpus: List[Gpu] = Field( default=[], - sa_column=Column( - JSON, - comment=( - "JSON array of GPU accelerator details, including " - "the manufacturer, name, and memory (MiB) of each GPU." - ), + sa_column=Column(JSON), + description=( + "JSON array of GPU accelerator details, including " + "the manufacturer, name, and memory (MiB) of each GPU." ), ) storage_size: int = Field( default=0, - sa_column_kwargs={"comment": "Overall size (GB) of the disk(s)."}, + description="Overall size (GB) of the disk(s).", ) storage_type: Optional[StorageType] = Field( default=None, - sa_column_kwargs={"comment": "Disk type (hdd, ssd, nvme ssd, or network)."}, + description="Disk type (hdd, ssd, nvme ssd, or network).", ) storages: List[Storage] = Field( default=[], - sa_column=Column( - JSON, - comment=( - "JSON array of disks attached to the server, including " - "the size (MiB) and type of each disk." - ), + sa_column=Column(JSON), + description=( + "JSON array of disks attached to the server, including " + "the size (MiB) and type of each disk." 
), ) network_speed: Optional[float] = Field( default=None, - sa_column_kwargs={ - "comment": "The baseline network performance (Gbps) of the network card." - }, + description="The baseline network performance (Gbps) of the network card.", ) billable_unit: str = Field( default=None, - sa_column_kwargs={"comment": "Time period for billing, e.g. hour or month."}, + description="Time period for billing, e.g. hour or month.", ) status: Status = Field( default=Status.ACTIVE, - sa_column_kwargs={"comment": "Status of the resource (active or inactive)."}, + description="Status of the resource (active or inactive).", ) vendor: Vendor = Relationship(back_populates="servers") From 400cf7655d17c22a26b37307277082d371c6a686 Mon Sep 17 00:00:00 2001 From: "Gergely Daroczi (@daroczig)" Date: Mon, 19 Feb 2024 00:03:16 +0100 Subject: [PATCH 04/12] __fields__ deprecated --- src/sc_crawler/schemas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sc_crawler/schemas.py b/src/sc_crawler/schemas.py index 09ed1bc0..ea278797 100644 --- a/src/sc_crawler/schemas.py +++ b/src/sc_crawler/schemas.py @@ -43,7 +43,7 @@ def __init__(subclass, *args, **kwargs): if subclass.__doc__ and satable.comment is None: satable.comment = subclass.__doc__.splitlines()[0] # column comments - for k, v in subclass.__fields__.items(): + for k, v in subclass.model_fields.items(): comment = satable.columns[k].comment if v.description and comment is None: satable.columns[k].comment = v.description From ad19e699c8838e1a037381c9c19e3909868b9c49 Mon Sep 17 00:00:00 2001 From: "Gergely Daroczi (@daroczig)" Date: Mon, 19 Feb 2024 13:39:26 +0100 Subject: [PATCH 05/12] record ipv4 for servers --- src/sc_crawler/schemas.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/sc_crawler/schemas.py b/src/sc_crawler/schemas.py index ea278797..fc03046b 100644 --- a/src/sc_crawler/schemas.py +++ b/src/sc_crawler/schemas.py @@ -422,6 +422,7 @@ class Server(ScModel, table=True): default=None, 
description="The baseline network performance (Gbps) of the network card.", ) + ipv4: bool = Field(default=False, description="Complimentary IPv4 address.") billable_unit: str = Field( default=None, From 5c947dbd99ab803abd70cb369198b7381f1744ee Mon Sep 17 00:00:00 2001 From: "Gergely Daroczi (@daroczig)" Date: Mon, 19 Feb 2024 13:44:31 +0100 Subject: [PATCH 06/12] drop addon prefix --- src/sc_crawler/schemas.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/src/sc_crawler/schemas.py b/src/sc_crawler/schemas.py index fc03046b..462b421a 100644 --- a/src/sc_crawler/schemas.py +++ b/src/sc_crawler/schemas.py @@ -187,15 +187,16 @@ class Vendor(ScModel, table=True): country: Country = Relationship(back_populates="vendors") datacenters: List["Datacenter"] = Relationship(back_populates="vendor") zones: List["Zone"] = Relationship(back_populates="vendor") - addon_storages: List["AddonStorage"] = Relationship(back_populates="vendor") - addon_traffics: List["AddonTraffic"] = Relationship(back_populates="vendor") + storages: List["Storage"] = Relationship(back_populates="vendor") + traffics: List["Traffic"] = Relationship(back_populates="vendor") servers: List["Server"] = Relationship(back_populates="vendor") prices: List["Price"] = Relationship(back_populates="vendor") def __init__(self, **kwargs): super().__init__(**kwargs) try: - # TODO SQLModel does not validates pydantic typing + # SQLModel does not validates pydantic typing, + # only when writing to DB (much later in the process) if not self.id: raise ValueError("No vendor id provided") if not self.name: @@ -278,7 +279,7 @@ class StorageType(str, Enum): NETWORK = "network" -class AddonStorage(ScModel, table=True): +class Storage(ScModel, table=True): id: str = Field(primary_key=True) vendor_id: str = Field(foreign_key="vendor.id", primary_key=True) name: str @@ -292,7 +293,7 @@ class AddonStorage(ScModel, table=True): billable_unit: str = "GiB" status: Status = 
Status.ACTIVE - vendor: Vendor = Relationship(back_populates="addon_storages") + vendor: Vendor = Relationship(back_populates="storages") prices: List["Price"] = Relationship(back_populates="storage") @@ -301,7 +302,7 @@ class TrafficDirection(str, Enum): OUT = "outbound" -class AddonTraffic(ScModel, table=True): +class Traffic(ScModel, table=True): id: str = Field(primary_key=True) vendor_id: str = Field(foreign_key="vendor.id", primary_key=True) name: str @@ -310,7 +311,7 @@ class AddonTraffic(ScModel, table=True): billable_unit: str = "GB" status: Status = Status.ACTIVE - vendor: Vendor = Relationship(back_populates="addon_traffics") + vendor: Vendor = Relationship(back_populates="traffics") prices: List["Price"] = Relationship(back_populates="traffic") @@ -321,7 +322,7 @@ class Gpu(Json): firmware: Optional[str] = None -class Storage(Json): +class Disk(Json): size: int = 0 # GiB storage_type: StorageType @@ -410,7 +411,7 @@ class Server(ScModel, table=True): default=None, description="Disk type (hdd, ssd, nvme ssd, or network).", ) - storages: List[Storage] = Field( + storages: List[Disk] = Field( default=[], sa_column=Column(JSON), description=( @@ -460,8 +461,8 @@ class Price(ScModel, table=True): datacenter_id: Optional[str] = Field(default=None, foreign_key="datacenter.id") zone_id: Optional[str] = Field(default=None, foreign_key="zone.id") server_id: Optional[str] = Field(default=None, foreign_key="server.id") - traffic_id: Optional[str] = Field(default=None, foreign_key="addon_traffic.id") - storage_id: Optional[str] = Field(default=None, foreign_key="addon_storage.id") + traffic_id: Optional[str] = Field(default=None, foreign_key="traffic.id") + storage_id: Optional[str] = Field(default=None, foreign_key="storage.id") allocation: Allocation = Allocation.ONDEMAND price: float # max price if tiered # e.g. 
setup fee for dedicated servers, or upfront costs of a reserved instance type @@ -474,8 +475,8 @@ class Price(ScModel, table=True): datacenter: Datacenter = Relationship(back_populates="prices") zone: Zone = Relationship(back_populates="prices") server: Server = Relationship(back_populates="prices") - traffic: AddonTraffic = Relationship(back_populates="prices") - storage: AddonStorage = Relationship(back_populates="prices") + traffic: Traffic = Relationship(back_populates="prices") + storage: Storage = Relationship(back_populates="prices") @model_validator(mode="after") def server_or_traffic_or_storage(self) -> "Price": From 99a9b51508efcb42c4ed5efd41264953ff22c4f0 Mon Sep 17 00:00:00 2001 From: "Gergely Daroczi (@daroczig)" Date: Mon, 19 Feb 2024 13:45:49 +0100 Subject: [PATCH 07/12] fix storage->disk --- src/sc_crawler/vendors/aws.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sc_crawler/vendors/aws.py b/src/sc_crawler/vendors/aws.py index bb90e384..eb0df100 100644 --- a/src/sc_crawler/vendors/aws.py +++ b/src/sc_crawler/vendors/aws.py @@ -9,7 +9,7 @@ from ..logger import logger from ..lookup import countries -from ..schemas import Datacenter, Gpu, Price, Server, Storage, Zone +from ..schemas import Datacenter, Gpu, Price, Server, Disk, Zone # disable caching by default set_default_params(caching_enabled=False, stale_after=timedelta(days=1)) @@ -540,7 +540,7 @@ def to_storage(disk, nvme=False): kind = disk.get("Type").lower() if kind == "ssd" and nvme: kind = "nvme ssd" - return Storage(size=disk["SizeInGB"], storage_type=kind) + return Disk(size=disk["SizeInGB"], storage_type=kind) # replicate number of disks disks = info["Disks"] From d1778a53138564083c329e51d7784f0e55ab4910 Mon Sep 17 00:00:00 2001 From: "Gergely Daroczi (@daroczig)" Date: Mon, 19 Feb 2024 21:52:40 +0100 Subject: [PATCH 08/12] split Price table using helper data classes --- src/sc_crawler/schemas.py | 126 ++++++++++++++++++++++------------ 
src/sc_crawler/vendors/aws.py | 5 +- 2 files changed, 87 insertions(+), 44 deletions(-) diff --git a/src/sc_crawler/schemas.py b/src/sc_crawler/schemas.py index 462b421a..e7940435 100644 --- a/src/sc_crawler/schemas.py +++ b/src/sc_crawler/schemas.py @@ -12,13 +12,9 @@ BaseModel, ImportString, PrivateAttr, - model_validator, ) from sqlalchemy.inspection import inspect from sqlalchemy.orm import declared_attr - -# TODO SQLModel does NOT actually do pydantic validations -# https://github.com/tiangolo/sqlmodel/issues/52 from sqlmodel import JSON, Column, Field, Relationship, SQLModel, select from .str import snake_case @@ -130,7 +126,7 @@ class ComplianceFramework(ScModel, table=True): abbreviation: Optional[str] description: Optional[str] # TODO HttpUrl not supported by SQLModel - # TODO upload to cdn.sparecores.com + # TODO upload to cdn.sparecores.com (s3/cloudfront) logo: Optional[str] = None # TODO HttpUrl not supported by SQLModel homepage: Optional[str] = None @@ -190,7 +186,9 @@ class Vendor(ScModel, table=True): storages: List["Storage"] = Relationship(back_populates="vendor") traffics: List["Traffic"] = Relationship(back_populates="vendor") servers: List["Server"] = Relationship(back_populates="vendor") - prices: List["Price"] = Relationship(back_populates="vendor") + server_prices: List["ServerPrice"] = Relationship(back_populates="vendor") + traffic_prices: List["TrafficPrice"] = Relationship(back_populates="vendor") + storage_prices: List["StoragePrice"] = Relationship(back_populates="vendor") def __init__(self, **kwargs): super().__init__(**kwargs) @@ -256,7 +254,9 @@ class Datacenter(ScModel, table=True): # relations country: Country = Relationship(back_populates="datacenters") zones: List["Zone"] = Relationship(back_populates="datacenter") - prices: List["Price"] = Relationship(back_populates="datacenter") + server_prices: List["ServerPrice"] = Relationship(back_populates="datacenter") + traffic_prices: List["TrafficPrice"] = 
Relationship(back_populates="datacenter") + storage_prices: List["StoragePrice"] = Relationship(back_populates="datacenter") class Zone(ScModel, table=True): @@ -269,7 +269,9 @@ class Zone(ScModel, table=True): # relations datacenter: Datacenter = Relationship(back_populates="zones") vendor: Vendor = Relationship(back_populates="zones") - prices: List["Price"] = Relationship(back_populates="zone") + server_prices: List["ServerPrice"] = Relationship(back_populates="zone") + traffic_prices: List["TrafficPrice"] = Relationship(back_populates="zone") + storage_prices: List["StoragePrice"] = Relationship(back_populates="zone") class StorageType(str, Enum): @@ -294,7 +296,7 @@ class Storage(ScModel, table=True): status: Status = Status.ACTIVE vendor: Vendor = Relationship(back_populates="storages") - prices: List["Price"] = Relationship(back_populates="storage") + prices: List["StoragePrice"] = Relationship(back_populates="storage") class TrafficDirection(str, Enum): @@ -312,7 +314,7 @@ class Traffic(ScModel, table=True): status: Status = Status.ACTIVE vendor: Vendor = Relationship(back_populates="traffics") - prices: List["Price"] = Relationship(back_populates="traffic") + prices: List["TrafficPrice"] = Relationship(back_populates="traffic") class Gpu(Json): @@ -397,7 +399,7 @@ class Server(ScModel, table=True): ) gpus: List[Gpu] = Field( default=[], - sa_column=Column(JSON), + sa_type=JSON, description=( "JSON array of GPU accelerator details, including " "the manufacturer, name, and memory (MiB) of each GPU." @@ -413,7 +415,7 @@ class Server(ScModel, table=True): ) storages: List[Disk] = Field( default=[], - sa_column=Column(JSON), + sa_type=JSON, description=( "JSON array of disks attached to the server, including " "the size (MiB) and type of each disk." 
@@ -435,7 +437,7 @@ class Server(ScModel, table=True): ) vendor: Vendor = Relationship(back_populates="servers") - prices: List["Price"] = Relationship(back_populates="server") + prices: List["ServerPrice"] = Relationship(back_populates="server") class Allocation(str, Enum): @@ -444,47 +446,87 @@ class Allocation(str, Enum): SPOT = "spot" +class Duration(str, Enum): + YEAR = "year" + MONTH = "month" + HOUR = "hour" + + class PriceTier(Json): lower: float upper: float price: float -class Price(ScModel, table=True): - ## TODO add ipv4 pricing - ## TODO created_at - id: int = Field(primary_key=True) - vendor_id: str = Field(foreign_key="vendor.id") - # a resource might be available in all or only in one/few - # datacenters and zones e.g. incoming traffic is priced per - # datacenter, but sport instance price per zone - datacenter_id: Optional[str] = Field(default=None, foreign_key="datacenter.id") - zone_id: Optional[str] = Field(default=None, foreign_key="zone.id") - server_id: Optional[str] = Field(default=None, foreign_key="server.id") - traffic_id: Optional[str] = Field(default=None, foreign_key="traffic.id") - storage_id: Optional[str] = Field(default=None, foreign_key="storage.id") - allocation: Allocation = Allocation.ONDEMAND - price: float # max price if tiered - # e.g. 
setup fee for dedicated servers, or upfront costs of a reserved instance type +# helper classes to inherit for most commonly used fields + + +class HasVendor(ScModel): + vendor_id: str = Field(foreign_key="vendor.id", primary_key=True) + + +class HasVendorOptionalDatacenterZone(HasVendor): + datacenter_id: str = Field( + default="", foreign_key="datacenter.id", primary_key=True + ) + zone_id: str = Field(default="", foreign_key="zone.id", primary_key=True) + + +class HasServer(ScModel): + server_id: str = Field(foreign_key="server.id", primary_key=True) + + +class HasStorage(ScModel): + storage_id: str = Field(foreign_key="storage.id", primary_key=True) + + +class HasTraffic(ScModel): + traffic_id: str = Field(foreign_key="traffic.id", primary_key=True) + + +class HasPriceFields(ScModel): + # set to max price if tiered + price: float + # e.g. setup fee for dedicated servers, + # or upfront costs of a reserved instance type price_upfront: float = 0 - # TODO needs time interval as well and other complications .. maybe skip for now? 
- price_tiered: List[PriceTier] = Field(default=[], sa_column=Column(JSON)) + price_tiered: List[PriceTier] = Field(default=[], sa_type=JSON) currency: str = "USD" + duration: Duration + - vendor: Vendor = Relationship(back_populates="prices") - datacenter: Datacenter = Relationship(back_populates="prices") - zone: Zone = Relationship(back_populates="prices") +class ServerPriceBase(HasPriceFields, HasServer, HasVendorOptionalDatacenterZone): + pass + + +class ServerPrice(ServerPriceBase, table=True): + vendor: Vendor = Relationship(back_populates="server_prices") + datacenter: Datacenter = Relationship(back_populates="server_prices") + zone: Zone = Relationship(back_populates="server_prices") server: Server = Relationship(back_populates="prices") - traffic: Traffic = Relationship(back_populates="prices") + + +class StoragePriceBase(HasPriceFields, HasStorage, HasVendorOptionalDatacenterZone): + pass + + +class StoragePrice(StoragePriceBase, table=True): + vendor: Vendor = Relationship(back_populates="storage_prices") + datacenter: Datacenter = Relationship(back_populates="storage_prices") + zone: Zone = Relationship(back_populates="storage_prices") storage: Storage = Relationship(back_populates="prices") - @model_validator(mode="after") - def server_or_traffic_or_storage(self) -> "Price": - if (self.server_id is None) + (self.traffic_id is None) + ( - self.storage_id is None - ) != 2: - raise ValueError("Exactly one Server, Traffic or Storage required.") - return self + +class TrafficPriceBase(HasPriceFields, HasTraffic, HasVendorOptionalDatacenterZone): + pass + + +class TrafficPrice(TrafficPriceBase, table=True): + vendor: Vendor = Relationship(back_populates="traffic_prices") + datacenter: Datacenter = Relationship(back_populates="traffic_prices") + zone: Zone = Relationship(back_populates="traffic_prices") + traffic: Traffic = Relationship(back_populates="prices") + VendorComplianceLink.model_rebuild() diff --git a/src/sc_crawler/vendors/aws.py 
b/src/sc_crawler/vendors/aws.py index eb0df100..4bce605b 100644 --- a/src/sc_crawler/vendors/aws.py +++ b/src/sc_crawler/vendors/aws.py @@ -9,7 +9,7 @@ from ..logger import logger from ..lookup import countries -from ..schemas import Datacenter, Gpu, Price, Server, Disk, Zone +from ..schemas import Datacenter, Gpu, ServerPrice, Duration, Server, Disk, Zone # disable caching by default set_default_params(caching_enabled=False, stale_after=timedelta(days=1)) @@ -674,13 +674,14 @@ def price_from_product(product, vendor): except Exception as exc: raise exc price = extract_ondemand_price(product["terms"]) - return Price( + return ServerPrice( vendor=vendor, datacenter=datacenter, server=server, allocation="ondemand", price=price[0], currency=price[1], + duration=Duration.HOUR, ) From d791b35398fe19970899557b2019e1b84397e52a Mon Sep 17 00:00:00 2001 From: "Gergely Daroczi (@daroczig)" Date: Mon, 19 Feb 2024 22:55:22 +0100 Subject: [PATCH 09/12] (auto)add inserted_at --- src/sc_crawler/schemas.py | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/src/sc_crawler/schemas.py b/src/sc_crawler/schemas.py index e7940435..7c12100e 100644 --- a/src/sc_crawler/schemas.py +++ b/src/sc_crawler/schemas.py @@ -1,6 +1,7 @@ """Schemas for vendors, datacenters, zones, and other resources.""" +from datetime import datetime from enum import Enum from hashlib import sha1 from importlib import import_module @@ -13,6 +14,7 @@ ImportString, PrivateAttr, ) +from sqlalchemy import DateTime, Column from sqlalchemy.inspection import inspect from sqlalchemy.orm import declared_attr from sqlmodel import JSON, Column, Field, Relationship, SQLModel, select @@ -20,13 +22,17 @@ from .str import snake_case -class ReuseDescriptions(SQLModel.__class__): - """Reuse description of the table and its fields as SQL comment. +class ScMetaModel(SQLModel.__class__): + """Custom class factory to auto-update table models. 
- Checking if the table and its fields have explicit comment set to - be shown in the `CREATE TABLE` statements, and if not, reuse the - optional table and field descriptions. Table docstrings are - truncated to first line. + - Reuse description of the table and its fields as SQL comment. + + Checking if the table and its fields have explicit comment set + to be shown in the `CREATE TABLE` statements, and if not, + reuse the optional table and field descriptions. Table + docstrings are truncated to first line. + + - Append inserted_at column. """ def __init__(subclass, *args, **kwargs): @@ -43,15 +49,18 @@ def __init__(subclass, *args, **kwargs): comment = satable.columns[k].comment if v.description and comment is None: satable.columns[k].comment = v.description + # append inserted_at as last column + satable.append_column(Column("inserted_at", DateTime, default=datetime.utcnow)) -class ScModel(SQLModel, metaclass=ReuseDescriptions): +class ScModel(SQLModel, metaclass=ScMetaModel): """Custom extensions to SQLModel objects and tables. Extra features: - auto-generated table names using snake_case, - support for hashing table rows, - - reuse description field of tables/columns as SQL comment. + - reuse description field of tables/columns as SQL comment, + - automatically append inserted_at column. 
""" @declared_attr # type: ignore @@ -72,6 +81,7 @@ def hash(cls, session, ignored: List[str] = ["inserted_at"]) -> dict: hashes = {} for row in rows: # NOTE Pydantic is warning when read Gpu/Storage as dict + # https://github.com/tiangolo/sqlmodel/issues/63#issuecomment-1081555082 rowdict = row.model_dump(warnings=False) rowkeys = str(tuple(rowdict.get(pk) for pk in pks)) for dropkey in [*ignored, *pks]: From e04655e079a96f3582ffa27c14efdfcefb7d7849 Mon Sep 17 00:00:00 2001 From: "Gergely Daroczi (@daroczig)" Date: Mon, 19 Feb 2024 23:06:12 +0100 Subject: [PATCH 10/12] add OS to ServerPrice --- src/sc_crawler/schemas.py | 9 ++++++++- src/sc_crawler/vendors/aws.py | 4 +++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/src/sc_crawler/schemas.py b/src/sc_crawler/schemas.py index 7c12100e..8ae03869 100644 --- a/src/sc_crawler/schemas.py +++ b/src/sc_crawler/schemas.py @@ -505,7 +505,14 @@ class HasPriceFields(ScModel): duration: Duration -class ServerPriceBase(HasPriceFields, HasServer, HasVendorOptionalDatacenterZone): +class ServerPriceExtraFields(ScModel): + operating_system: str + allocation: Allocation = Allocation.ONDEMAND + + +class ServerPriceBase( + HasPriceFields, ServerPriceExtraFields, HasServer, HasVendorOptionalDatacenterZone +): pass diff --git a/src/sc_crawler/vendors/aws.py b/src/sc_crawler/vendors/aws.py index 4bce605b..73c305ce 100644 --- a/src/sc_crawler/vendors/aws.py +++ b/src/sc_crawler/vendors/aws.py @@ -66,7 +66,7 @@ def get_products(): # pricing API is only available in a few regions client = boto3.client("pricing", region_name="us-east-1") filters = { - # TODO mac instances? 
+ # TODO ingest win, mac etc others "operatingSystem": "Linux", "preInstalledSw": "NA", "licenseModel": "No License required", @@ -678,6 +678,8 @@ def price_from_product(product, vendor): vendor=vendor, datacenter=datacenter, server=server, + # TODO ingest other OSs + operating_system="Linux", allocation="ondemand", price=price[0], currency=price[1], From 5cfd72bebccf59735f721c09fb861f6221a39cae Mon Sep 17 00:00:00 2001 From: "Gergely Daroczi (@daroczig)" Date: Mon, 19 Feb 2024 23:06:51 +0100 Subject: [PATCH 11/12] track IPv4 price --- src/sc_crawler/schemas.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/sc_crawler/schemas.py b/src/sc_crawler/schemas.py index 8ae03869..1ebaad08 100644 --- a/src/sc_crawler/schemas.py +++ b/src/sc_crawler/schemas.py @@ -198,6 +198,7 @@ class Vendor(ScModel, table=True): servers: List["Server"] = Relationship(back_populates="vendor") server_prices: List["ServerPrice"] = Relationship(back_populates="vendor") traffic_prices: List["TrafficPrice"] = Relationship(back_populates="vendor") + ipv4_prices: List["Ipv4Price"] = Relationship(back_populates="vendor") storage_prices: List["StoragePrice"] = Relationship(back_populates="vendor") def __init__(self, **kwargs): @@ -266,6 +267,7 @@ class Datacenter(ScModel, table=True): zones: List["Zone"] = Relationship(back_populates="datacenter") server_prices: List["ServerPrice"] = Relationship(back_populates="datacenter") traffic_prices: List["TrafficPrice"] = Relationship(back_populates="datacenter") + ipv4_prices: List["Ipv4Price"] = Relationship(back_populates="datacenter") storage_prices: List["StoragePrice"] = Relationship(back_populates="datacenter") @@ -281,6 +283,7 @@ class Zone(ScModel, table=True): vendor: Vendor = Relationship(back_populates="zones") server_prices: List["ServerPrice"] = Relationship(back_populates="zone") traffic_prices: List["TrafficPrice"] = Relationship(back_populates="zone") + ipv4_prices: List["Ipv4Price"] = Relationship(back_populates="zone") 
storage_prices: List["StoragePrice"] = Relationship(back_populates="zone") @@ -545,6 +548,15 @@ class TrafficPrice(TrafficPriceBase, table=True): traffic: Traffic = Relationship(back_populates="prices") +class Ipv4PriceBase(HasPriceFields, HasVendorOptionalDatacenterZone): + pass + + +class Ipv4Price(Ipv4PriceBase, table=True): + vendor: Vendor = Relationship(back_populates="ipv4_prices") + datacenter: Datacenter = Relationship(back_populates="ipv4_prices") + zone: Zone = Relationship(back_populates="ipv4_prices") + VendorComplianceLink.model_rebuild() Country.model_rebuild() From fd24b582ea86ca5c589d310ce5dd0f5f59649ab3 Mon Sep 17 00:00:00 2001 From: "Gergely Daroczi (@daroczig)" Date: Tue, 20 Feb 2024 13:12:57 +0100 Subject: [PATCH 12/12] order and drop double imports --- src/sc_crawler/schemas.py | 2 +- src/sc_crawler/vendors/aws.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sc_crawler/schemas.py b/src/sc_crawler/schemas.py index 1ebaad08..3d883a9d 100644 --- a/src/sc_crawler/schemas.py +++ b/src/sc_crawler/schemas.py @@ -14,7 +14,7 @@ ImportString, PrivateAttr, ) -from sqlalchemy import DateTime, Column +from sqlalchemy import DateTime from sqlalchemy.inspection import inspect from sqlalchemy.orm import declared_attr from sqlmodel import JSON, Column, Field, Relationship, SQLModel, select diff --git a/src/sc_crawler/vendors/aws.py b/src/sc_crawler/vendors/aws.py index 73c305ce..a86a0b4d 100644 --- a/src/sc_crawler/vendors/aws.py +++ b/src/sc_crawler/vendors/aws.py @@ -9,7 +9,7 @@ from ..logger import logger from ..lookup import countries -from ..schemas import Datacenter, Gpu, ServerPrice, Duration, Server, Disk, Zone +from ..schemas import Datacenter, Disk, Duration, Gpu, Server, ServerPrice, Zone # disable caching by default set_default_params(caching_enabled=False, stale_after=timedelta(days=1))