[Refactor] Lint bayesian pipeline (#23)

* [Refactor] Add leveling and tiering specific generators with a selection function * [Linter] Running black on files * [Toml] Add default lsm workload * [Linter] Lint bayesian pipeline code
BU-DiSC · Apr 22, 2024 · bc4e806 · bc4e806
1 parent c8c18d2
commit bc4e806
Show file tree

Hide file tree

Showing 9 changed files with 810 additions and 253 deletions.
diff --git a/endure.py b/endure.py
@@ -1,8 +1,9 @@
 #!/usr/bin/env python
 import logging
 import os
-import toml
 import sys
+import toml
+from typing import Any
 
 from jobs.lcm_train import LCMTrainJob
 from jobs.data_gen import DataGenJob
@@ -11,13 +12,13 @@
 
 
 class EndureDriver:
-    def __init__(self, conf):
-        self.config = conf
+    def __init__(self, config: dict[str, Any]) -> None:
+        self.config = config
 
         logging.basicConfig(
             format=config["log"]["format"], datefmt=config["log"]["datefmt"]
         )
-        self.log = logging.getLogger(config["log"]["name"])
+        self.log: logging.Logger = logging.getLogger(config["log"]["name"])
         self.log.setLevel(logging.getLevelName(config["log"]["level"]))
         log_level = logging.getLevelName(self.log.getEffectiveLevel())
         self.log.debug(f"Log level: {log_level}")
@@ -38,7 +39,7 @@ def run(self):
                 self.log.warn(f"No job associated with {job_name}")
                 continue
             job = job(config)
-            job.run()
+            _ = job.run()
 
         self.log.info("All jobs finished, exiting")
 

diff --git a/endure.toml b/endure.toml
@@ -58,7 +58,7 @@ data_dir = "/data"
 #   QFixed - Levels 1 -> L = Q
 #   YZHybrid - Levels 1 -> (L-1) = Q, Level L = Z
 #   KHybrid - Each level has own K_i decision
-design = 'KHybrid'
+design = 'QFixed'
 
 [lsm.bounds]
 max_considered_levels = 20                  # Max number of levels to consider
@@ -71,12 +71,19 @@ elements_range = [100000000, 1000000000]    # element range
 
 # Default system values if not generating random systems
 [lsm.system]
-E = 8192        # size of a single entry in bits
-s = 2e-7        # range query selectivity, 1 implies the full key range per query
-B = 4           # number of physical entries per page
-N = 1000000000  # total number of key-val pairs for LSM tree
-H = 10          # total memory budget in bits per element
-phi = 1         # read/write asymmetry coefficient, 1 implies w/r cost the same
+E = 1024           # size of a single entry in bits
+s = 1.905581e-8    # range query selectivity, 1 implies the full key range per query
+B = 64.0           # number of physical entries per page
+N = 522365629      # total number of key-val pairs for LSM tree
+H = 5.705814       # total memory budget in bits per element
+phi = 1.0          # read/write asymmetry coefficient, 1 implies w/r cost the same
+
+# Default workload if not generating from random distribution
+[lsm.workload]
+z0 = 0.063
+z1 = 0.190
+q = 0.545
+w = 0.202
 
 # =============================================================================
 # HEADER JOB
@@ -191,8 +198,6 @@ batch_size = 1
 # [ExpectedImprovement, UpperConfidenceBound, qExpectedImprovement]
 acquisition_function = "ExpectedImprovement"
 beta_value = 0.3
-# model_type can take values - "Classic", "QFixed", "YZHybrid", "KHybrid"
-model_type = "KHybrid"
 # determines how many workloads we want to test using the Bayesian pipeline
 multi_jobs_number = 100
 multi_job_file = "design_comparison.csv"
@@ -207,20 +212,6 @@ db_path = "yz_databases"
 # This must be a .db file for the code to function. It will create a SQLite database
 db_name = "yz_db_cost.db"
 
-[job.BayesianOptimization.system]
-E = 1024
-s = 1.905581e-8
-B = 64.0
-N = 522365629
-H = 5.705814
-phi = 1.0
-
-[job.BayesianOptimization.workload]
-z0 = 0.063
-z1 = 0.190
-q = 0.545
-w = 0.202
-
 # =============================================================================
 # HEADER LCM
 #   Add configurations related to learned cost models

diff --git a/endure/lcm/data/generator.py b/endure/lcm/data/generator.py
@@ -1,5 +1,5 @@
 import random
-from typing import List, Optional
+from typing import List, Optional, Type
 from itertools import combinations_with_replacement
 
 import numpy as np
@@ -188,6 +188,18 @@ def _gen_row_data(self) -> list:
         return line
 
 
+class TieringGenerator(ClassicGenerator):
+    """ClassicGenerator variant whose ``policies`` list holds only Tiering."""
+
+    def __init__(self, bounds: LSMBounds, **kwargs):
+        """Initialize the parent generator, then pin ``policies`` to Tiering.
+
+        Args:
+            bounds: Sampling bounds forwarded to ``ClassicGenerator``.
+            **kwargs: Extra options forwarded unchanged to the parent.
+        """
+        super().__init__(bounds, **kwargs)
+        # Assigned after the parent initializer so this is the value left on
+        # the instance (how the parent uses `policies` is not visible here).
+        self.policies = [Policy.Tiering]
+
+
+class LevelingGenerator(ClassicGenerator):
+    """ClassicGenerator variant whose ``policies`` list holds only Leveling."""
+
+    def __init__(self, bounds: LSMBounds, **kwargs):
+        """Initialize the parent generator, then pin ``policies`` to Leveling.
+
+        Args:
+            bounds: Sampling bounds forwarded to ``ClassicGenerator``.
+            **kwargs: Extra options forwarded unchanged to the parent.
+        """
+        super().__init__(bounds, **kwargs)
+        # Assigned after the parent initializer so this is the value left on
+        # the instance (how the parent uses `policies` is not visible here).
+        self.policies = [Policy.Leveling]
+
+
 class KHybridGenerator(LCMDataGenerator):
     def __init__(self, bounds: LSMBounds, **kwargs):
         super().__init__(bounds, **kwargs)
@@ -344,3 +356,19 @@ def _gen_row_data(self) -> list:
             design.Z,
         ]
         return line
+
+
+def get_generator(choice: Policy) -> Type[LCMDataGenerator]:
+    """Return the data generator class registered for ``choice``.
+
+    Args:
+        choice: Policy whose generator class is wanted.
+
+    Returns:
+        The generator class itself (not an instance); the caller constructs it.
+
+    Raises:
+        KeyError: If no generator is registered for ``choice``.
+    """
+    choices = {
+        Policy.Tiering: TieringGenerator,
+        Policy.Leveling: LevelingGenerator,
+        Policy.Classic: ClassicGenerator,
+        Policy.QFixed: QCostGenerator,
+        Policy.YZHybrid: YZCostGenerator,
+        Policy.KHybrid: KHybridGenerator,
+    }
+    try:
+        return choices[choice]
+    except KeyError:
+        # Same exception type as before, but now it names the offending value
+        # instead of surfacing as an empty KeyError.
+        raise KeyError(f"No data generator registered for policy: {choice!r}") from None
diff --git a/endure/lsm/data_generator.py b/endure/lsm/data_generator.py
@@ -0,0 +1,220 @@
+import random
+from typing import List, Optional, override
+from itertools import combinations_with_replacement
+
+import numpy as np
+
+from endure.lsm.types import LSMDesign, System, Policy, LSMBounds, Workload
+from endure.lsm.cost import EndureCost
+
+
+class LSMDataGenerator:
+    """Random sampler for LSM systems, designs and workloads within ``bounds``.
+
+    Every draw uses NumPy's global random state (``np.random``), so seeding
+    that state makes the samples reproducible.
+    """
+
+    # Memory budget to prevent bits_per_elem from hitting too close to max, and
+    # always ensuring write_buffer > 0
+    MEM_EPSILON = 0.1
+
+    def __init__(
+        self,
+        bounds: LSMBounds,
+        precision: int = 3,
+    ) -> None:
+        """Store the sampling bounds and build the cost function.
+
+        Args:
+            bounds: Ranges and candidate values every sampler draws from.
+            precision: Number of decimals kept when rounding sampled bloom
+                filter bits and workload cut points.
+        """
+        self.precision = precision
+        self.bounds = bounds
+        self.max_levels = bounds.max_considered_levels
+        self.cf = EndureCost(max_levels=bounds.max_considered_levels)
+
+    def _sample_size_ratio(self) -> int:
+        """Draw an integer size ratio from ``bounds.size_ratio_range``.
+
+        The upper end of the range is exclusive (``np.random.randint``).
+        """
+        low, high = self.bounds.size_ratio_range
+        return np.random.randint(low=low, high=high)
+
+    def _sample_bloom_filter_bits(self, max: Optional[float] = None) -> float:
+        """Draw bloom filter bits per element, rounded to ``self.precision``.
+
+        Args:
+            max: Upper limit for the draw. Defaults to the upper end of
+                ``bounds.bits_per_elem_range`` when ``None``.
+
+        Returns:
+            A value scaled linearly between the lower end of
+            ``bounds.bits_per_elem_range`` and the chosen upper limit.
+        """
+        # `max` stays as the parameter name because callers pass it by keyword;
+        # the limits are copied into locals so no other builtin is shadowed.
+        upper = self.bounds.bits_per_elem_range[1] if max is None else max
+        lower = self.bounds.bits_per_elem_range[0]
+        sample = (upper - lower) * np.random.rand() + lower
+        return np.around(sample, self.precision)
+
+    # TODO: Will want to configure environment to simulate larger ranges over
+    # potential system values
+    def _sample_entry_per_page(self, entry_size: int = 8192) -> float:
+        """Pick the number of entries per page for one of the bounded page sizes.
+
+        Args:
+            entry_size: Size of a single entry in bits.
+
+        Returns:
+            One candidate chosen at random. The value is a float because the
+            candidates come from a true division.
+        """
+        # Page sizes from the bounds are treated as KB (e.g. 4KB, 8KB, 16KB)
+        KB_TO_BITS = 8 * 1024
+        page_sizes = np.array(self.bounds.page_sizes)
+        entries_per_page = (page_sizes * KB_TO_BITS) / entry_size
+        return np.random.choice(entries_per_page)
+
+    def _sample_selectivity(self) -> float:
+        """Draw a selectivity from ``bounds.selectivity_range``."""
+        low, high = self.bounds.selectivity_range
+        return (high - low) * np.random.rand() + low
+
+    def _sample_entry_size(self) -> int:
+        """Pick one entry size at random from ``bounds.entry_sizes``."""
+        return np.random.choice(self.bounds.entry_sizes)
+
+    def _sample_memory_budget(self) -> float:
+        """Draw a memory budget from ``bounds.memory_budget_range``."""
+        low, high = self.bounds.memory_budget_range
+        return (high - low) * np.random.rand() + low
+
+    def _sample_total_elements(self) -> int:
+        """Draw an integer element count from ``bounds.elements_range``.
+
+        The upper end of the range is exclusive (``np.random.randint``).
+        """
+        low, high = self.bounds.elements_range
+        return np.random.randint(low=low, high=high)
+
+    def sample_system(self) -> System:
+        """Sample a complete ``System``.
+
+        The entry size is drawn first because the entries-per-page candidates
+        depend on it.
+        """
+        E = self._sample_entry_size()
+        B = self._sample_entry_per_page(entry_size=E)
+        s = self._sample_selectivity()
+        H = self._sample_memory_budget()
+        N = self._sample_total_elements()
+        system = System(E, s, B, N, H)
+
+        return system
+
+    def sample_design(
+        self,
+        system: System,
+    ) -> LSMDesign:
+        """Sample an ``LSMDesign`` from bloom filter bits and a size ratio.
+
+        Args:
+            system: System whose memory budget ``H`` limits the filter bits.
+
+        Returns:
+            A design built from the sampled ``h`` and ``T`` only.
+        """
+        # Keep the filter bits MEM_EPSILON below the total memory budget
+        h = self._sample_bloom_filter_bits(max=(system.H - self.MEM_EPSILON))
+        T = self._sample_size_ratio()
+        lsm = LSMDesign(h, T)
+
+        return lsm
+
+    def sample_workload(self, dimensions: int) -> Workload:
+        """Sample a workload of ``dimensions`` non-negative proportions.
+
+        Args:
+            dimensions: Number of workload components to generate.
+
+        Returns:
+            A ``Workload`` whose components are the gaps between sorted cut
+            points on [0, 1], so they sum to 1 up to floating-point error.
+        """
+        # See stackoverflow thread for why the simple solution is not uniform
+        # https://stackoverflow.com/questions/8064629
+        cuts = np.around(np.random.rand(dimensions - 1), self.precision)
+        # Add the interval end points so the gaps cover the whole of [0, 1]
+        edges = np.sort(np.concatenate((cuts, np.array([0, 1]))))
+
+        workload = [b - a for a, b in zip(edges, edges[1:])]
+        return Workload(*workload)
+
+
+class TieringGenerator(LSMDataGenerator):
+    """Generator whose sampled designs always use ``Policy.Tiering``."""
+
+    def __init__(
+        self,
+        bounds: LSMBounds,
+        policies: Optional[List[Policy]] = None,
+        **kwargs,
+    ):
+        """Initialize the base generator and record ``policies``.
+
+        Args:
+            bounds: Sampling bounds forwarded to ``LSMDataGenerator``.
+            policies: Policies stored on the instance. Defaults to
+                ``[Policy.Tiering, Policy.Leveling]`` when ``None``.
+            **kwargs: Extra options forwarded unchanged to the base class.
+        """
+        super().__init__(bounds, **kwargs)
+        # Build the default per call: a list literal in the signature would be
+        # one object shared (and mutable) across every instance.
+        if policies is None:
+            policies = [Policy.Tiering, Policy.Leveling]
+        self.policies = policies
+
+    @override
+    def sample_design(
+        self,
+        system: System,
+    ) -> LSMDesign:
+        """Sample bloom filter bits and a size ratio for a Tiering design.
+
+        Args:
+            system: System whose memory budget ``H`` limits the filter bits.
+
+        Returns:
+            An ``LSMDesign`` with ``policy=Policy.Tiering``.
+        """
+        # Keep the filter bits MEM_EPSILON below the total memory budget
+        h = self._sample_bloom_filter_bits(max=(system.H - self.MEM_EPSILON))
+        T = self._sample_size_ratio()
+        lsm = LSMDesign(h, T, policy=Policy.Tiering)
+
+        return lsm
+
+
+class LevelingGenerator(LSMDataGenerator):
+    """Generator whose sampled designs always use ``Policy.Leveling``."""
+
+    def __init__(
+        self,
+        bounds: LSMBounds,
+        policies: Optional[List[Policy]] = None,
+        **kwargs,
+    ):
+        """Initialize the base generator and record ``policies``.
+
+        Args:
+            bounds: Sampling bounds forwarded to ``LSMDataGenerator``.
+            policies: Policies stored on the instance. Defaults to
+                ``[Policy.Tiering, Policy.Leveling]`` when ``None``.
+            **kwargs: Extra options forwarded unchanged to the base class.
+        """
+        super().__init__(bounds, **kwargs)
+        # Build the default per call: a list literal in the signature would be
+        # one object shared (and mutable) across every instance.
+        if policies is None:
+            policies = [Policy.Tiering, Policy.Leveling]
+        self.policies = policies
+
+    @override
+    def sample_design(
+        self,
+        system: System,
+    ) -> LSMDesign:
+        """Sample bloom filter bits and a size ratio for a Leveling design.
+
+        Args:
+            system: System whose memory budget ``H`` limits the filter bits.
+
+        Returns:
+            An ``LSMDesign`` with ``policy=Policy.Leveling``.
+        """
+        # Keep the filter bits MEM_EPSILON below the total memory budget
+        h = self._sample_bloom_filter_bits(max=(system.H - self.MEM_EPSILON))
+        T = self._sample_size_ratio()
+        lsm = LSMDesign(h, T, policy=Policy.Leveling)
+
+        return lsm
+
+
+class ClassicGenerator(LSMDataGenerator):
+    """Generator that picks Tiering or Leveling at random for each design."""
+
+    def __init__(self, bounds: LSMBounds, **kwargs):
+        """Forward ``bounds`` and any keyword options to the base generator."""
+        super().__init__(bounds, **kwargs)
+
+    @override
+    def sample_design(self, system: System) -> LSMDesign:
+        """Sample filter bits, a size ratio and one of the two classic policies.
+
+        Args:
+            system: System whose memory budget ``H`` limits the filter bits.
+
+        Returns:
+            An ``LSMDesign`` carrying the sampled ``h``, ``T`` and policy.
+        """
+        bits_cap = system.H - self.MEM_EPSILON
+        filter_bits = self._sample_bloom_filter_bits(max=bits_cap)
+        size_ratio = self._sample_size_ratio()
+        # Drawn last, and from Python's `random` module rather than NumPy
+        chosen_policy = random.choice((Policy.Tiering, Policy.Leveling))
+
+        return LSMDesign(h=filter_bits, T=size_ratio, policy=chosen_policy)
+
+
+class KHybridGenerator(LSMDataGenerator):
+    """Generator for KHybrid designs, which carry one K value per level."""
+
+    def __init__(self, bounds: LSMBounds, **kwargs):
+        """Forward ``bounds`` and any keyword options to the base generator."""
+        super().__init__(bounds, **kwargs)
+
+    def _gen_k_levels(self, levels: int, max_size_ratio: int) -> list:
+        """List every length-``levels`` combination drawn from max_size_ratio..1.
+
+        Values may repeat; each tuple follows the descending order of the range.
+        """
+        descending = range(max_size_ratio, 0, -1)
+
+        return list(combinations_with_replacement(descending, levels))
+
+    @override
+    def sample_design(self, system: System) -> LSMDesign:
+        """Sample a base design, then attach a random K vector to it.
+
+        Args:
+            system: System used for the base sample and the level computation.
+
+        Returns:
+            An ``LSMDesign`` with ``policy=Policy.KHybrid`` and a ``K`` list of
+            length ``self.max_levels``.
+        """
+        base = super().sample_design(system)
+        filter_bits = base.h
+        size_ratio = base.T
+        # Presumably the number of levels in the tree, rounded up
+        num_levels = int(self.cf.L(base, system, ceil=True))
+        active = np.random.randint(low=1, high=int(size_ratio), size=num_levels)
+        # Pad the remaining entries with ones up to max_levels
+        padding = np.ones(self.max_levels - len(active))
+        k_values = np.concatenate([active, padding])
+
+        return LSMDesign(
+            h=filter_bits, T=size_ratio, policy=Policy.KHybrid, K=k_values.tolist()
+        )
+
+
+class QCostGenerator(LSMDataGenerator):
+    """Generator for QFixed designs, which add a single sampled ``Q`` value."""
+
+    def __init__(self, bounds: LSMBounds, **kwargs):
+        """Forward ``bounds`` and any keyword options to the base generator."""
+        super().__init__(bounds, **kwargs)
+
+    def _sample_q(self, max_size_ratio: int) -> int:
+        """Draw Q from one below the minimum size ratio up to ``max_size_ratio``.
+
+        The upper end is exclusive (``np.random.randint``).
+        """
+        lower = self.bounds.size_ratio_range[0] - 1
+
+        return np.random.randint(low=lower, high=max_size_ratio)
+
+    @override
+    def sample_design(self, system: System) -> LSMDesign:
+        """Sample a base design, then draw Q bounded by its size ratio.
+
+        Args:
+            system: System passed to the base sampler.
+
+        Returns:
+            An ``LSMDesign`` with ``policy=Policy.QFixed``.
+        """
+        base = super().sample_design(system)
+        filter_bits = base.h
+        size_ratio = base.T
+        q_value = self._sample_q(int(size_ratio))
+
+        return LSMDesign(h=filter_bits, T=size_ratio, policy=Policy.QFixed, Q=q_value)
+
+
+class YZCostGenerator(LSMDataGenerator):
+    """Generator for YZHybrid designs, which add sampled ``Y`` and ``Z`` values."""
+
+    def __init__(self, bounds: LSMBounds, **kwargs):
+        """Forward ``bounds`` and any keyword options to the base generator."""
+        super().__init__(bounds, **kwargs)
+
+    def _sample_capacity(self, max_size_ratio: int) -> int:
+        """Draw a capacity from one below the minimum size ratio up to the cap.
+
+        The upper end ``max_size_ratio`` is exclusive (``np.random.randint``).
+        """
+        lower = self.bounds.size_ratio_range[0] - 1
+
+        return np.random.randint(low=lower, high=max_size_ratio)
+
+    @override
+    def sample_design(self, system: System) -> LSMDesign:
+        """Sample a base design, then draw Y and Z bounded by its size ratio.
+
+        Args:
+            system: System passed to the base sampler.
+
+        Returns:
+            An ``LSMDesign`` with ``policy=Policy.YZHybrid``.
+        """
+        base = super().sample_design(system)
+        filter_bits = base.h
+        size_ratio = base.T
+        ratio_cap = int(size_ratio)
+        # Y is drawn before Z, so a seeded run keeps the same draw order
+        y_value = self._sample_capacity(ratio_cap)
+        z_value = self._sample_capacity(ratio_cap)
+
+        return LSMDesign(
+            h=filter_bits, T=size_ratio, policy=Policy.YZHybrid, Y=y_value, Z=z_value
+        )
diff --git a/endure/lsm/solver/__init__.py b/endure/lsm/solver/__init__.py
@@ -1,4 +1,24 @@
+from typing import Type
+from endure.lsm.types import Policy
 from .classic_solver import ClassicSolver
 from .qlsm_solver import QLSMSolver
 from .klsm_solver import KLSMSolver
 from .yzlsm_solver import YZLSMSolver
+
+
+def get_solver(
+    choice: Policy,
+) -> Type[ClassicSolver | QLSMSolver | KLSMSolver | YZLSMSolver]:
+    """Return the solver class registered for ``choice``.
+
+    Tiering, Leveling and Classic all map to ``ClassicSolver``.
+
+    Args:
+        choice: Policy whose solver class is wanted.
+
+    Returns:
+        The solver class itself (not an instance); the caller constructs it.
+
+    Raises:
+        KeyError: If no solver is registered for ``choice``.
+    """
+    choices = {
+        Policy.Tiering: ClassicSolver,
+        Policy.Leveling: ClassicSolver,
+        Policy.Classic: ClassicSolver,
+        Policy.QFixed: QLSMSolver,
+        Policy.YZHybrid: YZLSMSolver,
+        Policy.KHybrid: KLSMSolver,
+    }
+    try:
+        return choices[choice]
+    except KeyError:
+        # Same exception type as before, but now it names the offending value
+        # instead of surfacing as an empty KeyError.
+        raise KeyError(f"No solver registered for policy: {choice!r}") from None