Skip to content

Commit

Permalink
[Refactor] Lint bayesian pipeline (#23)
Browse files Browse the repository at this point in the history
* [Refactor] Add leveling and tiering specific generators with a selection function
* [Linter] Running black on files
* [Toml] Add default lsm workload
* [Linter] Lint bayesian pipeline code
  • Loading branch information
ephoris authored Apr 22, 2024
1 parent c8c18d2 commit bc4e806
Show file tree
Hide file tree
Showing 9 changed files with 810 additions and 253 deletions.
11 changes: 6 additions & 5 deletions endure.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
#!/usr/bin/env python
import logging
import os
import toml
import sys
import toml
from typing import Any

from jobs.lcm_train import LCMTrainJob
from jobs.data_gen import DataGenJob
Expand All @@ -11,13 +12,13 @@


class EndureDriver:
def __init__(self, conf):
self.config = conf
def __init__(self, config: dict[str, Any]) -> None:
self.config = config

logging.basicConfig(
format=config["log"]["format"], datefmt=config["log"]["datefmt"]
)
self.log = logging.getLogger(config["log"]["name"])
self.log: logging.Logger = logging.getLogger(config["log"]["name"])
self.log.setLevel(logging.getLevelName(config["log"]["level"]))
log_level = logging.getLevelName(self.log.getEffectiveLevel())
self.log.debug(f"Log level: {log_level}")
Expand All @@ -38,7 +39,7 @@ def run(self):
self.log.warn(f"No job associated with {job_name}")
continue
job = job(config)
job.run()
_ = job.run()

self.log.info("All jobs finished, exiting")

Expand Down
37 changes: 14 additions & 23 deletions endure.toml
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ data_dir = "/data"
# QFixed - Levels 1 -> L = Q
# YZHybrid - Levels 1 -> (L-1) = Q, Level L = Z
# KHybrid - Each level has own K_i decision
design = 'KHybrid'
design = 'QFixed'

[lsm.bounds]
max_considered_levels = 20 # Max number of levels to consider
Expand All @@ -71,12 +71,19 @@ elements_range = [100000000, 1000000000] # element range

# Default system values if not generating random systems
[lsm.system]
E = 8192 # size of a single entry in bits
s = 2e-7 # range query selectivity, 1 implies the full key range per query
B = 4 # number of physical entries per page
N = 1000000000 # total number of key-val pairs for LSM tree
H = 10 # total memory budget in bits per element
phi = 1 # read/write asymmetry coefficient, 1 implies w/r cost the same
E = 1024 # size of a single entry in bits
s = 1.905581e-8 # range query selectivity, 1 implies the full key range per query
B = 64.0 # number of physical entries per page
N = 522365629 # total number of key-val pairs for LSM tree
H = 5.705814 # total memory budget in bits per element
phi = 1.0 # read/write asymmetry coefficient, 1 implies w/r cost the same

# Default workload if not generating from random distribution
[lsm.workload]
z0 = 0.063
z1 = 0.190
q = 0.545
w = 0.202

# =============================================================================
# HEADER JOB
Expand Down Expand Up @@ -191,8 +198,6 @@ batch_size = 1
# [ExpectedImprovement, UpperConfidenceBound, qExpectedImprovement]
acquisition_function = "ExpectedImprovement"
beta_value = 0.3
# model_type can take values - "Classic", "QFixed", "YZHybrid", "KHybrid"
model_type = "KHybrid"
# Determines how many workloads to test using the Bayesian pipeline
multi_jobs_number = 100
multi_job_file = "design_comparison.csv"
Expand All @@ -207,20 +212,6 @@ db_path = "yz_databases"
# This must be a .db file for the code to function. It will create a SQLite database
db_name = "yz_db_cost.db"

[job.BayesianOptimization.system]
E = 1024
s = 1.905581e-8
B = 64.0
N = 522365629
H = 5.705814
phi = 1.0

[job.BayesianOptimization.workload]
z0 = 0.063
z1 = 0.190
q = 0.545
w = 0.202

# =============================================================================
# HEADER LCM
# Add configurations related to learned cost models
Expand Down
30 changes: 29 additions & 1 deletion endure/lcm/data/generator.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import random
from typing import List, Optional
from typing import List, Optional, Type
from itertools import combinations_with_replacement

import numpy as np
Expand Down Expand Up @@ -188,6 +188,18 @@ def _gen_row_data(self) -> list:
return line


class TieringGenerator(ClassicGenerator):
    """ClassicGenerator variant whose ``policies`` is only ``Policy.Tiering``."""

    def __init__(self, bounds: LSMBounds, **kwargs):
        """Initialise the parent, then pin ``self.policies`` to tiering.

        Args:
            bounds: Bounds forwarded unchanged to ``ClassicGenerator``.
            **kwargs: Extra keyword arguments forwarded to the parent.
        """
        super().__init__(bounds, **kwargs)
        # Assigned after the parent constructor so it replaces whatever the
        # parent stored under ``policies``.
        tiering_only = [Policy.Tiering]
        self.policies = tiering_only


class LevelingGenerator(ClassicGenerator):
    """ClassicGenerator variant whose ``policies`` is only ``Policy.Leveling``."""

    def __init__(self, bounds: LSMBounds, **kwargs):
        """Initialise the parent, then pin ``self.policies`` to leveling.

        Args:
            bounds: Bounds forwarded unchanged to ``ClassicGenerator``.
            **kwargs: Extra keyword arguments forwarded to the parent.
        """
        super().__init__(bounds, **kwargs)
        # Assigned after the parent constructor so it replaces whatever the
        # parent stored under ``policies``.
        leveling_only = [Policy.Leveling]
        self.policies = leveling_only


class KHybridGenerator(LCMDataGenerator):
def __init__(self, bounds: LSMBounds, **kwargs):
super().__init__(bounds, **kwargs)
Expand Down Expand Up @@ -344,3 +356,19 @@ def _gen_row_data(self) -> list:
design.Z,
]
return line


def get_generator(choice: Policy) -> Type[LCMDataGenerator]:
    """Return the data-generator class registered for ``choice``.

    Args:
        choice: Policy whose generator class is wanted.

    Returns:
        The generator class itself (not an instance) mapped to ``choice``.

    Raises:
        KeyError: If no generator is registered for ``choice``.
    """
    choices = {
        Policy.Tiering: TieringGenerator,
        Policy.Leveling: LevelingGenerator,
        Policy.Classic: ClassicGenerator,
        Policy.QFixed: QCostGenerator,
        Policy.YZHybrid: YZCostGenerator,
        Policy.KHybrid: KHybridGenerator,
    }
    try:
        return choices[choice]
    except KeyError:
        # Same exception type as before, but name the offending policy so the
        # failure can be diagnosed from the traceback alone.
        raise KeyError(
            f"No data generator registered for policy: {choice!r}"
        ) from None
220 changes: 220 additions & 0 deletions endure/lsm/data_generator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,220 @@
import random
from typing import List, Optional, override
from itertools import combinations_with_replacement

import numpy as np

from endure.lsm.types import LSMDesign, System, Policy, LSMBounds, Workload
from endure.lsm.cost import EndureCost


class LSMDataGenerator:
    """Random sampler of systems, designs and workloads within given bounds.

    All draws use the global ``np.random`` state.
    """

    # Memory budget to prevent bits_per_elem from hitting too close to max, and
    # always ensuring write_buffer > 0
    MEM_EPSILON = 0.1

    def __init__(
        self,
        bounds: LSMBounds,
        precision: int = 3,
    ) -> None:
        """Store the sampling bounds and build the cost model.

        Args:
            bounds: Ranges and choices every sampler draws from.
            precision: Decimal places used when rounding sampled values.
        """
        self.bounds = bounds
        self.precision = precision
        self.max_levels = bounds.max_considered_levels
        self.cf = EndureCost(max_levels=bounds.max_considered_levels)

    @staticmethod
    def _uniform(low: float, high: float) -> float:
        """Draw one value from ``np.random.rand()`` scaled onto [low, high)."""
        return (high - low) * np.random.rand() + low

    def _sample_size_ratio(self) -> int:
        """Draw an integer size ratio from ``bounds.size_ratio_range``."""
        lower, upper = self.bounds.size_ratio_range
        # np.random.randint treats the upper bound as exclusive.
        return np.random.randint(lower, upper)

    def _sample_bloom_filter_bits(self, max: Optional[float] = None) -> float:
        """Draw the bloom-filter bits per element, rounded to ``precision``.

        Args:
            max: Upper limit of the draw; when ``None`` the upper end of
                ``bounds.bits_per_elem_range`` is used.
        """
        if max is None:
            ceiling = self.bounds.bits_per_elem_range[1]
        else:
            ceiling = max
        floor = self.bounds.bits_per_elem_range[0]
        return np.around(self._uniform(floor, ceiling), self.precision)

    # TODO: Will want to configure environment to simulate larger ranges over
    # potential system values
    def _sample_entry_per_page(self, entry_size: int = 8192) -> int:
        """Pick entries-per-page for one of ``bounds.page_sizes``.

        Args:
            entry_size: Size of one entry in bits.
        """
        # Page sizes are scaled by 8 * 1024 (KB -> bits) before dividing by
        # the entry size.
        bits_per_page = np.array(self.bounds.page_sizes) * (8 * 1024)
        return np.random.choice(bits_per_page / entry_size)

    def _sample_selectivity(self) -> float:
        """Draw a selectivity from ``bounds.selectivity_range``."""
        lower, upper = self.bounds.selectivity_range
        return self._uniform(lower, upper)

    def _sample_entry_size(self) -> int:
        """Pick one entry size from ``bounds.entry_sizes``."""
        return np.random.choice(self.bounds.entry_sizes)

    def _sample_memory_budget(self) -> float:
        """Draw a memory budget from ``bounds.memory_budget_range``."""
        lower, upper = self.bounds.memory_budget_range
        return self._uniform(lower, upper)

    def _sample_total_elements(self) -> int:
        """Draw an integer element count from ``bounds.elements_range``."""
        lower, upper = self.bounds.elements_range
        return np.random.randint(lower, upper)

    def sample_system(self) -> System:
        """Sample a random ``System``.

        The entry size is drawn first because entries-per-page is derived
        from it.
        """
        entry_size = self._sample_entry_size()
        entries_per_page = self._sample_entry_per_page(entry_size=entry_size)
        selectivity = self._sample_selectivity()
        memory_budget = self._sample_memory_budget()
        total_elements = self._sample_total_elements()

        return System(
            entry_size,
            selectivity,
            entries_per_page,
            total_elements,
            memory_budget,
        )

    def sample_design(
        self,
        system: System,
    ) -> LSMDesign:
        """Sample a design whose filter bits stay below ``system.H``.

        Args:
            system: System whose memory budget ``H`` caps the filter bits.
        """
        filter_bits = self._sample_bloom_filter_bits(
            max=(system.H - self.MEM_EPSILON)
        )
        size_ratio = self._sample_size_ratio()

        return LSMDesign(filter_bits, size_ratio)

    def sample_workload(self, dimensions: int) -> Workload:
        """Sample a workload of ``dimensions`` proportions.

        ``dimensions - 1`` rounded random cut points are sorted together with
        0 and 1; the gaps between consecutive points are the proportions.

        Args:
            dimensions: Number of workload components to produce.
        """
        # See stackoverflow thread for why the simple solution is not uniform
        # https://stackoverflow.com/questions/8064629
        cuts = np.around(np.random.rand(dimensions - 1), self.precision)
        points = np.sort(np.concatenate((cuts, np.array([0, 1]))))

        return Workload(*np.diff(points))


class TieringGenerator(LSMDataGenerator):
    """Generator whose sampled designs always use ``Policy.Tiering``."""

    def __init__(
        self,
        bounds: LSMBounds,
        policies: Optional[List[Policy]] = None,
        **kwargs,
    ):
        """Initialise the base generator and record ``policies``.

        Args:
            bounds: Bounds forwarded to ``LSMDataGenerator``.
            policies: Policies stored on the instance; defaults to
                ``[Policy.Tiering, Policy.Leveling]``.
            **kwargs: Extra keyword arguments forwarded to the base class.
        """
        super().__init__(bounds, **kwargs)
        # Build the default per instance: a list literal in the signature is
        # created once and shared by every generator, so mutating it through
        # one instance would leak into all the others.
        if policies is None:
            policies = [Policy.Tiering, Policy.Leveling]
        self.policies = policies

    @override
    def sample_design(
        self,
        system: System,
    ) -> LSMDesign:
        """Sample a tiering design whose filter bits stay below ``system.H``.

        Args:
            system: System whose memory budget ``H`` caps the filter bits.

        Returns:
            An ``LSMDesign`` with ``policy=Policy.Tiering``.
        """
        h = self._sample_bloom_filter_bits(max=(system.H - self.MEM_EPSILON))
        T = self._sample_size_ratio()
        lsm = LSMDesign(h, T, policy=Policy.Tiering)

        return lsm


class LevelingGenerator(LSMDataGenerator):
    """Generator whose sampled designs always use ``Policy.Leveling``."""

    def __init__(
        self,
        bounds: LSMBounds,
        policies: Optional[List[Policy]] = None,
        **kwargs,
    ):
        """Initialise the base generator and record ``policies``.

        Args:
            bounds: Bounds forwarded to ``LSMDataGenerator``.
            policies: Policies stored on the instance; defaults to
                ``[Policy.Tiering, Policy.Leveling]``.
            **kwargs: Extra keyword arguments forwarded to the base class.
        """
        super().__init__(bounds, **kwargs)
        # Build the default per instance: a list literal in the signature is
        # created once and shared by every generator, so mutating it through
        # one instance would leak into all the others.
        if policies is None:
            policies = [Policy.Tiering, Policy.Leveling]
        self.policies = policies

    @override
    def sample_design(
        self,
        system: System,
    ) -> LSMDesign:
        """Sample a leveling design whose filter bits stay below ``system.H``.

        Args:
            system: System whose memory budget ``H`` caps the filter bits.

        Returns:
            An ``LSMDesign`` with ``policy=Policy.Leveling``.
        """
        h = self._sample_bloom_filter_bits(max=(system.H - self.MEM_EPSILON))
        T = self._sample_size_ratio()
        lsm = LSMDesign(h, T, policy=Policy.Leveling)

        return lsm


class ClassicGenerator(LSMDataGenerator):
    """Generator that picks tiering or leveling at random for each design."""

    def __init__(
        self,
        bounds: LSMBounds,
        **kwargs,
    ):
        """Forward ``bounds`` and any keyword arguments to the base class."""
        super().__init__(bounds, **kwargs)

    @override
    def sample_design(
        self,
        system: System,
    ) -> LSMDesign:
        """Sample a design with a randomly chosen classic policy.

        Args:
            system: System whose memory budget ``H`` caps the filter bits.

        Returns:
            An ``LSMDesign`` using either ``Policy.Tiering`` or
            ``Policy.Leveling``.
        """
        filter_bits = self._sample_bloom_filter_bits(
            max=(system.H - self.MEM_EPSILON)
        )
        size_ratio = self._sample_size_ratio()
        # The policy is drawn last, and from the stdlib ``random`` module
        # rather than ``np.random``.
        chosen_policy = random.choice((Policy.Tiering, Policy.Leveling))

        return LSMDesign(h=filter_bits, T=size_ratio, policy=chosen_policy)


class KHybridGenerator(LSMDataGenerator):
    """Generator for ``Policy.KHybrid`` designs with one K value per level."""

    def __init__(self, bounds: LSMBounds, **kwargs):
        """Forward ``bounds`` and any keyword arguments to the base class."""
        super().__init__(bounds, **kwargs)

    def _gen_k_levels(self, levels: int, max_size_ratio: int) -> list:
        """List every non-increasing K tuple of length ``levels``.

        Values are taken from ``max_size_ratio`` down to 1.
        """
        descending_caps = range(max_size_ratio, 0, -1)
        return list(combinations_with_replacement(descending_caps, levels))

    @override
    def sample_design(self, system: System) -> LSMDesign:
        """Sample a base design, then attach a random K value per level.

        Args:
            system: System passed to the base sampler and the cost model.

        Returns:
            An ``LSMDesign`` with ``policy=Policy.KHybrid`` whose ``K`` list
            has ``self.max_levels`` entries.
        """
        base = super().sample_design(system)
        h, T = base.h, base.T
        # Number of levels the cost model reports for the base design.
        num_levels = int(self.cf.L(base, system, ceil=True))
        sampled_k = np.random.randint(low=1, high=int(T), size=num_levels)
        # Levels beyond ``num_levels`` are padded with ones up to max_levels.
        padding = np.ones(self.max_levels - len(sampled_k))
        all_k = np.concatenate([sampled_k, padding]).tolist()

        return LSMDesign(h=h, T=T, policy=Policy.KHybrid, K=all_k)


class QCostGenerator(LSMDataGenerator):
    """Generator for ``Policy.QFixed`` designs with a single Q value."""

    def __init__(self, bounds: LSMBounds, **kwargs):
        """Forward ``bounds`` and any keyword arguments to the base class."""
        super().__init__(bounds, **kwargs)

    def _sample_q(self, max_size_ratio: int) -> int:
        """Draw Q between the lower size-ratio bound minus one and the cap.

        Args:
            max_size_ratio: Exclusive upper bound of the draw.
        """
        lowest_q = self.bounds.size_ratio_range[0] - 1
        return np.random.randint(low=lowest_q, high=max_size_ratio)

    @override
    def sample_design(self, system: System) -> LSMDesign:
        """Sample a base design, then draw Q capped by its size ratio.

        Args:
            system: System passed to the base sampler.

        Returns:
            An ``LSMDesign`` with ``policy=Policy.QFixed``.
        """
        base = super().sample_design(system)
        h, T = base.h, base.T
        q_value = self._sample_q(int(T))

        return LSMDesign(h=h, T=T, policy=Policy.QFixed, Q=q_value)


class YZCostGenerator(LSMDataGenerator):
    """Generator for ``Policy.YZHybrid`` designs with Y and Z values."""

    def __init__(self, bounds: LSMBounds, **kwargs):
        """Forward ``bounds`` and any keyword arguments to the base class."""
        super().__init__(bounds, **kwargs)

    def _sample_capacity(self, max_size_ratio: int) -> int:
        """Draw a value between the lower size-ratio bound minus one and the cap.

        Args:
            max_size_ratio: Exclusive upper bound of the draw.
        """
        lowest = self.bounds.size_ratio_range[0] - 1
        return np.random.randint(low=lowest, high=max_size_ratio)

    @override
    def sample_design(self, system: System) -> LSMDesign:
        """Sample a base design, then draw Y and Z capped by its size ratio.

        Args:
            system: System passed to the base sampler.

        Returns:
            An ``LSMDesign`` with ``policy=Policy.YZHybrid``.
        """
        base = super().sample_design(system)
        h, T = base.h, base.T
        cap = int(T)
        # Y is drawn before Z; the order matters for reproducible seeding.
        y_value = self._sample_capacity(cap)
        z_value = self._sample_capacity(cap)

        return LSMDesign(h=h, T=T, policy=Policy.YZHybrid, Y=y_value, Z=z_value)
20 changes: 20 additions & 0 deletions endure/lsm/solver/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,24 @@
from typing import Type
from endure.lsm.types import Policy
from .classic_solver import ClassicSolver
from .qlsm_solver import QLSMSolver
from .klsm_solver import KLSMSolver
from .yzlsm_solver import YZLSMSolver


def get_solver(
    choice: Policy,
) -> Type[ClassicSolver | QLSMSolver | KLSMSolver | YZLSMSolver]:
    """Return the solver class registered for ``choice``.

    Tiering, leveling and classic policies all map to ``ClassicSolver``.

    Args:
        choice: Policy whose solver class is wanted.

    Returns:
        The solver class itself (not an instance) mapped to ``choice``.

    Raises:
        KeyError: If no solver is registered for ``choice``.
    """
    choices = {
        Policy.Tiering: ClassicSolver,
        Policy.Leveling: ClassicSolver,
        Policy.Classic: ClassicSolver,
        Policy.QFixed: QLSMSolver,
        Policy.YZHybrid: YZLSMSolver,
        Policy.KHybrid: KLSMSolver,
    }
    try:
        return choices[choice]
    except KeyError:
        # Same exception type as before, but name the offending policy so the
        # failure can be diagnosed from the traceback alone.
        raise KeyError(f"No solver registered for policy: {choice!r}") from None
Loading

0 comments on commit bc4e806

Please sign in to comment.