Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Refactor] Lint bayesian pipeline #23

Merged
merged 4 commits into from
Apr 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 6 additions & 5 deletions endure.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
#!/usr/bin/env python
import logging
import os
import toml
import sys
import toml
from typing import Any

from jobs.lcm_train import LCMTrainJob
from jobs.data_gen import DataGenJob
Expand All @@ -11,13 +12,13 @@


class EndureDriver:
def __init__(self, conf):
self.config = conf
def __init__(self, config: dict[str, Any]) -> None:
self.config = config

logging.basicConfig(
format=config["log"]["format"], datefmt=config["log"]["datefmt"]
)
self.log = logging.getLogger(config["log"]["name"])
self.log: logging.Logger = logging.getLogger(config["log"]["name"])
self.log.setLevel(logging.getLevelName(config["log"]["level"]))
log_level = logging.getLevelName(self.log.getEffectiveLevel())
self.log.debug(f"Log level: {log_level}")
Expand All @@ -38,7 +39,7 @@ def run(self):
self.log.warn(f"No job associated with {job_name}")
continue
job = job(config)
job.run()
_ = job.run()

self.log.info("All jobs finished, exiting")

Expand Down
37 changes: 14 additions & 23 deletions endure.toml
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ data_dir = "/data"
# QFixed - Levels 1 -> L = Q
# YZHybrid - Levels 1 -> (L-1) = Q, Level L = Z
# KHybrid - Each level has own K_i decision
design = 'KHybrid'
design = 'QFixed'

[lsm.bounds]
max_considered_levels = 20 # Max number of levels to consider
Expand All @@ -71,12 +71,19 @@ elements_range = [100000000, 1000000000] # element range

# Default system values if not generating random systems
[lsm.system]
E = 8192 # size of a single entry in bits
s = 2e-7 # range query selectivity, 1 implies the full key range per query
B = 4 # number of physical entries per page
N = 1000000000 # total number of key-val pairs for LSM tree
H = 10 # total memory budget in bits per element
phi = 1 # read/write asymmetry coefficient, 1 implies w/r cost the same
E = 1024 # size of a single entry in bits
s = 1.905581e-8 # range query selectivity, 1 implies the full key range per query
B = 64.0 # number of physical entries per page
N = 522365629 # total number of key-val pairs for LSM tree
H = 5.705814 # total memory budget in bits per element
phi = 1.0 # read/write asymmetry coefficient, 1 implies w/r cost the same

# Default workload if not generating from random distribution
[lsm.workload]
z0 = 0.063
z1 = 0.190
q = 0.545
w = 0.202

# =============================================================================
# HEADER JOB
Expand Down Expand Up @@ -191,8 +198,6 @@ batch_size = 1
# [ExpectedImprovement, UpperConfidenceBound, qExpectedImprovement]
acquisition_function = "ExpectedImprovement"
beta_value = 0.3
# model_type can take values - "Classic", "QFixed", "YZHybrid", "KHybrid"
model_type = "KHybrid"
# Determines how many workloads we want to test using the Bayesian pipeline
multi_jobs_number = 100
multi_job_file = "design_comparison.csv"
Expand All @@ -207,20 +212,6 @@ db_path = "yz_databases"
# This must be a .db file for the code to function. It will create a SQLite database
db_name = "yz_db_cost.db"

[job.BayesianOptimization.system]
E = 1024
s = 1.905581e-8
B = 64.0
N = 522365629
H = 5.705814
phi = 1.0

[job.BayesianOptimization.workload]
z0 = 0.063
z1 = 0.190
q = 0.545
w = 0.202

# =============================================================================
# HEADER LCM
# Add configurations related to learned cost models
Expand Down
30 changes: 29 additions & 1 deletion endure/lcm/data/generator.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import random
from typing import List, Optional
from typing import List, Optional, Type
from itertools import combinations_with_replacement

import numpy as np
Expand Down Expand Up @@ -188,6 +188,18 @@ def _gen_row_data(self) -> list:
return line


class TieringGenerator(ClassicGenerator):
    """``ClassicGenerator`` whose ``policies`` list holds only ``Policy.Tiering``."""

    def __init__(self, bounds: LSMBounds, **kwargs) -> None:
        """Run the parent initializer, then pin the policy list.

        Args:
            bounds: Forwarded unchanged to ``ClassicGenerator.__init__``.
            **kwargs: Forwarded unchanged to ``ClassicGenerator.__init__``.
        """
        super().__init__(bounds, **kwargs)
        # Assigned after the parent initializer runs, so this is the value
        # left on the instance.
        tiering_only = [Policy.Tiering]
        self.policies = tiering_only


class LevelingGenerator(ClassicGenerator):
    """``ClassicGenerator`` whose ``policies`` list holds only ``Policy.Leveling``."""

    def __init__(self, bounds: LSMBounds, **kwargs) -> None:
        """Run the parent initializer, then pin the policy list.

        Args:
            bounds: Forwarded unchanged to ``ClassicGenerator.__init__``.
            **kwargs: Forwarded unchanged to ``ClassicGenerator.__init__``.
        """
        super().__init__(bounds, **kwargs)
        # Assigned after the parent initializer runs, so this is the value
        # left on the instance.
        leveling_only = [Policy.Leveling]
        self.policies = leveling_only


class KHybridGenerator(LCMDataGenerator):
def __init__(self, bounds: LSMBounds, **kwargs):
super().__init__(bounds, **kwargs)
Expand Down Expand Up @@ -344,3 +356,19 @@ def _gen_row_data(self) -> list:
design.Z,
]
return line


def get_generator(choice: Policy) -> Type[LCMDataGenerator]:
    """Return the data generator class registered for ``choice``.

    Args:
        choice: Policy to look up.

    Returns:
        The generator class (not an instance) mapped to ``choice``.

    Raises:
        KeyError: If no generator is registered for ``choice``. The message
            names the offending policy.
    """
    choices = {
        Policy.Tiering: TieringGenerator,
        Policy.Leveling: LevelingGenerator,
        Policy.Classic: ClassicGenerator,
        Policy.QFixed: QCostGenerator,
        Policy.YZHybrid: YZCostGenerator,
        Policy.KHybrid: KHybridGenerator,
    }
    try:
        return choices[choice]
    except KeyError:
        # Same exception type as before, but now it says which policy failed.
        raise KeyError(
            f"No data generator registered for policy {choice!r}"
        ) from None
220 changes: 220 additions & 0 deletions endure/lsm/data_generator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,220 @@
import random
from typing import List, Optional, override
from itertools import combinations_with_replacement

import numpy as np

from endure.lsm.types import LSMDesign, System, Policy, LSMBounds, Workload
from endure.lsm.cost import EndureCost


class LSMDataGenerator:
    """Random sampler for LSM systems, designs and workloads.

    All ranges and candidate values are read from the ``bounds`` object given
    at construction. Sampling uses NumPy's global random state
    (``np.random``), so results depend on the seed set by the caller.
    """

    # Memory budget to prevent bits_per_elem from hitting too close to max, and
    # always ensuring write_buffer > 0
    MEM_EPSILON = 0.1

    def __init__(
        self,
        bounds: LSMBounds,
        precision: int = 3,
    ) -> None:
        """Store the sampling bounds and build the cost model.

        Args:
            bounds: Source of every range / candidate list used when sampling.
            precision: Number of decimals kept when rounding sampled bloom
                filter bits and workload cut points.
        """
        self.precision = precision
        self.bounds = bounds
        self.max_levels = bounds.max_considered_levels
        self.cf = EndureCost(max_levels=bounds.max_considered_levels)

    def _sample_size_ratio(self) -> int:
        """Draw an integer size ratio from ``bounds.size_ratio_range``.

        ``np.random.randint`` excludes ``high``, so the range is half-open.
        """
        low, high = self.bounds.size_ratio_range
        return np.random.randint(low=low, high=high)

    def _sample_bloom_filter_bits(self, max: Optional[float] = None) -> float:
        """Draw bloom filter bits uniformly, rounded to ``self.precision``.

        Args:
            max: Upper end of the sampling interval. Defaults to the upper end
                of ``bounds.bits_per_elem_range``. The name shadows the builtin
                but is kept because callers pass it by keyword.

        Returns:
            A value between the lower end of ``bounds.bits_per_elem_range``
            and ``max``, rounded with ``np.around``.
        """
        lower = self.bounds.bits_per_elem_range[0]
        upper = self.bounds.bits_per_elem_range[1] if max is None else max
        sample = (upper - lower) * np.random.rand() + lower
        return np.around(sample, self.precision)

    # TODO: Will want to configure environment to simulate larger ranges over
    # potential system values
    def _sample_entry_per_page(self, entry_size: int = 8192) -> float:
        """Pick a page size from ``bounds.page_sizes`` and return entries per page.

        Args:
            entry_size: Size of one entry in bits.

        Returns:
            ``page_size * 8 * 1024 / entry_size`` for a randomly chosen page
            size. True division is used, so the result is a float.
        """
        # Page sizes are scaled as KB (e.g. 4KB, 8KB, 16KB) into bits.
        KB_TO_BITS = 8 * 1024
        page_sizes = np.array(self.bounds.page_sizes)
        entries_per_page = (page_sizes * KB_TO_BITS) / entry_size
        return np.random.choice(entries_per_page)

    def _sample_selectivity(self) -> float:
        """Draw a selectivity uniformly from ``bounds.selectivity_range``."""
        low, high = self.bounds.selectivity_range
        return (high - low) * np.random.rand() + low

    def _sample_entry_size(self) -> int:
        """Pick one entry size from ``bounds.entry_sizes``."""
        return np.random.choice(self.bounds.entry_sizes)

    def _sample_memory_budget(self) -> float:
        """Draw a memory budget uniformly from ``bounds.memory_budget_range``."""
        low, high = self.bounds.memory_budget_range
        return (high - low) * np.random.rand() + low

    def _sample_total_elements(self) -> int:
        """Draw an element count from ``bounds.elements_range`` (``high`` excluded)."""
        low, high = self.bounds.elements_range
        return np.random.randint(low=low, high=high)

    def sample_system(self) -> System:
        """Sample a full ``System``; entries per page depends on the entry size."""
        # Draw order is kept fixed so seeded runs reproduce the same systems.
        E = self._sample_entry_size()
        B = self._sample_entry_per_page(entry_size=E)
        s = self._sample_selectivity()
        H = self._sample_memory_budget()
        N = self._sample_total_elements()
        return System(E, s, B, N, H)

    def sample_design(
        self,
        system: System,
    ) -> LSMDesign:
        """Sample bloom filter bits and a size ratio for ``system``.

        The bloom filter bits are capped at ``system.H - MEM_EPSILON``.
        """
        h = self._sample_bloom_filter_bits(max=(system.H - self.MEM_EPSILON))
        T = self._sample_size_ratio()
        return LSMDesign(h, T)

    def sample_workload(self, dimensions: int) -> Workload:
        """Sample a workload whose ``dimensions`` components sum to 1.

        ``dimensions - 1`` random cut points are rounded, joined with 0 and 1,
        and sorted; the gaps between neighbours are the workload components.
        """
        # See stackoverflow thread for why the simple solution is not uniform
        # https://stackoverflow.com/questions/8064629
        cuts = np.around(np.random.rand(dimensions - 1), self.precision)
        cuts = np.sort(np.concatenate((cuts, np.array([0, 1]))))

        # Consecutive differences of the sorted cut points.
        return Workload(*np.diff(cuts))


class TieringGenerator(LSMDataGenerator):
    """Generator whose sampled designs always use ``Policy.Tiering``."""

    def __init__(
        self,
        bounds: LSMBounds,
        policies: Optional[List[Policy]] = None,
        **kwargs,
    ):
        """Initialize the base generator and record ``policies``.

        Args:
            bounds: Forwarded to ``LSMDataGenerator.__init__``.
            policies: Stored on the instance as ``self.policies``; not read by
                the methods in this class. When omitted, a fresh
                ``[Policy.Tiering, Policy.Leveling]`` list is created per
                instance rather than one default list shared by all.
            **kwargs: Forwarded to ``LSMDataGenerator.__init__``.
        """
        super().__init__(bounds, **kwargs)
        self.policies = (
            [Policy.Tiering, Policy.Leveling] if policies is None else policies
        )

    @override
    def sample_design(
        self,
        system: System,
    ) -> LSMDesign:
        """Sample bloom filter bits and size ratio for a tiering design.

        The bloom filter bits are capped at ``system.H - MEM_EPSILON``.
        """
        h = self._sample_bloom_filter_bits(max=(system.H - self.MEM_EPSILON))
        T = self._sample_size_ratio()
        return LSMDesign(h, T, policy=Policy.Tiering)


class LevelingGenerator(LSMDataGenerator):
    """Generator whose sampled designs always use ``Policy.Leveling``."""

    def __init__(
        self,
        bounds: LSMBounds,
        policies: Optional[List[Policy]] = None,
        **kwargs,
    ):
        """Initialize the base generator and record ``policies``.

        Args:
            bounds: Forwarded to ``LSMDataGenerator.__init__``.
            policies: Stored on the instance as ``self.policies``; not read by
                the methods in this class. When omitted, a fresh
                ``[Policy.Tiering, Policy.Leveling]`` list is created per
                instance rather than one default list shared by all.
            **kwargs: Forwarded to ``LSMDataGenerator.__init__``.
        """
        super().__init__(bounds, **kwargs)
        self.policies = (
            [Policy.Tiering, Policy.Leveling] if policies is None else policies
        )

    @override
    def sample_design(
        self,
        system: System,
    ) -> LSMDesign:
        """Sample bloom filter bits and size ratio for a leveling design.

        The bloom filter bits are capped at ``system.H - MEM_EPSILON``.
        """
        h = self._sample_bloom_filter_bits(max=(system.H - self.MEM_EPSILON))
        T = self._sample_size_ratio()
        return LSMDesign(h, T, policy=Policy.Leveling)


class ClassicGenerator(LSMDataGenerator):
    """Generator that picks tiering or leveling at random for each design."""

    def __init__(self, bounds: LSMBounds, **kwargs):
        """Forward ``bounds`` and ``kwargs`` to ``LSMDataGenerator.__init__``."""
        super().__init__(bounds, **kwargs)

    @override
    def sample_design(self, system: System) -> LSMDesign:
        """Sample bloom filter bits, size ratio and a random classic policy.

        The policy is drawn with ``random.choice`` from ``Policy.Tiering`` and
        ``Policy.Leveling``, after the two numeric draws.
        """
        bits_cap = system.H - self.MEM_EPSILON
        bloom_bits = self._sample_bloom_filter_bits(max=bits_cap)
        size_ratio = self._sample_size_ratio()
        compaction = random.choice((Policy.Tiering, Policy.Leveling))
        return LSMDesign(h=bloom_bits, T=size_ratio, policy=compaction)


class KHybridGenerator(LSMDataGenerator):
    """Generator for ``Policy.KHybrid`` designs with one ``K`` value per level."""

    def __init__(self, bounds: LSMBounds, **kwargs):
        """Forward ``bounds`` and ``kwargs`` to ``LSMDataGenerator.__init__``."""
        super().__init__(bounds, **kwargs)

    def _gen_k_levels(self, levels: int, max_size_ratio: int) -> list:
        """List every non-increasing ``levels``-tuple over ``max_size_ratio..1``."""
        descending = range(max_size_ratio, 0, -1)
        return list(combinations_with_replacement(descending, levels))

    @override
    def sample_design(self, system: System) -> LSMDesign:
        """Sample a base design, then draw a ``K`` entry for each of its levels.

        ``K`` is padded with ones up to ``self.max_levels`` entries.
        """
        base = super().sample_design(system)
        bloom_bits, size_ratio = base.h, base.T
        num_levels = int(self.cf.L(base, system, ceil=True))
        # One draw per level, from 1 up to (but excluding) the size ratio.
        sampled = np.random.randint(low=1, high=int(size_ratio), size=(num_levels))
        padding = np.ones(self.max_levels - len(sampled))
        caps = np.concatenate([sampled, padding])
        return LSMDesign(
            h=bloom_bits, T=size_ratio, policy=Policy.KHybrid, K=caps.tolist()
        )


class QCostGenerator(LSMDataGenerator):
    """Generator for ``Policy.QFixed`` designs, adding a sampled ``Q``."""

    def __init__(self, bounds: LSMBounds, **kwargs):
        """Forward ``bounds`` and ``kwargs`` to ``LSMDataGenerator.__init__``."""
        super().__init__(bounds, **kwargs)

    def _sample_q(self, max_size_ratio: int) -> int:
        """Draw ``Q`` from one below the minimum size ratio up to ``max_size_ratio``.

        ``np.random.randint`` excludes ``high``, so ``max_size_ratio`` itself
        is never returned.
        """
        floor = self.bounds.size_ratio_range[0] - 1
        return np.random.randint(low=floor, high=max_size_ratio)

    @override
    def sample_design(self, system: System) -> LSMDesign:
        """Sample a base design, then a ``Q`` bounded by its size ratio."""
        base = super().sample_design(system)
        bloom_bits, size_ratio = base.h, base.T
        q_value = self._sample_q(int(size_ratio))
        return LSMDesign(h=bloom_bits, T=size_ratio, policy=Policy.QFixed, Q=q_value)


class YZCostGenerator(LSMDataGenerator):
    """Generator for ``Policy.YZHybrid`` designs, adding sampled ``Y`` and ``Z``."""

    def __init__(self, bounds: LSMBounds, **kwargs):
        """Forward ``bounds`` and ``kwargs`` to ``LSMDataGenerator.__init__``."""
        super().__init__(bounds, **kwargs)

    def _sample_capacity(self, max_size_ratio: int) -> int:
        """Draw a capacity from one below the minimum size ratio up to ``max_size_ratio``.

        ``np.random.randint`` excludes ``high``, so ``max_size_ratio`` itself
        is never returned.
        """
        floor = self.bounds.size_ratio_range[0] - 1
        return np.random.randint(low=floor, high=max_size_ratio)

    @override
    def sample_design(self, system: System) -> LSMDesign:
        """Sample a base design, then ``Y`` and ``Z`` bounded by its size ratio."""
        base = super().sample_design(system)
        bloom_bits, size_ratio = base.h, base.T
        ratio_cap = int(size_ratio)
        # Y is drawn before Z; the order matters for seeded reproducibility.
        y_value = self._sample_capacity(ratio_cap)
        z_value = self._sample_capacity(ratio_cap)
        return LSMDesign(
            h=bloom_bits,
            T=size_ratio,
            policy=Policy.YZHybrid,
            Y=y_value,
            Z=z_value,
        )
20 changes: 20 additions & 0 deletions endure/lsm/solver/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,24 @@
from typing import Type
from endure.lsm.types import Policy
from .classic_solver import ClassicSolver
from .qlsm_solver import QLSMSolver
from .klsm_solver import KLSMSolver
from .yzlsm_solver import YZLSMSolver


def get_solver(
    choice: Policy,
) -> Type[ClassicSolver | QLSMSolver | KLSMSolver | YZLSMSolver]:
    """Return the solver class registered for ``choice``.

    ``Policy.Tiering``, ``Policy.Leveling`` and ``Policy.Classic`` all map to
    ``ClassicSolver``.

    Args:
        choice: Policy to look up.

    Returns:
        The solver class (not an instance) mapped to ``choice``.

    Raises:
        KeyError: If no solver is registered for ``choice``. The message names
            the offending policy.
    """
    choices = {
        Policy.Tiering: ClassicSolver,
        Policy.Leveling: ClassicSolver,
        Policy.Classic: ClassicSolver,
        Policy.QFixed: QLSMSolver,
        Policy.YZHybrid: YZLSMSolver,
        Policy.KHybrid: KLSMSolver,
    }
    try:
        return choices[choice]
    except KeyError:
        # Same exception type as before, but now it says which policy failed.
        raise KeyError(f"No solver registered for policy {choice!r}") from None
Loading
Loading