[Refactor] Clean up jobs with toml file (#22)
* [Refactor] Remove extra configs
* [Refactor] Clean up ltune data gen
* [Refactor] Clean up toml file descriptions
* [Refactor] Remove individual data generators
* [Refactor] Clean up LTuneTrain building
* [Refactor] Add to descriptions in the `endure.toml` default example
ephoris authored Mar 11, 2024
1 parent 95f872c commit c8c18d2
Showing 9 changed files with 201 additions and 428 deletions.
6 changes: 2 additions & 4 deletions endure.py
@@ -4,9 +4,8 @@
import toml
import sys

from jobs.lcm_data_gen import LCMDataGenJob
from jobs.lcm_train import LCMTrainJob
from jobs.ltune_data_gen import LTuneDataGenJob
from jobs.data_gen import DataGenJob
from jobs.ltune_train import LTuneTrainJob
from jobs.bayesian_pipeline import BayesianPipeline

@@ -27,9 +26,8 @@ def run(self):
self.log.info(f'Starting app {self.config["app"]["name"]}')

jobs = {
"LCMDataGen": LCMDataGenJob,
"DataGen": DataGenJob,
"LCMTrain": LCMTrainJob,
"LTuneDataGen": LTuneDataGenJob,
"LTuneTrain": LTuneTrainJob,
"BayesianBaseline": BayesianPipeline,
}
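
For orientation, the `run` list under `[app]` in `endure.toml` below is looked up against this registry. A minimal sketch of the dispatch, assuming each job class accepts the parsed config in its constructor and exposes a `run()` method (the loop itself is not among the loaded hunks):

# Hedged sketch of the dispatch behind the registry above; assumes each
# job class accepts the parsed config and exposes run().
for job_name in self.config["app"]["run"]:
    job_cls = jobs.get(job_name)
    if job_cls is None:
        self.log.warning(f"Unknown job in app.run: {job_name}")
        continue
    job_cls(self.config).run()
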
154 changes: 67 additions & 87 deletions endure.toml
@@ -1,17 +1,27 @@
# =============================================================================
# ENDURE Configuration File
# Every job will contain its own HEADER <Job Title> with settings appropriate
# for the job.
#
# The following subsections are available:
# APP
# LOGGER - output settings
# IO - base directory for IO
# LSM - Log-structured merge tree assumptions and settings
# JOB - all job-specific settings
# LCM - Learned cost model specifics
# LTune - Learned tuner specifics
# SCHEDULERS - ML learning rate scheduler kwargs
# OPTIMIZERS - ML optimizer kwargs
# LOSS - ML loss function kwargs
# =============================================================================

# =============================================================================
# HEADER APP
# App logic, including the list of jobs to run
# =============================================================================
[app]
name = "ENDURE"
run = [
# "LCMDataGen",
# "DataGen",
# "LCMTrain",
# "LTuneDataGen",
# "LTuneTrain",
# "BayesianBaseline"
]
Expand All @@ -34,6 +44,40 @@ disable_tqdm = false
[io]
data_dir = "/data"

# =============================================================================
# HEADER LSM
# Generic LSM settings including maximum bounds, system settings, starting
# memory budget, number of elements, etc.
# =============================================================================
[lsm]
# The design affects everything else downstream (e.g. choice of neural
# network architecture for the learned cost model)
# Tiering
# Leveling
# Classic - Considers both leveling and tiering
# QFixed - Levels 1 -> L = Q
# YZHybrid - Levels 1 -> (L-1) = Q, Level L = Z
# KHybrid - Each level has own K_i decision
design = 'KHybrid'
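
The `design` string is resolved at runtime: later in this commit, `endure/ltune/loss.py` calls `getattr(Policy, lcm_cfg["lsm"]["design"])`, so each name above should be a member of the `Policy` enum in `endure.lsm.types`. A sketch of a compatible definition (the real one is not part of this diff):

# Hedged sketch of a Policy enum consistent with getattr(Policy, design);
# the real definition lives in endure.lsm.types.
from enum import Enum, auto

class Policy(Enum):
    Tiering = auto()
    Leveling = auto()
    Classic = auto()
    QFixed = auto()
    YZHybrid = auto()
    KHybrid = auto()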

[lsm.bounds]
max_considered_levels = 20 # Max number of levels to consider
size_ratio_range = [2, 31] # low, high of size ratios to consider
page_sizes = [4, 8, 16] # KB pages
entry_sizes = [1024, 2048, 4096, 8192] # bits
memory_budget_range = [5, 20] # low, high, bits per element
selectivity_range = [1e-7, 1e-9] # high, low
elements_range = [100000000, 1000000000] # element range
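
The `generator.py` change further down bundles these fields into a single `LSMBounds` object. A sketch of a compatible dataclass, with field names inferred from the attribute accesses in that diff and defaults from this table (the real class in `endure.lsm.types` may carry more fields):

# Hedged sketch of LSMBounds; field names inferred from
# endure/ltune/data/generator.py below, defaults from [lsm.bounds].
from dataclasses import dataclass, field
from typing import List, Tuple

@dataclass
class LSMBounds:
    max_considered_levels: int = 20
    size_ratio_range: Tuple[int, int] = (2, 31)
    page_sizes: List[int] = field(default_factory=lambda: [4, 8, 16])
    entry_sizes: List[int] = field(default_factory=lambda: [1024, 2048, 4096, 8192])
    memory_budget_range: Tuple[float, float] = (5.0, 20.0)
    selectivity_range: Tuple[float, float] = (1e-7, 1e-9)
    elements_range: Tuple[int, int] = (100_000_000, 1_000_000_000)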

# Default system values if not generating random systems
[lsm.system]
E = 8192 # size of a single entry in bits
s = 2e-7 # range query selectivity, 1 implies the full key range per query
B = 4 # number of physical entries per page
N = 1000000000 # total number of key-val pairs for LSM tree
H = 10 # total memory budget in bits per element
phi = 1 # read/write asymmetry coefficient, 1 implies w/r cost the same
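
These defaults are mutually consistent, which is worth checking whenever they are edited: a 4 KB page holds exactly 4 entries of 8192 bits, and 10 bits per element over 10^9 elements is roughly 1.16 GiB of memory. As plain arithmetic (a sanity check, not project code):

# Sanity check of the default system values above.
page_bits = 4 * 1024 * 8          # 4 KB page expressed in bits
E = 8192                          # entry size in bits
assert page_bits // E == 4        # matches B = 4 entries per page

N, H, s = 1_000_000_000, 10, 2e-7
print(H * N / 8 / 2**30)          # ~1.16 GiB total memory budget
print(s * N)                      # ~200 entries touched per range query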

# =============================================================================
# HEADER JOB
# Settings for each individual job (executable)
@@ -42,24 +86,22 @@ data_dir = "/data"
use_gpu_if_avail = false

# -----------------------------------------------------------------------------
[job.LCMDataGen]
[job.DataGen]
# -----------------------------------------------------------------------------
dir = "test-data/kcost-t30"
file_prefix = "kcost"
num_workers = -1 # -1 forces all cores to be used
num_files = 2
samples = 1024 # samples per file
overwrite_if_exists = true
dir = "lcm/test/std"
generator = "LTuner" # Select between data for tuner (LTuner) or LCM
file_prefix = "tuner" # all files named file_prefix_000X.parquet
num_workers = -1 # -1 forces all cores to be used
num_files = 2 # number of files to generate
samples = 1024 # samples per file
overwrite_if_exists = true # if files exist overwrite with new data
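
The unified `DataGen` job replaces the old `LCMDataGen` and `LTuneDataGen` jobs, with the `generator` key selecting which dataset to produce. A hedged sketch of that selection (the job body is not among the loaded diffs, and the LCM generator is only referenced, not shown):

# Hedged sketch of generator selection inside the unified DataGen job.
from endure.ltune.data.generator import LTuneDataGenerator

def pick_generator(config: dict):
    choice = config["job"]["DataGen"]["generator"]  # "LTuner" or "LCM"
    generators = {"LTuner": LTuneDataGenerator}  # LCM entry not in loaded diffs
    if choice not in generators:
        raise ValueError(f"Unknown generator: {choice}")
    return generators[choice]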

# -----------------------------------------------------------------------------
[job.LCMTrain]
# -----------------------------------------------------------------------------
max_epochs = 5
save_dir = "models/lcm/kcost"

# Model selection; picking "Auto" will automatically select the model
# associated with the LSM design in the configuration file
model = "Auto"
no_checkpoint = false

# Loss functions available for training
# MSE - Mean squared error
@@ -71,45 +113,27 @@
loss_fn = "MSE"

# Supported optimizers
# SGD - Stochastic gradient descent
# Adam
# Adagrad
# [SGD, Adam, Adagrad]
optimizer = "Adam"

# Learning rate schedulers
# [CosineAnnealing, Exponential, Constant, None]
lr_scheduler = "Constant"

# Disable checkpointing to speed up training
no_checkpoint = false
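
The option strings above map naturally onto stock PyTorch classes; presumably the trainer builds its optimizer and scheduler from tables like the following (a sketch under that assumption, not the project's actual builder):

# Hedged sketch: config strings mapped onto torch classes; the project's
# actual builder code is not shown in this diff.
import torch

OPTIMIZERS = {
    "SGD": torch.optim.SGD,
    "Adam": torch.optim.Adam,
    "Adagrad": torch.optim.Adagrad,
}
SCHEDULERS = {
    "CosineAnnealing": torch.optim.lr_scheduler.CosineAnnealingLR,
    "Exponential": torch.optim.lr_scheduler.ExponentialLR,
    "Constant": torch.optim.lr_scheduler.ConstantLR,
    "None": None,
}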

[job.LCMTrain.train]
dir = "train-data/kcost-t30"
format = "parquet"
batch_size = 32
shuffle = true
num_workers = 2
drop_last = true

[job.LCMTrain.test]
dir = "test-data/kcost-t30"
format = "parquet"
batch_size = 1024
shuffle = false
num_workers = 4
drop_last = true
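
These train/test tables are `torch.utils.data.DataLoader` keyword arguments in all but name. One subtlety, visible in the `LTuneDataSet` diff below: the datasets are `IterableDataset`s, so `shuffle` goes to the dataset rather than the loader. A sketch using the LTune dataset from this commit (the LCM dataset class is not among the loaded diffs):

# Hedged sketch: wiring a [job.*.test] table into a DataLoader;
# shuffle is handled by the IterableDataset itself.
from torch.utils.data import DataLoader
from endure.ltune.data.dataset import LTuneDataSet

ds = LTuneDataSet(folder="test-data/ltune/std", shuffle=False)
loader = DataLoader(ds, batch_size=1024, num_workers=4, drop_last=True)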

# -----------------------------------------------------------------------------
[job.LTuneDataGen]
# -----------------------------------------------------------------------------
format = "parquet"
dir = "test-data/ltune/std"
file_prefix = "wl"
num_workers = 4 # -1 forces all cores to be used
num_files = 2
samples = 1024 # per file sample
overwrite_if_exists = true

# -----------------------------------------------------------------------------
[job.LTuneTrain]
# -----------------------------------------------------------------------------
@@ -119,26 +143,25 @@ save_dir = "models/ltune/klsm"
# The learned cost model is our loss; give the full path to a checkpoint or model file
loss_fn_path = "models/lcm/kcost"

# Check train.optimizer for available options
# Optimizer settings in header.optimizer
# [SGD, Adam, Adagrad]
optimizer = "Adam"

# Learning rate schedulers
# Learning rate schedulers, settings in header.scheduler
# [CosineAnnealing, Exponential, Constant, None]
lr_scheduler = "Constant"

no_checkpoint = false

[job.LTuneTrain.train]
dir = "train-data/ltune/std"
format = "parquet"
batch_size = 2
shuffle = true
num_workers = 1
drop_last = true

[job.LTuneTrain.test]
dir = "test-data/ltune/std"
format = "parquet"
batch_size = 2
shuffle = false
num_workers = 1
@@ -198,40 +221,6 @@ z1 = 0.190
q = 0.545
w = 0.202

# =============================================================================
# HEADER LSM
# Generic LSM settings including maximum bounds, system settings, starting
# memory budget, number of elements, etc.
# =============================================================================
[lsm]
# The design affects everything else downstream (e.g. choice of neural
# network architecture for the learned cost model)
# Tiering
# Leveling
# Classic - Considers both leveling and tiering
# QFixed - Levels 1 -> L = Q
# YZHybrid - Levels 1 -> (L-1) = Q, Level L = Z
# KHybrid - Each level has own K_i decision
design = 'KHybrid'

[lsm.bounds]
max_considered_levels = 20 # Max number of levels to consider
size_ratio_range = [2, 31] # low, high of size ratios to consider
page_sizes = [4, 8, 16] # KB pages
entry_sizes = [1024, 2048, 4096, 8192] # bits
memory_budget_range = [5, 20] # low, high, bits per element
selectivity_range = [1e-7, 1e-9] # high, low
elements_range = [100000000, 1000000000] # element range

# Default system values if not generating random systems
[lsm.system]
E = 8192 # size of a single entry in bits
s = 2e-7 # range query selectivity, 1 implies the full key range per query
B = 4 # number of physical entries per page
N = 1000000000 # total number of key-val pairs for LSM tree
H = 10 # total memory budget in bits per element
phi = 1 # read/write asymmetry coefficient, 1 implies w/r cost the same

# =============================================================================
# HEADER LCM
# Add configurations related to learned cost models
@@ -245,12 +234,8 @@ embedding_size = 8
hidden_length = 3
hidden_width = 32
decision_dim = 64

# Dropout percentage
dropout = 0.0

# Batch or Layer norm
norm_layer = "Batch"
dropout = 0.0 # dropout percentage
norm_layer = "Batch" # "Batch" or "Layer" norm

# Used only for classic models, generally smaller than embedding size
policy_embedding_size = 4
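
These knobs suggest an embedding-plus-MLP cost model. Purely as an illustration of how the width, length, dropout, and norm settings compose (this is not the project's `LearnedCostModelBuilder`):

# Illustration of the [lcm.model] knobs; not the project's builder.
import torch.nn as nn

def hidden_stack(width=32, length=3, dropout=0.0, norm="Batch"):
    norm_cls = nn.BatchNorm1d if norm == "Batch" else nn.LayerNorm
    layers = []
    for _ in range(length):
        layers += [nn.Linear(width, width), norm_cls(width),
                   nn.ReLU(), nn.Dropout(dropout)]
    return nn.Sequential(*layers)
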
@@ -282,14 +267,9 @@ hard = true
[ltune.model]
hidden_length = 1
hidden_width = 64

# Dropout percentage
dropout = 0

# Batch or Layer norm
norm_layer = "Batch"

categorical_mode = "reinmax"
dropout = 0 # dropout percentage
norm_layer = "Batch" # batch or layer norm
categorical_mode = "reinmax" # reinmax or gumbel

# =============================================================================
# END LTUNE
22 changes: 9 additions & 13 deletions endure/ltune/data/dataset.py
@@ -1,7 +1,6 @@
import glob
import numpy as np
import os
import pandas as pd
import pyarrow.parquet as pa
import torch
import torch.utils.data
@@ -13,33 +12,30 @@ class LTuneDataSet(torch.utils.data.IterableDataset):
def __init__(
self,
folder: str,
format: str = "parquet",
shuffle: bool = False,
) -> None:
self._format = format
self._fnames = glob.glob(os.path.join(folder, "*." + format))
self._fnames = glob.glob(os.path.join(folder, "*.parquet"))
self._shuffle = shuffle

def _get_input_cols(self):
return kINPUT_FEATS

def _load_data(self, fname):
if self._format == "parquet":
df = pa.read_table(fname).to_pandas()
else:
df = pd.read_csv(fname)
df = pa.read_table(fname).to_pandas()

return df

def __iter__(self):
worker_info = torch.utils.data.get_worker_info()
if worker_info is None:
files = self._fnames
else:
file_bins = np.array_split(self._fnames, worker_info.num_workers)
files = self._fnames
if self._shuffle:
np.random.shuffle(files)

if worker_info is not None:
file_bins = np.array_split(files, worker_info.num_workers)
files = file_bins[worker_info.id]
if self._shuffle:
np.random.shuffle(files)

for file in files:
df = self._load_data(file)
inputs = torch.from_numpy(df[self._get_input_cols()].values).float()
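
On the revised `__iter__`: the file list is shuffled, then split into one bin per worker via `np.array_split`, so each worker streams its own subset of files. One hedged caveat: the shuffle executes independently inside every worker process, so disjoint bins rely on the workers sharing NumPy RNG state (as with a forked loader); an explicit shared seed makes the partition robust under spawn as well. For example:

# Caveat illustration: seed NumPy identically in every worker so each
# worker's shuffled list (and thus its array_split bin) agrees.
import numpy as np
from torch.utils.data import DataLoader

def seed_numpy(worker_id: int) -> None:
    np.random.seed(0)  # shared seed -> identical permutation in each worker

# loader = DataLoader(ds, batch_size=2, num_workers=2,
#                     worker_init_fn=seed_numpy)   # ds: an LTuneDataSet
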
20 changes: 9 additions & 11 deletions endure/ltune/data/generator.py
@@ -2,25 +2,23 @@

import numpy as np

from endure.lsm.types import System
from endure.lsm.types import LSMBounds, System
from endure.ltune.data.input_features import kSYSTEM_HEADER, kWORKLOAD_HEADER


class LTuneDataGenerator:
def __init__(
self,
page_sizes: List[int] = [4, 8, 16],
entry_sizes: List[int] = [1024, 2048, 4096, 8192],
memory_budget_range: Tuple[float, float] = (5.0, 20.0),
selectivity_range: Tuple[float, float] = (1e-7, 1e-9),
elements_range: Tuple[int, int] = (100000000, 1000000000),
bounds: LSMBounds,
precision: int = 3,
) -> None:
self.entry_sizes = entry_sizes
self.memory_budget_range = memory_budget_range
self.page_sizes = page_sizes
self.selectivity_range = selectivity_range
self.elements_range = elements_range
self.entry_sizes = bounds.entry_sizes
self.memory_budget_range = bounds.memory_budget_range
self.page_sizes = bounds.page_sizes
self.selectivity_range = bounds.selectivity_range
self.elements_range = bounds.elements_range

self.bounds = bounds
self.precision = precision

def _sample_workload(self, dimensions: int) -> list:
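
After this refactor the sampling ranges travel as one object; a usage sketch, with values taken from the `[lsm.bounds]` table above and assuming `LSMBounds` accepts them as keyword arguments:

# Usage sketch: one LSMBounds object now configures the generator.
from endure.lsm.types import LSMBounds
from endure.ltune.data.generator import LTuneDataGenerator

bounds = LSMBounds(
    page_sizes=[4, 8, 16],
    entry_sizes=[1024, 2048, 4096, 8192],
    memory_budget_range=(5.0, 20.0),
    selectivity_range=(1e-7, 1e-9),
    elements_range=(100_000_000, 1_000_000_000),
)
gen = LTuneDataGenerator(bounds=bounds, precision=3)
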
6 changes: 2 additions & 4 deletions endure/ltune/loss.py
@@ -6,7 +6,7 @@
import toml

from endure.lcm.model.builder import LearnedCostModelBuilder
from endure.lsm.types import STR_POLICY_DICT
from endure.lsm.types import Policy


class LearnedCostModelLoss(torch.nn.Module):
@@ -25,9 +25,7 @@ def __init__(self, config: dict[str, Any], model_path: str):
max_levels=lcm_cfg["lsm"]["max_levels"],
**lcm_cfg["lcm"]["model"],
)
lcm_model = STR_POLICY_DICT.get(lcm_cfg["lsm"]["design"], None)
if lcm_model is None:
raise TypeError(f"Illegal LCM model choice: {lcm_model=}")
lcm_model = getattr(Policy, lcm_cfg["lsm"]["design"])
self.model = self.lcm_builder.build_model(lcm_model)

data = torch.load(
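
One behavioral note on the `Policy` simplification: the removed `STR_POLICY_DICT.get` path raised a `TypeError` with a descriptive message on a bad design string, while `getattr(Policy, ...)` raises a bare `AttributeError`. If the friendlier error is wanted under the new scheme, a small guard restores it (sketch):

# Hedged sketch: getattr lookup with the old descriptive error kept.
design = lcm_cfg["lsm"]["design"]
if not hasattr(Policy, design):
    raise TypeError(f"Illegal LCM model choice: {design=}")
lcm_model = getattr(Policy, design)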