[Refactor] Clean up jobs with toml file #22

Merged (6 commits) on Mar 11, 2024
6 changes: 2 additions & 4 deletions endure.py
@@ -4,9 +4,8 @@
import toml
import sys

from jobs.lcm_data_gen import LCMDataGenJob
from jobs.lcm_train import LCMTrainJob
from jobs.ltune_data_gen import LTuneDataGenJob
from jobs.data_gen import DataGenJob
from jobs.ltune_train import LTuneTrainJob
from jobs.bayesian_pipeline import BayesianPipeline

@@ -27,9 +26,8 @@ def run(self):
self.log.info(f'Starting app {self.config["app"]["name"]}')

jobs = {
"LCMDataGen": LCMDataGenJob,
"DataGen": DataGenJob,
"LCMTrain": LCMTrainJob,
"LTuneDataGen": LTuneDataGenJob,
"LTuneTrain": LTuneTrainJob,
"BayesianBaseline": BayesianPipeline,
}
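For context, the renamed jobs table is driven by the `run` list under `[app]` in `endure.toml`. A minimal sketch of that dispatch loop, assuming each job class takes the parsed config and exposes a `run()` method (the driver wiring itself is not part of this diff):

```python
# Hypothetical sketch of the dispatch loop around the jobs table above;
# names outside this diff (run_jobs, the job run() method) are assumptions.
import logging
import toml

from jobs.data_gen import DataGenJob
from jobs.lcm_train import LCMTrainJob
from jobs.ltune_train import LTuneTrainJob
from jobs.bayesian_pipeline import BayesianPipeline


def run_jobs(config: dict) -> None:
    log = logging.getLogger(config["app"]["name"])
    jobs = {
        "DataGen": DataGenJob,
        "LCMTrain": LCMTrainJob,
        "LTuneTrain": LTuneTrainJob,
        "BayesianBaseline": BayesianPipeline,
    }
    for job_name in config["app"]["run"]:   # e.g. run = ["DataGen", "LCMTrain"]
        job = jobs.get(job_name)
        if job is None:
            log.warning(f"Unknown job {job_name}, skipping")
            continue
        job(config).run()                   # assumes each job is constructed from the config


if __name__ == "__main__":
    run_jobs(toml.load("endure.toml"))
```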
154 changes: 67 additions & 87 deletions endure.toml
@@ -1,17 +1,27 @@
# =============================================================================
# ENDURE Configuration File
# Every job will contain its own HEADER <Job Title> with settings appropriate
# for the job.
#
# Following subsections are available
# APP
# LOGGER - output setting
# IO - base directory for IO
# LSM - Log structured merge tree assumptions and settings
# JOB - all job specific settings
# LCM - Learned cost model specifics
# LTune - Learned tuner specifics
# SCHEDULERS - ML learning rate scheduler kwargs
# OPTIMIZERS - ML optimizer kwargs
# LOSS - ML Loss function kwargs
# =============================================================================

# =============================================================================
# HEADER APP
# Logic of app including jobs list to run
# =============================================================================
[app]
name = "ENDURE"
run = [
# "LCMDataGen",
# "DataGen",
# "LCMTrain",
# "LTuneDataGen",
# "LTuneTrain",
# "BayesianBaseline"
]
@@ -34,6 +44,40 @@ disable_tqdm = false
[io]
data_dir = "/data"

# =============================================================================
# HEADER LSM
# Generic LSM settings including maximum bounds, system settings, starting
# budget for memory, number of elements, etc
# =============================================================================
[lsm]
# Design will affect everything else downstream (e.g. choice of neural network
# architecture for learned cost model)
# Tiering
# Leveling
# Classic - Considers both leveling and tiering
# QFixed - Levels 1 -> L = Q
# YZHybrid - Levels 1 -> (L-1) = Q, Level L = Z
# KHybrid - Each level has own K_i decision
design = 'KHybrid'

[lsm.bounds]
max_considered_levels = 20 # Max number of levels to consider
size_ratio_range = [2, 31] # low, high of size ratios to consider
page_sizes = [4, 8, 16] # KB pages
entry_sizes = [1024, 2048, 4096, 8192] # bits
memory_budget_range = [5, 20] # low, high, bits per element
selectivity_range = [1e-7, 1e-9] # low, high
elements_range = [100000000, 1000000000] # element range

# Default system values if not generating random systems
[lsm.system]
E = 8192 # size of a single entry in bits
s = 2e-7 # range query selectivity, 1 implies the full key range per query
B = 4 # number of physical entries per page
N = 1000000000 # total number of key-val pairs for LSM tree
H = 10 # total memory budget in bits per element
phi = 1 # read/write asymmetry coefficient, 1 implies w/r cost the same
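The generator changes later in this PR consume these settings through `LSMBounds` and `System` from `endure.lsm.types`. A rough sketch of how the two TOML tables above could map onto such dataclasses, assuming the field names simply mirror the keys shown here (the real type definitions may differ):

```python
# Hypothetical mapping from the [lsm.bounds] and [lsm.system] tables onto
# dataclasses; the actual endure.lsm.types definitions may differ.
from dataclasses import dataclass
from typing import List, Tuple

import toml


@dataclass
class LSMBounds:
    max_considered_levels: int
    size_ratio_range: Tuple[int, int]
    page_sizes: List[int]
    entry_sizes: List[int]
    memory_budget_range: Tuple[float, float]
    selectivity_range: Tuple[float, float]
    elements_range: Tuple[int, int]


@dataclass
class System:
    E: int      # entry size in bits
    s: float    # range query selectivity
    B: int      # entries per page
    N: int      # total key-value pairs
    H: float    # memory budget in bits per element
    phi: float  # read/write asymmetry coefficient


cfg = toml.load("endure.toml")
bounds = LSMBounds(**cfg["lsm"]["bounds"])
system = System(**cfg["lsm"]["system"])
```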

# =============================================================================
# HEADER JOB
# Settings for each individual job (executable)
@@ -42,24 +86,22 @@ data_dir = "/data"
use_gpu_if_avail = false

# -----------------------------------------------------------------------------
[job.LCMDataGen]
[job.DataGen]
# -----------------------------------------------------------------------------
dir = "test-data/kcost-t30"
file_prefix = "kcost"
num_workers = -1 # -1 forces all cores to be used
num_files = 2
samples = 1024 # per file sample
overwrite_if_exists = true
dir = "lcm/test/std"
generator = "LTuner" # Select between data for tuner (LTuner) or LCM
file_prefix = "tuner" # all files named file_prefix_000X.parquet
num_workers = -1 # -1 forces all cores to be used
num_files = 2 # number of files to generate
samples = 1024 # per file sample
overwrite_if_exists = true # if files exist overwrite with new data
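A hedged sketch of how the `num_files`, `samples`, `file_prefix`, and `overwrite_if_exists` knobs could translate into output files; the sample callback and writer calls are illustrative assumptions rather than the actual `DataGenJob` code:

```python
# Illustrative only: how the [job.DataGen] settings above could drive file output.
# generate_sample is a hypothetical callable returning one row as a dict.
import os

import pandas as pd


def generate_files(cfg: dict, generate_sample) -> None:
    job_cfg = cfg["job"]["DataGen"]
    out_dir = os.path.join(cfg["io"]["data_dir"], job_cfg["dir"])
    os.makedirs(out_dir, exist_ok=True)
    for idx in range(job_cfg["num_files"]):                      # num_files = 2
        fname = f"{job_cfg['file_prefix']}_{idx:04d}.parquet"    # e.g. tuner_0000.parquet
        path = os.path.join(out_dir, fname)
        if os.path.exists(path) and not job_cfg["overwrite_if_exists"]:
            continue
        rows = [generate_sample() for _ in range(job_cfg["samples"])]  # samples = 1024
        pd.DataFrame(rows).to_parquet(path)
```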

# -----------------------------------------------------------------------------
[job.LCMTrain]
# -----------------------------------------------------------------------------
max_epochs = 5
save_dir = "models/lcm/kcost"

# Model selection, picking "Auto" will automatically select the model
# associated with the LSM design in the configuration file
model = "Auto"
no_checkpoint = false

# Different loss functions to train via
# MSE - Mean squared error
@@ -71,45 +113,27 @@
loss_fn = "MSE"

# Supported optimizers
# SGD - Stochastic gradient descent
# Adam
# Adagrad
# [SGD, Adam, Adagrad]
optimizer = "Adam"

# Learning rate schedulers
# [CosineAnnealing, Exponential, Constant, None]
lr_scheduler = "Constant"

# Stop checkpointing to improve training
no_checkpoint = false
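The `optimizer` and `lr_scheduler` strings are presumably resolved to PyTorch classes using the kwargs kept under the OPTIMIZERS and SCHEDULERS headers. One plausible lookup, shown as a sketch (the builder functions are assumptions, not code from this PR):

```python
# Hypothetical name-to-class lookup for the config strings above; the real
# OPTIMIZERS/SCHEDULERS handling in ENDURE may differ.
import torch


def build_optimizer(name: str, model: torch.nn.Module, **kwargs) -> torch.optim.Optimizer:
    optimizers = {
        "SGD": torch.optim.SGD,
        "Adam": torch.optim.Adam,
        "Adagrad": torch.optim.Adagrad,
    }
    return optimizers[name](model.parameters(), **kwargs)   # kwargs from [optimizers.<name>]


def build_scheduler(name: str, optimizer: torch.optim.Optimizer, **kwargs):
    if name == "None":
        return None
    schedulers = {
        "CosineAnnealing": torch.optim.lr_scheduler.CosineAnnealingLR,
        "Exponential": torch.optim.lr_scheduler.ExponentialLR,
        "Constant": torch.optim.lr_scheduler.ConstantLR,
    }
    return schedulers[name](optimizer, **kwargs)             # kwargs from [schedulers.<name>]
```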

[job.LCMTrain.train]
dir = "train-data/kcost-t30"
format = "parquet"
batch_size = 32
shuffle = true
num_workers = 2
drop_last = true

[job.LCMTrain.test]
dir = "test-data/kcost-t30"
format = "parquet"
batch_size = 1024
shuffle = false
num_workers = 4
drop_last = true
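These train and test blocks map almost one-to-one onto `torch.utils.data.DataLoader` keyword arguments. A sketch of the wiring, using the `LTuneDataSet` from this PR as a stand-in for whatever dataset class LCMTrain actually uses:

```python
# Sketch only: wiring the [job.LCMTrain.train] block into a torch DataLoader.
# LCMTrain presumably has its own dataset class; LTuneDataSet is used here purely
# as a stand-in with the same constructor shape.
import toml
from torch.utils.data import DataLoader

from endure.ltune.data.dataset import LTuneDataSet

cfg = toml.load("endure.toml")
train_cfg = cfg["job"]["LCMTrain"]["train"]

dataset = LTuneDataSet(
    folder=train_cfg["dir"],
    shuffle=train_cfg["shuffle"],         # shuffling happens inside the IterableDataset
)
train_loader = DataLoader(
    dataset,                              # no shuffle kwarg: not allowed for IterableDataset
    batch_size=train_cfg["batch_size"],   # 32 for training, 1024 for testing
    num_workers=train_cfg["num_workers"],
    drop_last=train_cfg["drop_last"],
)
```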

# -----------------------------------------------------------------------------
[job.LTuneDataGen]
# -----------------------------------------------------------------------------
format = "parquet"
dir = "test-data/ltune/std"
file_prefix = "wl"
num_workers = 4 # -1 forces all cores to be used
num_files = 2
samples = 1024 # per file sample
overwrite_if_exists = true

# -----------------------------------------------------------------------------
[job.LTuneTrain]
# -----------------------------------------------------------------------------
@@ -119,26 +143,25 @@ save_dir = "models/ltune/klsm"
# Learned cost model is our loss, input full path to checkpoint or model file
loss_fn_path = "models/lcm/kcost"

# Check train.optimizer for available options
# Optimizer settings in header.optimizer
# [SGD, Adam, Adagrad]
optimizer = "Adam"

# Learning rate schedulers
# Learning rate schedulers, settings in header.scheduler
# [CosineAnnealing, Exponential, Constant, None]
lr_scheduler = "Constant"

no_checkpoint = false

[job.LTuneTrain.train]
dir = "train-data/ltune/std"
format = "parquet"
batch_size = 2
shuffle = true
num_workers = 1
drop_last = true

[job.LTuneTrain.test]
dir = "test-data/ltune/std"
format = "parquet"
batch_size = 2
shuffle = false
num_workers = 1
@@ -198,40 +221,6 @@ z1 = 0.190
q = 0.545
w = 0.202

# =============================================================================
# HEADER LSM
# Generic LSM settings including maximum bounds, system settings, starting
# budget for memory, number of elements, etc
# =============================================================================
[lsm]
# Design will affect everything else downstream (e.g. choice of neural network
# architecture for learned cost model)
# Tiering
# Leveling
# Classic - Considers both leveling and tiering
# QFixed - Levels 1 -> L = Q
# YZHybrid - Levels 1 -> (L-1) = Q, Level L = Z
# KHybrid - Each level has own K_i decision
design = 'KHybrid'

[lsm.bounds]
max_considered_levels = 20 # Max number of levels to consider
size_ratio_range = [2, 31] # low, high of size ratios to consider
page_sizes = [4, 8, 16] # KB pages
entry_sizes = [1024, 2048, 4096, 8192] # bits
memory_budget_range = [5, 20] # low, high, bits per element
selectivity_range = [1e-7, 1e-9] # low, high
elements_range = [100000000, 1000000000] # element range

# Default system values if not generating random systems
[lsm.system]
E = 8192 # size of a single entry in bits
s = 2e-7 # range query selectivity, 1 implies the full key range per query
B = 4 # number of physical entries per page
N = 1000000000 # total number of key-val pairs for LSM tree
H = 10 # total memory budget in bits per element
phi = 1 # read/write asymmetry coefficient, 1 implies w/r cost the same

# =============================================================================
# HEADER LCM
# Add configurations related to learned cost models
@@ -245,12 +234,8 @@ embedding_size = 8
hidden_length = 3
hidden_width = 32
decision_dim = 64

# Dropout percentage
dropout = 0.0

# Batch or Layer norm
norm_layer = "Batch"
dropout = 0.0 # dropout percentage
norm_layer = "Batch" # "Batch" or "Layer" norm

# Used only for classic models, generally smaller than embedding size
policy_embedding_size = 4
@@ -282,14 +267,9 @@ hard = true
[ltune.model]
hidden_length = 1
hidden_width = 64

# Dropout percentage
dropout = 0

# Batch or Layer norm
norm_layer = "Batch"

categorical_mode = "reinmax"
dropout = 0 # dropout percentage
norm_layer = "Batch" # batch or layer norm
categorical_mode = "reinmax" # reinmax or gumbel
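Both `categorical_mode` options produce differentiable samples from a categorical distribution (e.g. the discrete size-ratio decision). A minimal sketch of the Gumbel-Softmax path using PyTorch's built-in estimator; how ENDURE invokes it, or swaps in reinmax, is not shown in this diff:

```python
# Sketch: differentiable categorical sampling for discrete tuner outputs.
# Only the gumbel path is shown; "reinmax" would swap in the reinmax estimator.
import torch
import torch.nn.functional as F

logits = torch.randn(4, 30, requires_grad=True)         # e.g. a batch of logits over size ratios
sample = F.gumbel_softmax(logits, tau=1.0, hard=True)    # hard=True: one-hot forward pass,
                                                         # soft gradients on the backward pass
loss = sample.sum()
loss.backward()                                          # gradients flow back to the logits
```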

# =============================================================================
# END LTUNE
22 changes: 9 additions & 13 deletions endure/ltune/data/dataset.py
@@ -1,7 +1,6 @@
import glob
import numpy as np
import os
import pandas as pd
import pyarrow.parquet as pa
import torch
import torch.utils.data
@@ -13,33 +12,30 @@ class LTuneDataSet(torch.utils.data.IterableDataset):
def __init__(
self,
folder: str,
format: str = "parquet",
shuffle: bool = False,
) -> None:
self._format = format
self._fnames = glob.glob(os.path.join(folder, "*." + format))
self._fnames = glob.glob(os.path.join(folder, "*.parquet"))
self._shuffle = shuffle

def _get_input_cols(self):
return kINPUT_FEATS

def _load_data(self, fname):
if self._format == "parquet":
df = pa.read_table(fname).to_pandas()
else:
df = pd.read_csv(fname)
df = pa.read_table(fname).to_pandas()

return df

def __iter__(self):
worker_info = torch.utils.data.get_worker_info()
if worker_info is None:
files = self._fnames
else:
file_bins = np.array_split(self._fnames, worker_info.num_workers)
files = self._fnames
if self._shuffle:
np.random.shuffle(files)

if worker_info is not None:
file_bins = np.array_split(files, worker_info.num_workers)
files = file_bins[worker_info.id]
if self._shuffle:
np.random.shuffle(files)

for file in files:
df = self._load_data(file)
inputs = torch.from_numpy(df[self._get_input_cols()].values).float()
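A usage sketch for the simplified dataset, with folder and loader settings mirroring the `[job.LTuneTrain.train]` block in `endure.toml`; the batch structure is an assumption, since the end of `__iter__` is truncated above:

```python
# Usage sketch for the simplified LTuneDataSet.
from torch.utils.data import DataLoader

from endure.ltune.data.dataset import LTuneDataSet

dataset = LTuneDataSet(folder="train-data/ltune/std", shuffle=True)
loader = DataLoader(dataset, batch_size=2, num_workers=1, drop_last=True)

# With multiple workers, __iter__ splits the parquet file list across workers
# via np.array_split, so each worker reads its own bin of files.
for batch in loader:
    pass  # exact batch structure depends on what __iter__ yields (truncated above)
```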
20 changes: 9 additions & 11 deletions endure/ltune/data/generator.py
@@ -2,25 +2,23 @@

import numpy as np

from endure.lsm.types import System
from endure.lsm.types import LSMBounds, System
from endure.ltune.data.input_features import kSYSTEM_HEADER, kWORKLOAD_HEADER


class LTuneDataGenerator:
def __init__(
self,
page_sizes: List[int] = [4, 8, 16],
entry_sizes: List[int] = [1024, 2048, 4096, 8192],
memory_budget_range: Tuple[float, float] = (5.0, 20.0),
selectivity_range: Tuple[float, float] = (1e-7, 1e-9),
elements_range: Tuple[int, int] = (100000000, 1000000000),
bounds: LSMBounds,
precision: int = 3,
) -> None:
self.entry_sizes = entry_sizes
self.memory_budget_range = memory_budget_range
self.page_sizes = page_sizes
self.selectivity_range = selectivity_range
self.elements_range = elements_range
self.entry_sizes = bounds.entry_sizes
self.memory_budget_range = bounds.memory_budget_range
self.page_sizes = bounds.page_sizes
self.selectivity_range = bounds.selectivity_range
self.elements_range = bounds.elements_range

self.bounds = bounds
self.precision = precision

def _sample_workload(self, dimensions: int) -> list:
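With the constructor now taking a single `LSMBounds`, instantiation could look like the following, assuming `LSMBounds` can be built directly from the `[lsm.bounds]` keys in `endure.toml`:

```python
# Sketch: constructing the generator from the [lsm.bounds] table via LSMBounds.
# Assumes LSMBounds accepts the TOML keys directly as keyword arguments.
import toml

from endure.lsm.types import LSMBounds
from endure.ltune.data.generator import LTuneDataGenerator

cfg = toml.load("endure.toml")
bounds = LSMBounds(**cfg["lsm"]["bounds"])
generator = LTuneDataGenerator(bounds=bounds, precision=3)
```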
6 changes: 2 additions & 4 deletions endure/ltune/loss.py
@@ -6,7 +6,7 @@
import toml

from endure.lcm.model.builder import LearnedCostModelBuilder
from endure.lsm.types import STR_POLICY_DICT
from endure.lsm.types import Policy


class LearnedCostModelLoss(torch.nn.Module):
@@ -25,9 +25,7 @@ def __init__(self, config: dict[str, Any], model_path: str):
max_levels=lcm_cfg["lsm"]["max_levels"],
**lcm_cfg["lcm"]["model"],
)
lcm_model = STR_POLICY_DICT.get(lcm_cfg["lsm"]["design"], None)
if lcm_model is None:
raise TypeError(f"Illegal LCM model choice: {lcm_model=}")
lcm_model = getattr(Policy, lcm_cfg["lsm"]["design"])
self.model = self.lcm_builder.build_model(lcm_model)

data = torch.load(
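One behavioral note on the lookup change: the old `STR_POLICY_DICT.get(...)` path raised an explicit `TypeError` for an unknown design, while `getattr(Policy, ...)` raises `AttributeError` directly. A small sketch, assuming the `Policy` enum members match the design strings listed in `endure.toml`:

```python
# Sketch of the new design-string -> Policy lookup; Policy member names are
# assumed to match the design strings ("Tiering", "Leveling", "Classic",
# "QFixed", "YZHybrid", "KHybrid") used in endure.toml.
from endure.lsm.types import Policy

design = "KHybrid"
policy = getattr(Policy, design)   # Policy.KHybrid
# getattr(Policy, "NotADesign") raises AttributeError instead of the
# previous explicit TypeError.
```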