1
- from typing import Any , Union
2
- import logging
1
+ from typing import List , Tuple , Union
3
2
4
3
import numpy as np
5
4
6
- from endure .lsm .types import LSMDesign , System , Policy
5
+ from endure .lsm .types import System
6
+ from endure .ltune .data .input_features import kSYSTEM_HEADER , kWORKLOAD_HEADER
7
7
8
8
9
- class LTuneGenerator :
9
+ class LTuneDataGenerator :
10
10
def __init__(
    self,
    page_sizes: List[int] = [4, 8, 16],
    entry_sizes: List[int] = [1024, 2048, 4096, 8192],
    memory_budget_range: Tuple[float, float] = (5.0, 20.0),
    selectivity_range: Tuple[float, float] = (1e-7, 1e-9),
    elements_range: Tuple[int, int] = (100000000, 1000000000),
    precision: int = 3,
) -> None:
    """Store the value pools and ranges the sampling helpers draw from.

    Args:
        page_sizes: Candidate page sizes in KB; one is picked at random
            when computing entries per page.
        entry_sizes: Candidate entry sizes; one is picked at random.
            NOTE(review): presumably bits, since entries-per-page divides
            a page size expressed in bits by this value -- confirm.
        memory_budget_range: Two bounds between which the memory budget
            is drawn uniformly.
        selectivity_range: Two bounds between which the selectivity is
            drawn uniformly. The default lists the larger bound first;
            the sampler interpolates linearly, so either order works.
        elements_range: ``(low, high)`` passed to ``np.random.randint``;
            ``high`` is exclusive and must be greater than ``low``.
        precision: Stored as ``self.precision``; its use is outside this
            method.
    """
    # Copy the list arguments: the defaults are single list objects shared
    # by every call, so storing them directly would let a mutation on one
    # instance leak into all later instances (and into a caller's list).
    self.entry_sizes = list(entry_sizes)
    self.memory_budget_range = memory_budget_range
    self.page_sizes = list(page_sizes)
    self.selectivity_range = selectivity_range
    self.elements_range = elements_range
    self.precision = precision
21
25
22
26
def _sample_workload (self , dimensions : int ) -> list :
@@ -31,25 +35,25 @@ def _sample_workload(self, dimensions: int) -> list:
31
35
# TODO: Will want to configure environment to simulate larger ranges over
32
36
# potential system values
33
37
def _sample_entry_per_page(self, entry_size: int = 8192) -> int:
    """Pick the entries-per-page value for one randomly chosen page size.

    Every page size in ``self.page_sizes`` (KB) is converted to bits and
    divided by ``entry_size``; one of the resulting quotients is drawn
    uniformly at random.

    Args:
        entry_size: Size of a single entry, in the same unit as the
            converted page size (bits).

    Returns:
        One element of the quotient array. The division is a true
        division, so the value comes back as a NumPy float even though
        the annotation says ``int``.
    """
    # Page sizes are configured in KB (the constructor defaults are
    # 4KB, 8KB and 16KB), so scale them to bits before dividing.
    KB_TO_BITS = 8 * 1024
    page_sizes = np.array(self.page_sizes)
    entries_per_page = (page_sizes * KB_TO_BITS) / entry_size
    return np.random.choice(entries_per_page)
38
43
39
44
def _sample_selectivity(self) -> float:
    """Draw a selectivity uniformly between the bounds of ``self.selectivity_range``.

    The two bounds are interpolated linearly with a uniform random factor
    in ``[0, 1)``, so the result lies between them regardless of which
    bound is listed first.
    """
    low, high = self.selectivity_range
    return (high - low) * np.random.rand() + low
42
47
43
48
def _sample_entry_size(self) -> int:
    """Return one entry size chosen uniformly at random from ``self.entry_sizes``."""
    return np.random.choice(self.entry_sizes)
46
50
47
51
def _sample_memory_budget(self) -> float:
    """Draw a memory budget uniformly between the bounds of ``self.memory_budget_range``.

    The lower bound is attainable; the upper bound is not, because the
    random factor is drawn from ``[0, 1)``.
    """
    low, high = self.memory_budget_range
    return (high - low) * np.random.rand() + low
50
54
51
55
def _sample_total_elements(self) -> int:
    """Draw a total element count from ``self.elements_range``.

    The range is handed to ``np.random.randint`` as ``(low, high)``, so
    ``low`` is inclusive, ``high`` is exclusive, and ``high`` must be
    strictly greater than ``low``.
    """
    low, high = self.elements_range
    return np.random.randint(low=low, high=high)
54
58
55
59
def _sample_system (self ) -> System :
@@ -63,10 +67,10 @@ def _sample_system(self) -> System:
63
67
return system
64
68
65
69
def _gen_system_header(self) -> list:
    """Return the column names of the system features as a new list.

    The names come from the shared ``kSYSTEM_HEADER`` constant. A copy is
    returned so that a caller mutating the result cannot alter the
    constant for everyone else.
    """
    return list(kSYSTEM_HEADER)
67
71
68
72
def _gen_workload_header(self) -> list:
    """Return the column names of the workload features as a new list.

    The names come from the shared ``kWORKLOAD_HEADER`` constant. A copy
    is returned so that a caller mutating the result cannot alter the
    constant for everyone else.
    """
    return list(kWORKLOAD_HEADER)
70
74
71
75
def generate_header(self) -> list:
    """Return the full row header: workload columns followed by system columns."""
    return self._gen_workload_header() + self._gen_system_header()
@@ -89,22 +93,11 @@ def generate_row_csv(self) -> list:
89
93
90
94
return line
91
95
92
def generate_row(self) -> dict[str, Union[int, float]]:
    """Generate one sampled row as a mapping from column name to value.

    The column names come from ``generate_header`` and the values from
    ``generate_row_csv``; the two are paired positionally. If they differ
    in length, the surplus items of the longer one are ignored.

    Returns:
        A dict keyed by header name holding the corresponding row value.
    """
    header = self.generate_header()
    row = self.generate_row_csv()
    return dict(zip(header, row))
0 commit comments