diff --git a/docs/flow_models.elephants.skl.rst b/docs/flow_models.elephants.skl.rst index 5d57fd3..46dd0e9 100644 --- a/docs/flow_models.elephants.skl.rst +++ b/docs/flow_models.elephants.skl.rst @@ -5,33 +5,28 @@ flow\_models.elephants.skl.plot\_classifiers module --------------------------------------------------- .. automodule:: flow_models.elephants.skl.plot_classifiers - :members: :show-inheritance: flow\_models.elephants.skl.plot\_regressors module -------------------------------------------------- .. automodule:: flow_models.elephants.skl.plot_regressors - :members: :show-inheritance: flow\_models.elephants.skl.train\_classifiers module ---------------------------------------------------- .. automodule:: flow_models.elephants.skl.train_classifiers - :members: :show-inheritance: flow\_models.elephants.skl.train\_regressors module --------------------------------------------------- .. automodule:: flow_models.elephants.skl.train_regressors - :members: :show-inheritance: flow\_models.elephants.skl.tune module -------------------------------------- .. 
automodule:: flow_models.elephants.skl.tune - :members: :show-inheritance: diff --git a/flow_models/elephants/plot_entropy.py b/flow_models/elephants/plot_entropy.py index c63aad4..b792f0d 100644 --- a/flow_models/elephants/plot_entropy.py +++ b/flow_models/elephants/plot_entropy.py @@ -32,7 +32,7 @@ def calculate_entropy(directory): Returns ------- - {"bytes": [], "bits": []} + dict entropy for subsequent bytes and bits of (sa, da, sp, dp, prot) fields """ diff --git a/flow_models/elephants/simulate_data.py b/flow_models/elephants/simulate_data.py index 25a8662..0f30165 100644 --- a/flow_models/elephants/simulate_data.py +++ b/flow_models/elephants/simulate_data.py @@ -44,8 +44,14 @@ def simulate_data(directory, index=Ellipsis, mask=None, pps=None, fps=None, time Returns ------- - int, int, np.array, np.array - flows_sum, octets_sum, flows_slots, octets_slots + flows_sum : int + sum of flows added to flow table + octets_sum : int + sum of octets transmitted by flows while being in flow table + flows_slots : np.array + number of flows present in flow table in each second + octets_slots : np.array + amount of octets transmitted by flows in flow table in each second """ d = pathlib.Path(directory) diff --git a/flow_models/elephants/skl/train_classifiers.py b/flow_models/elephants/skl/train_classifiers.py index 00eadeb..7a01548 100644 --- a/flow_models/elephants/skl/train_classifiers.py +++ b/flow_models/elephants/skl/train_classifiers.py @@ -32,17 +32,17 @@ class Data: def parser(): p = argparse.ArgumentParser(description=__doc__) - p.add_argument('-O', '--output', default='sklearn', help='output directory') + p.add_argument('-O', '--output', default='sklearn', help='results output directory') p.add_argument('--seed', type=int, default=None, help='seed') - p.add_argument('--fork', action='store_true', help='') - p.add_argument('--jobs', type=int, default=1, help='') - p.add_argument('files', help='directory') + p.add_argument('--fork', action='store_true', help='fork 
to subprocess for each simulation') + p.add_argument('--jobs', type=int, default=1, help='maximum number of simultaneous subprocesses') + p.add_argument('directory', help='binary flow records directory') return p def main(): app_args = parser().parse_args() jobs = set() - data = load_arrays(app_args.files) + data = load_arrays(app_args.directory) results = collections.defaultdict(lambda: collections.defaultdict(list)) decisions_true = collections.defaultdict(list) decisions_predicted = collections.defaultdict(list) @@ -73,8 +73,8 @@ def main(): for n, (train_index, test_index) in enumerate(sklearn.model_selection.KFold(data_par.get('folds', 5)).split(all_octets)): logmsg(f"Folding {n}") train_octets, test_octets = all_octets[train_index], all_octets[test_index] - train_flows_sum, train_octets_sum, train_flows_slots, train_octets_slots = simulate_data(app_args.files, index=train_index, mask=None, pps=PPS, fps=FPS, timeout=TIMEOUT) - test_flows_sum, test_octets_sum, test_flows_slots, test_octets_slots = simulate_data(app_args.files, index=test_index, mask=None, pps=PPS, fps=FPS, timeout=TIMEOUT) + train_flows_sum, train_octets_sum, train_flows_slots, train_octets_slots = simulate_data(app_args.directory, index=train_index, mask=None, pps=PPS, fps=FPS, timeout=TIMEOUT) + test_flows_sum, test_octets_sum, test_flows_slots, test_octets_slots = simulate_data(app_args.directory, index=test_index, mask=None, pps=PPS, fps=FPS, timeout=TIMEOUT) for prep_par in prep_params: logmsg(f"Preparing {prep_par} {n}") prepared_inp = prepare_input(all_inp, **prep_par) @@ -136,7 +136,7 @@ def main(): decision_predicted = clf.predict(inp) decisions_true[f'{name} ({mode})'].append(np.packbits(decision_true)) decisions_predicted[f'{name} ({mode})'].append(np.packbits(decision_predicted)) - this_flows_sum, this_octets_sum, this_flows_slots, this_octets_slots = simulate_data(app_args.files, index=index, mask=decision_predicted, pps=PPS, fps=FPS, timeout=TIMEOUT) + this_flows_sum, 
this_octets_sum, this_flows_slots, this_octets_slots = simulate_data(app_args.directory, index=index, mask=decision_predicted, pps=PPS, fps=FPS, timeout=TIMEOUT) c = itertools.count() results[f'{name} ({mode})'][next(c)].append(training_coverage) results[f'{name} ({mode})'][next(c)].append(octets[decision_predicted].sum() / octets.sum()) diff --git a/flow_models/elephants/skl/train_regressors.py b/flow_models/elephants/skl/train_regressors.py index a8a5ef0..33488d2 100644 --- a/flow_models/elephants/skl/train_regressors.py +++ b/flow_models/elephants/skl/train_regressors.py @@ -33,17 +33,17 @@ class Data: def parser(): p = argparse.ArgumentParser(description=__doc__) - p.add_argument('-O', '--output', default='sklearn', help='output directory') + p.add_argument('-O', '--output', default='sklearn', help='results output directory') p.add_argument('--seed', type=int, default=None, help='seed') - p.add_argument('--fork', action='store_true', help='') - p.add_argument('--jobs', type=int, default=1, help='') - p.add_argument('files', help='directory') + p.add_argument('--fork', action='store_true', help='fork to subprocess for each simulation') + p.add_argument('--jobs', type=int, default=1, help='maximum number of simultaneous subprocesses') + p.add_argument('directory', help='binary flow records directory') return p def main(): app_args = parser().parse_args() jobs = set() - data = load_arrays(app_args.files) + data = load_arrays(app_args.directory) results = collections.defaultdict(lambda: collections.defaultdict(list)) predictions = {} decisions_true = collections.defaultdict(list) @@ -74,8 +74,8 @@ def main(): for n, (train_index, test_index) in enumerate(sklearn.model_selection.KFold(data_par.get('folds', 5)).split(all_octets)): logmsg(f"Folding {n}") train_octets, test_octets = all_octets[train_index], all_octets[test_index] - train_flows_sum, train_octets_sum, train_flows_slots, train_octets_slots = simulate_data(app_args.files, index=train_index, mask=None, 
pps=PPS, fps=FPS, timeout=TIMEOUT) - test_flows_sum, test_octets_sum, test_flows_slots, test_octets_slots = simulate_data(app_args.files, index=test_index, mask=None, pps=PPS, fps=FPS, timeout=TIMEOUT) + train_flows_sum, train_octets_sum, train_flows_slots, train_octets_slots = simulate_data(app_args.directory, index=train_index, mask=None, pps=PPS, fps=FPS, timeout=TIMEOUT) + test_flows_sum, test_octets_sum, test_flows_slots, test_octets_slots = simulate_data(app_args.directory, index=test_index, mask=None, pps=PPS, fps=FPS, timeout=TIMEOUT) for prep_par in prep_params: logmsg(f"Preparing {prep_par} {n}") prepared_inp = prepare_input(all_inp, **prep_par) @@ -136,7 +136,7 @@ def main(): thresholds = np.logspace(0, 24, 64, base=2) * 64 for threshold in thresholds: decision_predicted = octets_predicted > threshold - sim_results.append(pool.apply_async(simulate_data, (app_args.files, index, decision_predicted, PPS, FPS, TIMEOUT))) + sim_results.append(pool.apply_async(simulate_data, (app_args.directory, index, decision_predicted, PPS, FPS, TIMEOUT))) for i, threshold in enumerate(thresholds): logmsg(f"Evaluating {name} mode: {mode} threshold: {threshold}") decision_true = octets > threshold