-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprocess_data.py
executable file
·105 lines (93 loc) · 4.65 KB
/
process_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
#!/usr/bin/python3
# Process data collected by getopengldata-native
# and generate a standard data format.
import re
import pandas as pd
from pathlib import Path
# Regex every data row has to match: the literal prefix 'DEBUG: ', a
# timestamp field followed by ':', then three groups of four unsigned
# integers, each group terminated by ';'.
# NOTE(review): the '.' inside the timestamp part is unescaped, so it matches
# any single character (a digit included), not only a decimal point. Confirm
# the real log format before tightening it to '\.'.
DATA_ROW_PATTERN = r'^DEBUG: [0-9]+.[0-9]+: [0-9]+ [0-9]+ [0-9]+ [0-9]+; [0-9]+ [0-9]+ [0-9]+ [0-9]+; [0-9]+ [0-9]+ [0-9]+ [0-9]+;$'
# Directory that the script entry point searches recursively for
# 'output_*.txt' input files.
BASE_DIR = Path('..') / 'misc_usenix_dataprocessing'
def is_valid_filetexts(input_textlines: list):
    '''
    Determine whether the lines read from one data file are valid.

    Every line except the last one is treated as a data row and must match
    DATA_ROW_PATTERN. The last line is not checked here.

    Args:
        input_textlines: list of text lines (as returned by readlines()).

    Returns:
        A 2-tuple (ok, detail):
        (True, ()) when all data rows match;
        (False, ('data_line', line)) for the first row that does not match;
        (False, ('empty_input', '')) when there are no lines at all.
    '''
    # Without any line there is nothing to validate; report it as invalid
    # instead of failing with an IndexError.
    if not input_textlines:
        return False, ('empty_input', '')
    for data_line in input_textlines[:-1]:
        if not re.match(DATA_ROW_PATTERN, data_line):
            return False, ('data_line', data_line)
    return True, tuple()
class ExperimentResult():
    '''
    Stores experiment result in two parts:
    (1) df_data: timestamp and 12 PC counter readings, one row per data line.
    (2) df_press: timestamp and the key pressed, one row per key press.

    All values are kept as the strings found in the input text.
    '''

    # Column names of the 12 counters, in the order the readings appear in a
    # data row (three ';'-separated groups of four values).
    COUNTER_NAMES = (
        'PERF_LRZ_VISIBLE_PRIM_AFTER_LRZ',
        'PERF_LRZ_FULL_8X8_TILES',
        'PERF_LRZ_PARTIAL_8X8_TILES',
        'PERF_LRZ_VISIBLE_PIXEL_AFTER_LRZ',
        'PERF_RAS_SUPERTILE_ACTIVE_CYCLES',
        'PERF_RAS_SUPER_TILES',
        'PERF_RAS_8X4_TILES',
        'PERF_RAS_FULLY_COVERED_8X4_TILES',
        'PERF_VPC_PC_PRIMITIVES',
        'PERF_VPC_SP_COMPONENTS',
        'PERF_VPC_LRZ_ASSIGN_PRIMITIVES',
        'PERF_VPC_SP_LM_COMPONENTS',
    )

    def __init__(self, data_file_path, data_file_textlines):
        '''
        Create a record from the lines of one data file.

        Args:
            data_file_path: path of the file the lines were read from.
            data_file_textlines: list of text lines; the last line holds the
                key presses, all lines before it are data rows.
        '''
        self.df_data = pd.DataFrame()
        self.df_press = pd.DataFrame()
        self.filepath = str()
        self.init_record(data_file_path, data_file_textlines)

    def init_record(self, data_file_path, data_file_textlines):
        '''
        Parse the text lines and append the result to df_press and df_data.

        Args:
            data_file_path: stored as self.filepath.
            data_file_textlines: see __init__.

        Raises:
            AssertionError: the key-press line has an odd number of tokens.
            IndexError / ValueError: a data row does not have the expected
                'prefix: timestamp: a b c d; e f g h; i j k l;' layout.
        '''
        self.filepath = data_file_path
        data_lines = data_file_textlines[:-1]
        last_line = data_file_textlines[-1]

        # Handle last line first: it carries the key presses.
        press_rows = self._parse_press_line(last_line)
        self.df_press = self._append_rows(self.df_press, press_rows)

        # Then, handle data lines.
        data_rows = [self._parse_data_line(line) for line in data_lines]
        self.df_data = self._append_rows(self.df_data, data_rows)

    @staticmethod
    def _append_rows(frame, rows):
        '''Return frame extended by rows (a list of dicts), built in one step.'''
        if not rows:
            return frame
        new_frame = pd.DataFrame(rows)
        # A frame without rows and columns contributes nothing to a concat.
        if frame.shape == (0, 0):
            return new_frame
        return pd.concat([frame, new_frame], axis=0, ignore_index=True)

    @staticmethod
    def _parse_press_line(last_line):
        '''Turn the key-press line into a list of timestamp/key row dicts.'''
        tokens = last_line.rstrip('\n').split(' ')
        # A space in front of the line end leaves one empty trailing token.
        if tokens and tokens[-1] == '':
            del tokens[-1]
        key_press_times = len(tokens) // 2
        # First half of the tokens are timestamps, second half the keys.
        assert key_press_times * 2 == len(tokens)
        return [
            {'timestamp': press_time, 'key': press_key}
            for press_time, press_key in zip(tokens[:key_press_times],
                                             tokens[key_press_times:])
        ]

    @classmethod
    def _parse_data_line(cls, data_line):
        '''Turn one data row into a dict with timestamp and 12 counters.'''
        fields = data_line.split(':')
        # Three counter groups plus whatever follows the final ';'.
        group1, group2, group3, _ = fields[2].split(';')
        readings = []
        for group in (group1, group2, group3):
            first, second, third, fourth = group.strip().split(' ')
            readings.extend((first, second, third, fourth))
        row = {'timestamp': str(fields[1].strip())}
        row.update(zip(cls.COUNTER_NAMES, readings))
        return row

    def save_as_csv(self, destdir):
        '''
        Save current dataset into CSV files.

        The digits between 'output_' and '.txt' in the file name are used as
        the name of a sub-directory of destdir and as prefix of the two files
        '<digits>_data.csv' and '<digits>_keys.csv' (space separated, no
        index column). Missing directories are created.

        Args:
            destdir: directory under which the sub-directory is created.

        Raises:
            IndexError: the file name does not contain 'output_'.
            AssertionError: the extracted part is not purely numeric.
        '''
        timestamp_str = Path(self.filepath).name.split('output_')[1].split('.txt')[0]
        assert re.match(r'^[0-9]+$', timestamp_str)
        target_dir_path = Path(destdir) / timestamp_str
        # parents=True: destdir itself may not exist yet on the first run.
        target_dir_path.mkdir(parents=True, exist_ok=True)
        data_csv_file_path = target_dir_path / '{}_data.csv'.format(timestamp_str)
        keys_csv_file_path = target_dir_path / '{}_keys.csv'.format(timestamp_str)
        self.df_data.to_csv(data_csv_file_path, sep=' ', index=False)
        self.df_press.to_csv(keys_csv_file_path, sep=' ', index=False)
if __name__ == '__main__':
    print('Hello world!')
    # Recursively collect every candidate input file below BASE_DIR.
    possible_data_files = list(BASE_DIR.rglob(r'output_*.txt'))
    print('INFO: Candidate files No.: {}'.format(len(possible_data_files)))
    # All converted datasets are written below this directory.
    output_root = Path('.') / 'pc_counter_dataset'
    for data_file in possible_data_files:
        with open(data_file, 'r') as handle:
            text_lines = handle.readlines()
        # A file without any line aborts the whole run.
        if not text_lines:
            raise Exception('data_file read failed!')
        is_valid, reason = is_valid_filetexts(text_lines)
        if is_valid is False:
            # Report the offending file and move on to the next one.
            print('WARN: file {} is not valid ({}), skipping ...'.format(
                data_file.name, str(reason)))
            continue
        # The text passed validation: parse it and write the CSV files.
        record = ExperimentResult(data_file, text_lines)
        record.save_as_csv(destdir=output_root)
        # One dot per converted file as a progress indicator.
        print('.', end='')