-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathmain.py
282 lines (262 loc) · 12.5 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
import sys
import getopt
import pandas as pd
import numpy as np
import utils.utils as ut
from loader.distance_mtr import DiffMatrix
from dominance.dominance_tools import RFDDiscovery
"""Main module used to execute the algorithm in the command line"""
def main(args):
"""
This method starts the rfd-discovery algorithm. It takes various command line parameters like a valid dataset's
path, the division on rhs and lhs needed and more. See the section Usage of the README for
more informations about the available parameters. If the user does not give a valid sequence of
parameters, the program will end and print on the standard output a message with the required
format to run the program.
:param args: list of parameters given as input
:type args: list
"""
c_sep, csv_file, has_header, semantic, has_dt, missing, index_col, human = extract_args(args)
try:
check_correctness(has_dt, hss, index_col)
except getopt.GetoptError as gex:
usage()
print(str(gex))
sys.exit(1)
except AssertionError as aex:
usage()
print(str(aex))
sys.exit(1)
if hss is None:
usage()
if isinstance(hss, list):
with ut.timeit_context("Whole time"):
with ut.timeit_context("Distance time"):
if isinstance(has_header, int) and not has_header:
diff_mtx = DiffMatrix(csv_file,
sep=c_sep,
index_col=index_col,
semantic=semantic,
missing=missing,
datetime=has_dt)
else:
diff_mtx = DiffMatrix(csv_file,
sep=c_sep,
first_col_header=has_header,
semantic=semantic,
index_col=index_col,
missing=missing,
datetime=has_dt)
for combination in hss:
comb_dist_mtx = diff_mtx.split_sides(combination)
with ut.timeit_context("RFD Discover time for {}".format(str(combination))):
nd = RFDDiscovery(comb_dist_mtx)
if human:
print(combination)
print_human(nd.get_rfds(nd.standard_algorithm, combination))
else:
print(nd.get_rfds(nd.standard_algorithm, combination))
def print_human(df: pd.DataFrame):
"""
Given a valid pandas data frame containing the found RFDs, it prints this RFDs on the standard output in a human readable
form using the following format:
attr1(<=threshold1),..., attrn(<=thresholdn) -> RHS(<=rhs_threshold)
:param df: data frame containing the RFDs
:type df: pandas.core.frame.DataFrame
"""
string = ""
for index, row in df.iterrows():
string += "".join(["" if np.isnan(row[i]) else "{}(<= {}), ".format(df.columns[i], round(row[i], ndigits=2))
for i in range(1, len(row))])
string += "-> {}(<= {})".format(df.columns[0], round(row[0], ndigits=2))
string += "\n"
print(string)
def extract_args(args):
"""
Given the list of command line parameters, it extracts the parameters given according to the format
described in the Usage section of the README.
If some parameter cannot be interpreted, then the function will raise an AssertionError.
If the path of the CSV is missing or is not valid, the program will print an error message and it will end.
With the help option, it will print on the standard output the help about the execution of this program.
:param args: list of command line argument given at the startup
:type args: list
:return: list of parameters extracted
:rtype: tuple
:raise: AssertionError
"""
try:
# Default values
c_sep, has_header, semantic, has_dt, missing, ic, human = '', 0, True, False, "?", False, False
csv_file = ''
lhs = []
rhs = []
opts, args = getopt.getopt(args, "c:r:l:s:hm:d:vi:", ["semantic", "help", "human"])
for opt, arg in opts:
if opt == '-v':
print("rdf-discovery version 0.0.1")
sys.exit(0)
if opt == '-c':
csv_file = arg
elif opt == '-r':
rhs = [int(arg)]
if len(rhs) > 1:
print("You can specify at most 1 RHS attribute")
sys.exit(-1)
elif opt == '-l':
lhs = [int(_) for _ in arg.split(',')]
elif opt == '-s':
c_sep = arg
elif opt == '-h':
has_header = 0
elif opt == '--semantic':
semantic = True
elif opt == '-m':
missing = arg
elif opt == '-d':
has_dt = [int(_) for _ in arg.split(',')]
elif opt == '-i':
ic = int(arg)
elif opt == '--human':
human = True
elif opt == '--help':
usage()
sys.exit(0)
else:
assert False, "unhandled option"
except getopt.GetoptError as getopt_err:
print(getopt_err)
usage()
sys.exit(2)
except TypeError as t_err:
print("Error while trying to convert a string to numeric: {}".format(str(t_err)))
sys.exit(-1)
except Exception as ex:
print("Error while trying to extract arguments: {}".format(str(ex)))
sys.exit(-1)
# understanding
try:
c_sep_, has_header_ = extract_sep_n_header(c_sep, csv_file, has_header)
if c_sep == '':
c_sep = c_sep_
if has_header is None:
has_header = has_header_
cols_count = ut.get_cols_count(csv_file, c_sep)
global hss
hss = extract_hss(cols_count, lhs, rhs)
except Exception as ex:
print("Error while trying to understand arguments: {}".format(str(ex)))
sys.exit(-1)
return c_sep, csv_file, has_header, semantic, has_dt, missing, ic, human
def extract_hss(cols_count, lhs, rhs):
# You cannot have len(rhs) > 1, don't check it
"""
Given the lhs and rhs from the command line parameters, and the column's number of the dataset,
it creates various combinations of rhs and lhs according to the format of this two parameters.
If the format of this two parameters is not accordant with the possible combination on rhs and lhs in the
command line arguments described by the README, the program will print an error message and it will end.
The program returns a list of dict, where each dict contains the indexes of the attributes on the lhs with the key
'lhs' and the index of the attribute on the rhs with the key 'rhs'.
:param cols_count: the column's number
:type cols_count: int
:param lhs: list of a valid columns' indexes containing the dataset's attributes positioned in the lhs
:type lhs: list
:param rhs: list of a valid column's index containing the dataset's attribute positioned in the rhs
:type rhs: list
:return: one or more combination of attribute in the rhs and lhs
:rtype: list
"""
if rhs == [] and lhs == []: # each combination case
hss = ut.get_hs_combination(cols_count)
elif rhs == [] and not lhs == []: # error case
print("You have to specify at least one RHS attribute")
sys.exit(-1)
elif not rhs == [] and lhs == []: # only rhs specified case
cols_index = list(range(cols_count))
if not rhs[0] in cols_index:
print("RHS index is out of bound. Specify a valid value")
sys.exit(-1)
hss = list()
hss.append({'rhs': rhs, 'lhs': cols_index[:rhs[0]] + cols_index[rhs[0] + 1:]})
else:
hss = list()
hss.append({'rhs': rhs, 'lhs': lhs})
return hss
def extract_sep_n_header(c_sep, csv_file, has_header):
"""
Given a correct path to a CSV file containing the dataset, the separator and the presence or not of the header
given by the command line arguments, this function will try to infer the separator and/or the presence
of the header in the dataset if they was not specified in the command line arguments.
:param c_sep: the separator extracted from the command line argument
:type c_sep: str
:param csv_file: a correct path to a CSV file containing a valid dataset
:type csv_file: str
:param has_header: indicate the presence or not of a column header in the CSV
:type has_header: int
:return: the separator used in the CSV and the value 0 if the CSV has an header, None otherwise
:rtype: tuple
"""
if c_sep == '' and has_header is None:
c_sep, has_header = ut.check_sep_n_header(csv_file)
elif c_sep != '' and has_header is None:
has_header = ut.check_sep_n_header(csv_file)[1]
elif c_sep == '' and has_header is not None:
c_sep = ut.check_sep_n_header(csv_file)[0]
return c_sep, has_header
def check_correctness(has_dt, hss, index_col):
"""
Verify the correctness of the columns' indexes given for the division in
rhs and lhs and of the index indicating the key's column.
If some index is out of bound, the function will raise an getopt.GetoptError.
If there are some indexes repeated, the program will raise an AssertionError.
:param has_dt: value containing indexes of columns containing date, False otherwise
:type has_dt: list or bool
:param hss: dict containing the division in rhs and lhs
:type hss: dict
:param index_col: index of the column containing the dataset's primary key
:type index_col: int
:raises getopt.GetoptError, AssertionError
"""
max_index = max(hss[0]['rhs'] + hss[0]['lhs'])
unique_index_count = sum([1 for i in set(hss[0]['rhs'] + hss[0]['lhs'])])
# check has header
if index_col:
if index_col < 0:
raise AssertionError("Index of a column cannot be less than 0")
if index_col > max_index and len(hss[0]['lhs']) >= 1:
raise getopt.GetoptError("Index col is out of bound")
# check date
if has_dt is not False:
if max(has_dt) > max_index and len(hss[0]['lhs']) >= 1:
raise getopt.GetoptError("Datetime index is out of bound")
# check non repeated index
index_count = sum([1 for i in hss[0]['rhs'] + hss[0]['lhs']])
if index_count != unique_index_count:
raise AssertionError("Repeated index error")
def usage():
"""
Print the usage message, describing the correct format for a correct program execution. For each
argument in the command line arguments, the message will show a short description of it.
"""
usage_str = """
python {} -c <csv-file> [options]
-c <your-csv>: is the path of the dataset on which you want to discover RFDs;
Options:
-v : display version number;
-s <sep>: the separation char used in your CSV file. If you don't provide this, rfd-discovery tries to infer it for you;
-h: Indicate that CSV file has the header row. If you don't provide this, rdf-discovery tries to infer it for you.
-r <rhs_index>: is the column number of the RHS attribute. It must be a valid integer.
You can avoid to specify it only if you don't specify LHS attributes (we'll find RFDs using each attribute as RHS and the remaining as LHS);
-l <lhs_index_1, lhs_index_2, ...,lhs_index_k>: column index of LHS' attributes indexes separated by commas (e.g. 1,2,3). You can avoid to specify them:
if you don't specify RHS' attribute index we'll find RFDs using each attribute as RHS and the remaining as LHS;
if you specify a valid RHS index we'll assume your LHS as the remaining attributes;
-i <index_col>: the column which contains the primary key of the dataset.
Specifying it, this will not calculate as distance. NOTE: index column should contains unique values;
-d <datetime columns>: a list of columns, separated by commas, which values are in datetime format.
Specifying this, rfd-discovery can depict distance between two date in time format (e.g. ms, sec, min);
--semantic: use semantic distance on Wordnet for string; For more info here;
--human: print the RFDs to the standard output in a human readable form;
--help: show help. \n
"""
print(usage_str.format(sys.argv[0]))
if __name__ == "__main__":
main(sys.argv[1:])