forked from IBM/Multi-GNN
-
Notifications
You must be signed in to change notification settings - Fork 0
/
format_kaggle_files.py
79 lines (58 loc) · 2.08 KB
/
format_kaggle_files.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import numpy as np
import datatable as dt
from datetime import datetime
from datatable import f,join,sort
import sys
import os
# Command-line handling: the raw transactions CSV is expected as the first
# argument.
n = len(sys.argv)
if n == 1:
    print("No input path")
    # Exit with a non-zero status so a calling shell or pipeline can tell
    # that nothing was converted.
    sys.exit(1)

inPath = sys.argv[1]
# os.path.join keeps the output next to the input even when inPath has no
# directory component; plain concatenation with "/" would then point at the
# filesystem root.
outPath = os.path.join(os.path.dirname(inPath), "formatted_transactions.csv")

# Every column is read as a string; numeric fields are converted explicitly
# where they are used.
raw = dt.fread(inPath, columns = dt.str32)

# Lookup tables mapping each distinct string value to a dense integer ID
# (assigned in order of first appearance by get_dict_val).
currency = dict()
paymentFormat = dict()
bankAcc = dict()
account = dict()
def get_dict_val(name, collection):
    """Return the integer ID of *name*, registering it on first sight.

    Args:
        name: Hashable key to look up.
        collection: Dict mapping keys to IDs; it is mutated in place when
            *name* is not present yet.

    Returns:
        The ID already stored for *name*, or, for a new key, the size of
        *collection* before insertion (so IDs are 0, 1, 2, ... in order of
        first appearance as long as entries are only added through this
        function).
    """
    # len(collection) is evaluated before the insertion, so a new key gets the
    # next unused index; an existing key keeps its stored value.
    return collection.setdefault(name, len(collection))
# Header of the formatted edge list; one data line is written per input row.
header = (
    "EdgeID,from_id,to_id,Timestamp,"
    "Amount Sent,Sent Currency,Amount Received,Received Currency,"
    "Payment Format,Is Laundering\n"
)

# Midnight of the day of the first row; set on the first loop iteration.
# All written timestamps are relative to this instant.
startTime = None

with open(outPath, 'w') as writer:
    writer.write(header)
    for i in range(raw.nrows):
        datetime_object = datetime.strptime(raw[i, "Timestamp"], '%Y/%m/%d %H:%M')
        if startTime is None:
            startTime = datetime(
                datetime_object.year, datetime_object.month, datetime_object.day
            )

        # Seconds elapsed since 10 s before midnight of the first row's day.
        # Subtracting the naive datetimes directly is pure wall-clock
        # arithmetic, so the result does not depend on the local timezone or
        # DST rules of the machine running the script (which
        # datetime.timestamp() on naive values would).
        ts = (datetime_object - startTime).total_seconds() + 10

        # The order of these lookups determines the IDs that get assigned, so
        # it is kept as: receiving currency, payment currency, payment format,
        # sender account, receiver account.
        cur1 = get_dict_val(raw[i, "Receiving Currency"], currency)
        cur2 = get_dict_val(raw[i, "Payment Currency"], currency)
        fmt = get_dict_val(raw[i, "Payment Format"], paymentFormat)

        # Columns 2 and 4 are addressed by position; presumably they are the
        # account columns that follow "From Bank" / "To Bank" -- confirm
        # against the input schema. The separator keeps distinct
        # (bank, account) pairs from collapsing into the same key
        # (e.g. "1" + "23" vs "12" + "3").
        fromAccIdStr = raw[i, "From Bank"] + "_" + raw[i, 2]
        fromId = get_dict_val(fromAccIdStr, account)
        toAccIdStr = raw[i, "To Bank"] + "_" + raw[i, 4]
        toId = get_dict_val(toAccIdStr, account)

        amountReceivedOrig = float(raw[i, "Amount Received"])
        amountPaidOrig = float(raw[i, "Amount Paid"])
        isl = int(raw[i, "Is Laundering"])

        # Field order matches the header: the paid amount/currency go in the
        # "Sent" columns, followed by the received amount/currency.
        line = '%d,%d,%d,%d,%f,%d,%f,%d,%d,%d\n' % \
            (i, fromId, toId, ts, amountPaidOrig, cur2, amountReceivedOrig, cur1, fmt, isl)
        writer.write(line)

# Re-read the file just written, order the rows by column index 3 (the
# Timestamp column of the header above) and overwrite the file in place.
formatted = dt.fread(outPath)
formatted = formatted[:, :, sort(3)]
formatted.to_csv(outPath)