-
Notifications
You must be signed in to change notification settings - Fork 2
/
bytes_parser.py
41 lines (37 loc) · 1 KB
/
bytes_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
#!/usr/bin/env python3
import sys
import os
import re
import pandas as pd
from multiprocessing import Pool
# Matches consecutive chunks of up to two characters; used to re-space an
# n-gram after its tokens have been concatenated.
_PAIR_RE = re.compile(r'.{1,2}')


def cruncher(file, n=None, min_count=200):
    """Count token n-grams in one text file and keep the frequent ones.

    Each line that does not contain a "?" has its first 8 characters dropped
    (presumably an address column -- confirm against the input format), is
    stripped, and is split on single spaces. Every window of ``n``
    consecutive tokens on a line is concatenated, re-split into chunks of up
    to two characters and joined with spaces to form the n-gram key.
    N-grams never span two lines.

    Args:
        file: Path of the file to read (opened in text mode).
        n: Number of tokens per n-gram. When ``None`` (the default, and what
            ``Pool.map`` passes), the module-level ``data`` value is used.
        min_count: Minimum number of occurrences an n-gram needs in order to
            be reported.

    Returns:
        A dict with the key ``'file'`` mapped to ``file`` plus one entry per
        n-gram whose count reached ``min_count``, mapped to its final count.

    Raises:
        ValueError: If the effective n-gram size is smaller than 1.
    """
    if n is None:
        # Fall back to the n-gram size configured at module level.
        n = data
    if n < 1:
        raise ValueError(f"n-gram size must be >= 1, got {n}")

    counts = {}
    result = {'file': file}
    with open(file, 'r') as f:
        for line in f:
            if "?" in line:
                continue
            tokens = line[8:].replace("\n", "").strip().split(" ")
            for i in range(len(tokens) - n + 1):
                joined = ''.join(tokens[i:i + n])
                gram = ' '.join(_PAIR_RE.findall(joined))
                count = counts.get(gram, 0) + 1
                counts[gram] = count
                # Insert while scanning (not in a final filter pass) so the
                # keys keep the order in which n-grams reached the threshold;
                # callers that build a table from the dicts see that order.
                if count >= min_count:
                    result[gram] = count
    return result
# N-gram size (first command-line argument). Kept at module level, outside
# the __main__ guard, so that cruncher() can read it in worker processes even
# when they re-import this module instead of inheriting its globals.
data = int(sys.argv[1])


def main():
    """Count n-grams in every ``.bytes`` file and write them to a CSV.

    Reads the files found in ``Dataset/bytes`` under the current working
    directory, runs ``cruncher`` on each of them in a process pool whose size
    is the second command-line argument, prints the resulting table and
    saves it as ``results/ngram_b<data>.csv``.
    """
    o_path = os.getcwd()
    os.chdir(os.path.join(o_path, "Dataset", "bytes"))
    try:
        filelist = [i for i in os.listdir() if ".bytes" in i]
        with Pool(int(sys.argv[2])) as p:
            df = p.map(cruncher, filelist)
    finally:
        # Always return to the starting directory, even if a worker failed,
        # so the relative "results" path below resolves as intended.
        os.chdir(o_path)

    if not df:
        # Without any rows there is no 'file' column to index on.
        sys.exit("no .bytes files found in " + os.path.join("Dataset", "bytes"))

    frame = pd.DataFrame(df)
    frame = frame.set_index('file')
    pd.options.display.float_format = '{:.0f}'.format
    print(frame, "\n")
    # Make sure the output directory exists before writing into it.
    os.makedirs("results", exist_ok=True)
    frame.to_csv(os.path.join("results",f'ngram_b{data}.csv'), sep = ',', encoding = 'utf-8', index = True, float_format="%.0f")


# Creating a Pool at import time breaks start methods that re-import the main
# module in each worker, so the driver only runs when executed as a script.
if __name__ == "__main__":
    main()