clean_all.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 10 14:36:04 2019

@author: lavanyasingh
"""

import urllib.parse
import csv

import helpers
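
# helpers is a project-local module; from its use below it is assumed to
# provide truncate(url), which shortens a raw URL to its cleaned form, and
# is_bad(value), which flags a broken metadata field.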


class Truncator:

    def __init__(self, outfile="data/all_raw_cleaned2.csv"):
        self.outfile = outfile

    def clean_meta(self, meta):
        # cleans a metasource name: lowercase, no spaces
        return meta.lower().replace(" ", "").strip()
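
    # Example (illustrative): clean_meta(" Google News ") -> "googlenews"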

    def prep_url(self, url):
        # strip the scheme and www. from a given URL, then re-add a uniform prefix
        url = (url.replace("http://", "").replace("https://", "")
               .replace("www.", ""))
        return "http://www." + url

    def url_is_good(self, url):
        # garbage URLs as determined by spot-checking
        return (url != "url" and url != "" and url != "www." and
                url != 'www.source url' and url != 'www.www.source url')
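
    # Examples (illustrative):
    #   url_is_good("nytimes.com") -> True
    #   url_is_good("www.")        -> False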

    def path_is_good(self, path):
        # we want to exclude generic paths (static assets, boilerplate files)
        bad = ['/robots.txt', 'html', '/css', '/js', '/favicon.ico', '/img',
               '.json', '.js', ".png", ".jpg"]
        for el in bad:
            if path.find(el) != -1:
                return False
        # we want to exclude empty paths
        if path == '/' or path == "" or path == "/\n":
            return False
        # we want to exclude long paths (more than two segments)
        if len(path.split("/")) > 3:
            return False
        # we want to exclude paths that are purely numerical
        try:
            int(path.strip().replace("/", ""))
            return False
        except ValueError:
            return True
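
    # Examples (illustrative):
    #   path_is_good("/money")       -> True
    #   path_is_good("/favicon.ico") -> False  (generic asset path)
    #   path_is_good("/2019/07/10")  -> False  (too many segments)
    #   path_is_good("/12345")       -> False  (purely numerical)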

    # outputs a dict of url: CSV row
    def make_all_data(self, infile):
        rows = {}
        with open('data/' + infile, 'r') as inf:
            reader = csv.reader(inf, delimiter=',')
            for line in reader:
                # clean URL
                url = helpers.truncate(line[1])
                line[1] = url
                # rough progress indicator
                if len(rows) % 5000 == 0:
                    print(url)
                # get path ('/money' or '/index', for example)
                o = urllib.parse.urlparse(self.prep_url(line[1]))
                path = o.path
                # add metasource
                metasource = self.clean_meta(line[7])
                line[7] = [metasource]
                # normalize various spellings of USA
                if (line[0] == "United States" or line[0].lower() == "us"
                        or line[0].lower() == "usa"):
                    line[0] = "United States of America"
                # extend row; the last column collects paths as a list
                line += ['', []]
                # if unique url, add the row (and its path, if usable)
                if url not in rows:
                    if self.url_is_good(url):
                        if self.path_is_good(path):
                            line[13] = [path]
                        rows.update({url: line})
                else:
                    # add metasource if necessary
                    if metasource not in rows[url][7]:
                        rows[url][7].append(metasource)
                    # update any broken metadata to the new value
                    for i in range(len(rows[url]) - 1):
                        if helpers.is_bad(rows[url][i]):
                            try:
                                rows[url][i] = line[i]
                            except IndexError:
                                pass
                    # add path if good
                    if self.path_is_good(path):
                        rows[url][13].append(path)
        return rows
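
    # Note on the assumed input layout (inferred from the indexing above, not
    # stated in the original): column 0 appears to hold the country, column 1
    # the URL, and column 7 the metasource; the two columns appended in
    # make_all_data leave paths collected in column 13.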

    def write_all_data(self, rows):
        # join the metasource and path lists into strings and write each row out
        with open(self.outfile, 'w') as outf:
            w = csv.writer(outf, delimiter=',', quotechar='"',
                           quoting=csv.QUOTE_MINIMAL)
            for row in rows.values():
                metasources = ""
                for item in row[7]:
                    metasources += item + " "
                paths = ""
                for item in row[13]:
                    paths += item + " "
                row[7] = metasources
                row[13] = paths
                w.writerow(row)
        print("DONE WRITING")


if __name__ == "__main__":
    truncator = Truncator()
    rows = truncator.make_all_data("all_raw.csv")
    truncator.write_all_data(rows)
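
# Running this file directly (e.g. `python3 clean_all.py`) reads
# data/all_raw.csv, merges duplicate URLs, and writes the cleaned rows to
# data/all_raw_cleaned2.csv (the default outfile above).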