clean_consolidate.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Jul 9 11:27:48 2019

@author: lavanyasingh
"""

import csv

# I used this script to consolidate the sources into one CSV file from all the
# little ones I had. Those CSVs are no longer in this directory, as they have
# already been consolidated. I'm leaving this script in for reference.
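#
# Each output row has 13 columns. The schema is not documented here; judging
# from the writers below (an inference, not confirmed by the original author),
# column 0 holds the country, column 1 the URL, column 3 the language, and
# column 7 the metasource tag.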


class Consolidator:

    def __init__(self, outfile='data/all_raw.csv'):
        self.outfile = outfile

    # get metasource from path name
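    # (e.g. get_meta('data/wikinews.csv') returns 'wikinews')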
    def get_meta(self, path):
        return path.split('.')[0].replace('data/', '')

    # the US news data was structured differently from the rest of it
    def us_news(self):
        with open('data/us_news.csv', 'r') as inf:
            reader = csv.reader(inf, delimiter=',')
            next(reader)
            with open(self.outfile, 'a+') as outf:
                w = csv.writer(outf, delimiter=',', quotechar='"',
                               quoting=csv.QUOTE_MINIMAL)
                for line in reader:
                    w.writerow(['United States', line[2], line[1], 'English',
                                line[3], line[1], '', 'original', line[0], '',
                                '', '', ''])

    # USNPL data was also structured differently
    def usnpl(self):
        with open('data/usnpl_wiki_list.csv', 'r') as inf:
            reader = csv.reader(inf, delimiter=',')
            next(reader)
            with open(self.outfile, 'a+') as outf:
                w = csv.writer(outf, delimiter=',', quotechar='"',
                               quoting=csv.QUOTE_MINIMAL)
                for line in reader:
                    w.writerow(['United States', line[3], line[2], 'English',
                                'Newspaper', line[2], '', 'usnpl', line[0],
                                line[1], line[4], line[6], line[7]])

    # LION data was also structured differently
    def lion(self):
        with open('data/lion.csv', 'r') as inf:
            reader = csv.reader(inf, delimiter=',')
            next(reader)
            with open(self.outfile, 'a+') as outf:
                w = csv.writer(outf, delimiter=',', quotechar='"',
                               quoting=csv.QUOTE_MINIMAL)
                for line in reader:
                    w.writerow(['United States', line[1], line[0], 'English',
                                '', line[0], '', 'lion', line[5], line[4], '',
                                '', ''])

    # reads in data that has been CSV-formatted (that I've cleaned before)
    def formatted(self, path):
        with open(path, 'r') as inf:
            reader = csv.reader(inf, delimiter=',')
            next(reader)
            with open(self.outfile, 'a+') as outf:
                w = csv.writer(outf, delimiter=',', quotechar='"',
                               quoting=csv.QUOTE_MINIMAL)
                for line in reader:
                    # pad to the 13-column schema (input rows are assumed to
                    # have 4 columns), then tag the row with its metasource
                    row = line + ['' for n in range(9)]
                    row[7] = self.get_meta(path)
                    w.writerow(row)
        print("DONE WITH ", path)

    # reads in data from a text file of URLs (one per line)
    def txt(self, path):
        with open(path, 'r') as inf:
            with open(self.outfile, 'a+') as outf:
                w = csv.writer(outf, delimiter=',', quotechar='"',
                               quoting=csv.QUOTE_MINIMAL)
                for line in inf:
                    # build a 13-column row to match the other writers,
                    # stripping the trailing newline from the raw line
                    row = ['' for n in range(13)]
                    row[1] = line.strip()
                    row[7] = self.get_meta(path)
                    w.writerow(row)
        print("DONE WITH ", path)

    def dmoz(self):
        with open("data/dmoz.txt", "r") as inf, open(self.outfile, 'a+') as outf:
            reader = csv.reader(inf, delimiter='\t')
            w = csv.writer(outf, delimiter=',', quotechar='"',
                           quoting=csv.QUOTE_MINIMAL)
            for line in reader:
                row = ["" for i in range(13)]
                row[1] = line[1]
                row[7] = "DMOZ"
                w.writerow(row)

    def common_crawl(self):
        with open("data/common_crawl.txt", "r") as inf, open(self.outfile, 'a+') as outf:
            reader = csv.reader(inf, delimiter='\t')
            w = csv.writer(outf, delimiter=',', quotechar='"',
                           quoting=csv.QUOTE_MINIMAL)
            for line in reader:
                row = ["" for i in range(13)]
                row[1] = line[1]
                row[7] = "Common Crawl"
                w.writerow(row)

    def main(self):
        self.us_news()
        self.usnpl()
        self.lion()
        self.formatted('data/wikinews.csv')
        self.formatted('data/wikidata.csv')
        self.txt('data/topnews')
        self.formatted('data/newsgrabber.csv')
        self.txt('data/newscrawls')
        self.formatted('data/mediacloud.csv')
        self.formatted('data/inkdrop.csv')
        self.txt('data/gdelt')
        self.formatted('data/datastreamer.csv')


if __name__ == '__main__':
    consolidator = Consolidator()
    consolidator.common_crawl()
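
# Note that the entry point above only runs common_crawl(). A minimal sketch
# of a full consolidation pass (assuming the source files under data/ still
# existed, which they no longer do in this repo) would be:
#
#     consolidator = Consolidator()
#     consolidator.main()          # all CSV and plain-text sources
#     consolidator.dmoz()          # DMOZ and Common Crawl are not in main()
#     consolidator.common_crawl()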