-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnytimes_to_csv.py
119 lines (84 loc) · 3.7 KB
/
nytimes_to_csv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
# coding: utf-8
# #CSV builder for The New York Times
# In[1]:
import math
from blaze import Data, DataFrame
from nltk.tokenize import sent_tokenize
from pymongo import MongoClient
from slugify import slugify
# In[2]:
client = MongoClient()
db = client.nytimes3
# In[3]:
total = db.articles.count()
percent = math.ceil(total / 100)
count = 0
print('Total documents:', total)
print()
total_rows_df = DataFrame()
try:
for doc in db.articles.find():
rows_df = DataFrame()
try:
# Texts
common_texts = []
if doc.get('abstract'):
common_texts.append(doc['abstract'])
if doc.get('headline') and isinstance(doc['headline'], dict) and doc['headline'].get('main'):
common_texts.append(doc['headline']['main'])
if doc.get('lead_paragraph'):
common_texts.append(doc['lead_paragraph'])
# add snippet as variable field
# Fix fields
article_id = doc['_id']
pub_date = doc['pub_date']
section_name = doc['section_name']
web_url = doc['web_url']
# Variable fields
for term in doc['q_info']:
texts = list(common_texts)
q_info = doc['q_info'][term]
term_category = q_info[0]['term_category']
search_terms = []
for info in q_info:
search_terms.append(info['q']['term'])
if info['snippet']:
texts.append(info['snippet'])
for text in texts:
sentences = sent_tokenize(text)
for sentence in sentences:
if any([slugify(search_term) in slugify(sentence) for search_term in search_terms]):
term_aux = term.replace('_', '.')
sentence_aux = sentence.replace('<strong>', '').replace('</strong>', '').replace(',', '')
if article_id and pub_date and section_name and web_url and term_category and term_aux and sentence_aux:
row_df = DataFrame(
[[article_id, pub_date, section_name, web_url, term_category, term_aux, sentence_aux]],
columns=['article_id', 'pub_date', 'section_name', 'web_url', 'term_category', 'term', 'sentence']
)
rows_df = rows_df.append(row_df)
total_rows_df = total_rows_df.append(rows_df)
if count % percent == 0:
percentage = count // percent
print('{} out of {} processed.'.format(count, total))
print('{}% completed.'.format(percentage))
print()
total_rows_df.to_csv('total_rows_{}.csv'.format(percentage), index=False)
count +=1
except Exception as e:
print('Error:', e)
print('Document:', doc)
print('{} out of {} processed.'.format(total, total))
print('100% completed.')
total_rows_df.to_csv('total_rows_100.csv', index=False)
except Exception as e:
print('Error:', e)
total_rows_df.to_csv('total_rows.csv', index=False)
# In[4]:
# total_rows_df['term'] = total_rows_df['term'].apply(lambda x: x.replace('_', '.'))
# total_rows_df['sentence'] = total_rows_df['sentence'].apply(lambda x: x.replace('<strong>', '').replace('</strong>', ''))
# total_rows_df['sentence'] = total_rows_df['sentence'].apply(lambda x: x.replace(',', '')) # csv delimiter problems
# In[5]:
# total_rows_df.to_csv('total_rows.csv', index=False)
# In[6]:
print(total_rows_df['term'].value_counts())
# In[ ]: