-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathexport.py
127 lines (102 loc) · 3.73 KB
/
export.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
"""
Generates the coder table independently of the dashboard.
"""
from database import db_session
from models import ArticleMetadata, User
import config
from sqlalchemy.exc import OperationalError
from sqlalchemy import func, desc, distinct, or_
from sqlalchemy.sql import select, func
from sqlalchemy.schema import Table
import pandas as pd
import datetime as dt
import os
## single C-level pass replacing the three whitespace-control chars with spaces
_WS_TABLE = str.maketrans({"\n": " ", "\r": " ", "\t": " "})


def validate(x):
    """Sanitize a field value so it cannot break a CSV row.

    Args:
        x: any field value pulled from a model row.

    Returns:
        For a truthy string: the string with newlines, carriage returns,
        and tabs replaced by single spaces. For any other truthy value:
        ``x`` unchanged. For any falsy value (``None``, ``0``, ``""``):
        the literal string ``"0"`` (export convention for missing data).
    """
    if not x:
        return "0"
    if isinstance(x, str):
        return x.translate(_WS_TABLE)
    return x
def exportAnnotations(model, filename_base):
users = {u.id: u.username for u in db_session.query(User).all()}
cols = [x.name for x in model.__table__.columns]
resultset = []
filename = os.path.join(
config.WD,
"exports",
filename_base + '_%s.csv' % (dt.datetime.now().strftime('%Y-%m-%d_%H%M%S')))
print(filename)
query = db_session.query(func.max(model.id)).first()
print("Query:")
for i in range(0, 1000):
if i % 50 == 0:
print((" " + str(i) + "..."))
offset = i*1000
query = db_session.query(model, ArticleMetadata).\
join(ArticleMetadata).order_by(model.id).offset(offset).limit(1000).all()
if len(query) <= 0:
print((" " + str(i) + "...DONE"))
break
resultset.extend(query)
## do this in chunks to save memory
header = cols[:]
header.extend(['publication', 'pub_date', 'solr_id'])
## write headers
df = pd.DataFrame([], columns = header)
df.to_csv(filename, encoding = 'utf-8', index = False)
i = 0
print("Resultset:")
for row in resultset:
fp = row[0]
am = row[1]
i += 1
## store all fields in a row in a tuple
to_print = ()
for c in cols:
if c == 'coder_id':
to_print += ( users[fp.__getattribute__(c)], )
else:
to_print += ( validate(fp.__getattribute__(c)), )
## add publication, publication date, and solr_id
pub = ''
pub_date = ''
solr_id = am.db_id
if am.db_id is None:
pass
elif 'AGW' in am.db_id:
## e.g.
## AGW_AFP_ENG_20040104.0056
pieces = am.db_id.split("_")
pub = "-".join(pieces[0:3])
pub_date = pieces[3].split('.')[0]
pub_date = dt.datetime.strptime(pub_date, '%Y%m%d').strftime('%Y-%m-%d')
elif 'NYT' in am.db_id:
## e.g.
## 1989/03/11/0230638.xml_LDC_NYT
pub = 'NYT'
pub_date = am.db_id[0:10].replace('/', '-')
else:
## e.g.
## Caribbean-Today;-Miami_1996-12-31_26b696eae2887c8cf71735a33eb39771
## HOWEVER
## North-Carolina-A-_-T-State-University_The-A_T-Register_2018-03-21T00:00:00Z_96c6bc5e6701f8b1ccbba7a318cc50ad
pieces = am.db_id.split("_")
if len(pieces) > 3:
pub = '_'.join(pieces[0:-2])
else:
pub = pieces[0]
pub_date = pieces[-2]
## remove T00:00:00Z from dates
## e.g. 2013-03-19T00:00:00Z
if 'T' in pub_date:
pub_date = pub_date.split('T')[0]
to_print += ( pub, pub_date, solr_id )
df = pd.DataFrame([to_print], columns = header)
df.to_csv(filename, mode = "a", header = False, index = False, encoding = 'utf-8')
if i % 50000 == 0:
print((" " + str(i) + "..."))
print((" " + str(i) + "...DONE"))