-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathdatabase_creation.py
180 lines (130 loc) · 5.75 KB
/
database_creation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
import sqlite3
import json
from datetime import datetime
database_name = '2009-09' # the data being used is from september 2009.
global transaction = []
connection = sqlite3.connect('{}.db'.format(database_name))
c = connection.cursor()
# database table creation
def create_table():
c.execute(
"""CREATE TABLE IF NOT EXISTS reddit_data_table(parent_id TEXT PRIMARY KEY,
comment_id TEXT UNIQUE,
parent TEXT,
comment TEXT,
subreddit TEXT,
unix INT,
score INT)"""
)
# replace newlines with some other string
def format_data(data):
data = data.replace("\n"," newlinechar ")
data = data.replace("\r"," newlinechar ")
data = data.replace('"',"'")
return data
#return score of the entry with parent_id = pid
def find_existing_score(pid):
try:
sql = "SELECT score from reddit_data_table where parent_id = '{}' limit 1".format(pid)
c.execute(sql)
result = c.fetchone()
if result != None:
return result[0]
else:
return False
except Exception as e:
return False
#collect the queries and executes them together when number of total queries exceed 2000.
#any number can be set,bigger number speeds up the entry.
def transactions_in_bulk(sql):
transaction.append(sql) #append the query in list
if len(transaction)>2000:
c.execute('BEGIN TRANSACTION')
for s in transaction:
try:
c.execute(s)
except:
pass
connection.commit() #save the changes in database
transaction = [] #empty the list to store further
def sql_insert_replace_comment(commentid,parentid,parent,comment,subreddit,time,score):
try:
sql = """update reddit_data_table set parent_id = ?, comment_id= ?,parent = ?,
comment = ?,subreddit = ?,unix=?,score=? where
parent_id = ?; """.format(parentid,commentid,parent,comment,subreddit,int(time),score,parentid)
transactions_in_bulk(sql)
except Exception as e:
print('update insertion',str(e))
def sql_insert_has_parent(commentid,parentid,parent,comment,subreddit,time,score):
try:
sql = """insert into reddit_data_table (parent_id, comment_id,parent,
comment,subreddit,unix,score) values
("{}","{}","{}","{}","{}",{},{}); """.format(parentid,commentid,parent,comment,subreddit,int(time),score)
transactions_in_bulk(sql)
except Exception as e:
print('parent insertion',str(e))
def sql_insert_no_parent(commentid,parentid,comment,subreddit,time,score):
try:
sql = """insert into reddit_data_table (parent_id, comment_id,
comment,subreddit,unix,score) values
("{}","{}","{}","{}",{},{}); """.format(parentid,commentid,comment,subreddit,int(time),score)
transactions_in_bulk(sql)
except Exception as e:
print('noparent insertion',str(e))
#check data being entered in database for length specifications.
def acceptable(data):
if len(data.split(' '))>50 or len(data)<1:
return False
elif len(data) >1000:
return False
elif data == '[deleted]' or data == '[removed]':
return False
else:
return True
#return comment whose comment id is pid,
#the comment being returned is parent of another comment(child comment)
#hence its comment_id = parent_id of child comment.
def find_parent(pid):
try:
sql = "SELECT comment FROM reddit_data_table WHERE comment_id = '{}' LIMIT 1".format(pid)
c.execute(sql)
result = c.fetchone()
if result != None:
return result[0]
else:
return False
except Exception as e:
return False
if __name__ == "__main__":
create_table()
row_count = 0 #count number of rows read.
paired_rows_count = 0 #count number of comments which got their parent from database using find_parent.
with open("projects/RC_2009/RC_2009",buffering = 1000) as f: #RC_2009 contains json data from reddit
for row in f:
row = json.loads(row)
row_count += 1
parent_id = row['parent_id']
body = format_data(row['body'])
created_utc = row['created_utc']
score = row['score']
subreddit = row['subreddit']
comment_id = row['name']
parent_data = find_parent(parent_id)
if score>=3: #set score depending on choice
if acceptable(body): #checking body to be of suitable length
existing_comment_score = find_existing_score(parent_id)
if existing_comment_score:
if score> existing_comment_score:
#comment with low score replaced by comment with higher score.
sql_insert_replace_comment(comment_id,parent_id,parent_data,body,subreddit,created_utc,score)
else:
if parent_data:
sql_insert_has_parent(comment_id,parent_id,parent_data,body,subreddit,created_utc,score)
paired_rows_count +=1
else:
#comment without parent is inserted because it may be a parent of some other comment
#find_parent will associate it with its child if it encounters one.
sql_insert_no_parent(comment_id,parent_id,body,subreddit,created_utc,score)
#print the log after every 1000 rows.
if row_count % 1000 ==0:
print("total rows read : {}, paired rows: {},time:{}".format(row_count,paired_rows_count,str(datetime.now())))