#! /usr/bin/env python
# -*- coding: utf-8 -*-
#
# GUI module generated by PAGE version 4.14
# In conjunction with Tcl version 8.6
# Aug 13, 2018 03:57:27 PM
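"""Database updater: rebuilds the local novels.db by querying the WuxiaWorld
search API for the full novel list, then enriching each entry with metadata
(cover, authors, translator, synopsis) scraped from novelupdates.com."""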
import urllib.parse
import urllib.request
from urllib.error import HTTPError, URLError
import shutil
import os
import os.path
import sqlite3 as sql
from bs4 import BeautifulSoup
import json
import traceback
import html
parent = None
conn = None
cursor = None
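# `parent`, when set by the GUI, receives progress updates via parent.emit([message, percent]).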
exclusion_novel_list = [
# 'Condemning the Heavens',
# 'Blue Phoenix'
]
# Novels excluded from the database entirely (insert_wuxiaworld_novel returns early for these).
expelled_novel_list = [
    'Coiling Dragon'
]
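# Hand-picked cover URLs, used instead of the image scraped from the novel's page.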
alt_cover_list = {
# '7 Killers': 'https://image.ibb.co/fAgv6U/7k.png',
'Warlock of the Magus World': 'https://image.ibb.co/gEOTt9/600.jpg',
'Overthrowing Fate': 'https://image.ibb.co/m6SWyK/otf.png',
'Legends of Ogre Gate': 'https://image.ibb.co/myNLse/loog_1.png',
'Blue Phoenix': 'https://image.ibb.co/i4q6Xe/bp_1.png',
'The Divine Elements': 'https://image.ibb.co/ceG58K/tde_1.png',
'Condemning the Heavens': 'https://image.ibb.co/mTK8TK/cth_1.png'
}
exception_names_list = {  # irregular slugs, for finding these novels on novelupdates.com
'Legend of the Dragon King': 'the-legend-of-the-dragon-king',
'Soul Land 2: The Unrivaled Tang Sect': 'douluo-dalu-2-the-unrivaled-tang-sect',
'Soul Land 3: Legend of the Dragon King': 'the-legend-of-the-dragon-king',
'Desolate Era': 'the-desolate-era',
'Stellar Transformations': 'stellar-transformation',
'Overlord of Blood and Iron': 'the-overlord-of-blood-and-iron'
}
def unicodeToHTMLEntities(text):
"""Converts unicode to HTML entities. For example '&' becomes '&'."""
text = html.escape(text, True).encode('ascii', 'xmlcharrefreplace').decode('ascii')
return text
def download(link, file_name, ur_data=None):
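    """Download `link` to `file_name`, sending a desktop-browser User-Agent (some hosts block the default Python one)."""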
url = urllib.request.Request(
link,
data=ur_data,
headers={
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
}
)
with urllib.request.urlopen(url) as response, open(file_name, 'wb') as out_file:
shutil.copyfileobj(response, out_file)
def get_cookie(link):
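    """Return the `_csrfToken` cookie value issued by `link`, or '' if none is set."""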
url = urllib.request.Request(
link,
data=None,
headers={
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
}
)
    response = urllib.request.urlopen(url)
    cookie = ''
    # Cookies in the Set-Cookie header are joined with 'secure, '; split and look for _csrfToken.
    rep = response.getheader('set-cookie') or ''
    for coo in rep.split('secure, '):
        tib = coo.split(';')[0].split('=')
        if tib[0] == '_csrfToken':
            cookie = tib[1]
    return cookie
def insert_wuxiaworld_novel(name, url):
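    """Scrape one novel's metadata and insert a row into the Information table."""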
global conn, cursor, alt_cover_list, exception_names_list
    # Working directory: next to the script, or a dedicated folder under the user's home on Windows.
    base_dir = os.path.dirname(os.path.realpath(__file__))
    if os.name == 'nt':
        base_dir = os.path.expanduser("~") + os.sep + "wuxiaworld_export_ebook"
    if not os.path.isdir(base_dir):
        os.mkdir(base_dir)
    if not os.path.isdir(base_dir + os.sep + "tmp"):
        os.mkdir(base_dir + os.sep + "tmp")
    if not os.path.isdir(base_dir + os.sep + "export"):
        os.mkdir(base_dir + os.sep + "export")
    if name in expelled_novel_list:
        return
    autors = ''
    img = ''
    translator = ''
    synopsis = ''
    filename_novelupdate = base_dir + os.sep + "tmp" + os.sep + "novel_" + urllib.parse.quote(name) + ".html"
    filename_wuxiaworld = base_dir + os.sep + "tmp" + os.sep + "wuxia_" + urllib.parse.quote(name) + ".html"
    # NovelUpdates URL: use the known irregular slug if there is one, otherwise slugify the title.
    if name in exception_names_list:
        url_novelupdate = "https://www.novelupdates.com/series/" + exception_names_list[name] + "/"
    else:
        url_novelupdate = "https://www.novelupdates.com/series/" + name.lower().replace("&", 'and').replace("’", '').replace("'", '').replace(":", '').replace(" ", '-') + "/"
    # Without the WuxiaWorld page there is nothing to insert, so abort this novel on failure.
    try:
        download(url, filename_wuxiaworld)
    except HTTPError as e:
        print('URL: {}, HTTPError: {} - {}'.format(url, e.code, e.reason))
        return
    except URLError as e:
        print('URL: {}, URLError: {}'.format(url, e.reason))
        return
    # The NovelUpdates page is optional; fall back to WuxiaWorld-only metadata if it fails.
    try:
        download(url_novelupdate, filename_novelupdate)
    except (HTTPError, URLError) as e:
        print('URL: {}, download failed: {}'.format(url_novelupdate, e))
        filename_novelupdate = None
    fileHandle1 = open(filename_wuxiaworld, "r", encoding="utf8")
    if filename_novelupdate is not None:
        # Get cover, synopsis and authors from novelupdates.com
        fileHandle2 = open(filename_novelupdate, "r", encoding="utf8")
        soup = BeautifulSoup(fileHandle2, 'html.parser')
        if name in alt_cover_list:
            img = alt_cover_list[name]
        else:
            img = soup.find(class_='seriesimg').find('img').get('src')
        synopsis = soup.find(id='editdescription').get_text()
        autors = ''
        dom_authors = soup.find(id='showauthors').find_all('a')
        for aut in dom_authors:
            if autors != '':
                autors += ', '
            autors += aut.string
        soup = None
        fileHandle2.close()
        os.remove(filename_novelupdate)
        # Get the translator from wuxiaworld.com
        soup = BeautifulSoup(fileHandle1, 'html.parser')
        nov = soup.find(class_='novel-body')
        translator = nov.find_all('dd')[0].get_text()
        soup = None
        fileHandle1.close()
        os.remove(filename_wuxiaworld)
    else:
        # No NovelUpdates page: scrape everything from wuxiaworld.com
        soup = BeautifulSoup(fileHandle1, 'html.parser')
        nov = soup.find(class_='media media-novel-index')
        if name in alt_cover_list:
            img = alt_cover_list[name]
        elif nov is not None:
            img = nov.find('img').get('src')
        if nov is not None:
            # Authors (and sometimes translators) appear as "Author:" / "Translator:" lines.
            dom_authors = nov.find_all('p') + nov.find_all('dl')
            for aut in dom_authors:
                if len(aut.contents) >= 1:
                    strmade = ''
                    for piece in aut.contents:
                        if piece.string is not None:
                            strmade += piece.string
                    if 'Author:' in strmade or 'Translator:' in strmade:
                        clean = strmade.replace('Author:', '').replace('Translator:', '').replace("\n", '').replace("\r", '').replace("\t", '').strip()
                        if clean not in autors:
                            if autors != '':
                                autors += ', '
                            autors += clean
            dom_translators = nov.find_all('dl')
            for aut in dom_translators:
                if len(aut.contents) >= 1:
                    strmade = ''
                    for piece in aut.contents:
                        if piece.string is not None:
                            strmade += piece.string
                    if 'Translator:' in strmade:
                        clean = strmade.replace('Translator:', '').replace("\n", '').replace("\r", '').replace("\t", '').strip()
                        if clean not in translator and clean not in autors:
                            if translator != '':
                                translator += ', '
                            translator += clean
        fileHandle1.close()
        os.remove(filename_wuxiaworld)
cursor.execute("INSERT INTO Information(NovelName,link,autor,cover,limited,translator,synopsis,source) VALUES(?,?,?,?,?,?,?,?)", (name, url, autors, img, 0, translator, synopsis, 'wuxiaworld.com'))
conn.commit()
def start():
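    """Rebuild novels.db: fetch the WuxiaWorld novel list, then insert each novel's metadata."""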
global conn, cursor, parent
    base_dir = os.path.dirname(os.path.realpath(__file__))
    if os.name == 'nt':
        base_dir = os.path.expanduser("~") + os.sep + "wuxiaworld_export_ebook"
    if not os.path.isdir(base_dir):
        os.mkdir(base_dir)
    conn = sql.connect(base_dir + os.sep + "novels.db")
    cursor = conn.cursor()
    if not os.path.isdir(base_dir + os.sep + 'tmp'):
        os.mkdir(base_dir + os.sep + 'tmp')
    list_novel = []
    filename = base_dir + os.sep + "tmp" + os.sep + "wuxiaworld_updates.json"
baseurl = 'https://www.wuxiaworld.com/api/novels/search'
if parent is not None: parent.emit(['Database Update, Download Wuxiaworld Ongoing novel list', 1])
else: print('>> Download Wuxiaworld Ongoing novel list')
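    # A single search request with count=9999 returns the entire catalogue in one response.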
try:
request = urllib.request.Request(
baseurl,
data=bytes('{"title":"","tags":[],"language":"Any","genres":[],"active":null,"sortType":"Name","sortAsc":true,"searchAfter":0,"count":9999}', 'utf-8'),
headers={
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36',
'content-type': 'application/json;charset=UTF-8'
}
)
with urllib.request.urlopen(request) as response, open(filename, 'wb') as out_file:
shutil.copyfileobj(response, out_file)
except HTTPError as e:
# Return code error (e.g. 404, 501, ...)
print('URL: {}, HTTPError: {} - {}'.format(baseurl, e.code, e.reason))
except URLError as e:
# Not an HTTP-specific error (e.g. connection refused)
print('URL: {}, URLError: {}'.format(baseurl, e.reason))
    else:
        # Start from a clean slate, then register every non-sneak-peek novel.
        cursor.execute("DELETE FROM Information")
        conn.commit()
        with open(filename, 'r', encoding="utf8") as file:
            data = json.load(file)
            for nov in data['items']:
                if not nov['sneakPeek']:
                    list_novel.append({'name': nov['name'], 'url': 'https://www.wuxiaworld.com/novel/' + nov['slug']})
        os.remove(filename)
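    # Disabled legacy path; presumably superseded by the search API above, which already returns completed novels.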
"""
filename = dir+os.sep+"tmp"+os.sep+"wuxiaworld_completed.html"
baseurl = 'https://www.wuxiaworld.com/tag/completed'
if parent is not None: parent.emit(['Database Update, Download Wuxiaworld Finished novel list', 1])
else: print('>> Download Wuxiaworld Finished novel list')
try:
download(baseurl, filename)
except HTTPError as e:
# Return code error (e.g. 404, 501, ...)
print('URL: {}, HTTPError: {} - {}'.format(baseurl, e.code, e.reason))
except URLError as e:
# Not an HTTP-specific error (e.g. connection refused)
print('URL: {}, URLError: {}'.format(baseurl, e.reason))
else:
fileHandle = open(filename, "r", encoding = "utf8")
soup = BeautifulSoup(fileHandle, 'html.parser')
tab = soup.find(class_="media-list genres-list")
novels_dom = tab.find_all(class_="media")
for title in novels_dom:
name = title.find('h4').string.replace('’', "'").strip()
if name not in exclusion_novel_list:
url = title.find('a').get('href')
list_novel.append({'name': name, 'url': 'https://www.wuxiaworld.com'+url})
fileHandle.close()
os.remove(filename)
"""
baseurl = 'https://www.webnovel.com'
cookie = ''
if parent is not None: parent.emit(['Database Update, Get webnovel.com cookie', 1])
    else: print('>> Get webnovel.com cookie')
try:
cookie = get_cookie(baseurl)
except HTTPError as e:
# Return code error (e.g. 404, 501, ...)
print('URL: {}, HTTPError: {} - {}'.format(baseurl, e.code, e.reason))
except URLError as e:
# Not an HTTP-specific error (e.g. connection refused)
print('URL: {}, URLError: {}'.format(baseurl, e.reason))
else:
print(cookie)
#https://www.webnovel.com/apiajax/search/PageAjax?_csrfToken=mC5uuDqyBFhY9KB5j29dzhiNnLUIHmQApnufp398&isExternal=0&pageSize=20&pageIndex=3&keywords=%
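        # NOTE: the token is only printed here; the webnovel.com search (URL above) is not used further in this script.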
    pos = 1.0
    # Progress step across the remaining 99% of the bar; guard against an empty novel list.
    step = 99.0 / float(len(list_novel)) if list_novel else 0.0
for novel in list_novel:
pos += step
if parent is not None:
parent.emit(['Database Update, Processing "{}"'.format(novel['name']), int(pos)])
else:
print('=> Processing:', unicodeToHTMLEntities(novel['name']))
        try:
            insert_wuxiaworld_novel(novel['name'], novel['url'])
        except Exception:
            # One failing novel must not abort the whole update.
            traceback.print_exc()
cursor = None
conn.close()
print('< Database Update Completed')
if __name__ == '__main__':
start()