Skip to content

Commit

Permalink
All novels changed to the new workflow
Browse files Browse the repository at this point in the history
  • Loading branch information
safirex committed Aug 6, 2022
1 parent d8ca84c commit 4a07d36
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 51 deletions.
4 changes: 3 additions & 1 deletion src/Chapters.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,10 +49,11 @@ def processChapter(self,headers):


def parseTitle(self,html) -> str:
    """Return the title of the chapter page parsed from *html*.

    Abstract hook: site-specific Chapter subclasses override this.
    The base implementation does nothing and returns None.
    """
    # Removed leftover debug print ("still in novel"): it polluted stdout on
    # every call and kept the docstring from being the first statement.
    pass

def parseContent(self,html):
    """Extract and return the chapter body from *html*.

    Abstract hook: the base implementation is a no-op returning None;
    concrete Chapter subclasses provide the site-specific parsing.
    """
    pass


Expand Down Expand Up @@ -88,6 +89,7 @@ def createFile(self,dir):
file.write(self.content)
file.close()
print('\n\n')



class KakyomuChapter(Chapter):
Expand Down
81 changes: 31 additions & 50 deletions src/Downloaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,18 +68,18 @@ def onChapterListFetched(self):
print("chapter list obtained")



# TODO: updateObject should be in a NovelFactory
class Novel(NovelCallbacks):
def __init__(self, codeNovel, titreNovel, keep_text_format=False):
super(Novel, self).__init__()
super().__init__()
self.code = codeNovel
self.titre = titreNovel
self.keep_text_format = keep_text_format
self.headers = ''

# should be used to return a
if(type(self)==Novel):
self.updateObject()
# if(type(self)==Novel):
# print("i automatically update this shit")
# self.updateObject()
if(type(self)!=Novel):
self.setUrl()
self.setDir('./novel_list/'+self.code+' '+self.titre)
Expand Down Expand Up @@ -193,7 +193,6 @@ def processNovel(self):
html = self.fetchTOCPage();
# get the number of chapters (solely for user feedback)
online_chapter_list = self.parseOnlineChapterList(html)

if (self.getLastChapter() == 0):
resumeContent = self.parseTocResume(html)
# self.save("0_TOC",resumeContent)
Expand All @@ -213,21 +212,23 @@ def processNovel(self):
def processChapter(self, chapList):
    """Download every chapter in *chapList* and write each one to self.dir.

    Parameters:
        chapList: iterable of chapter identifiers understood by getChapter().
    """
    # Removed leftover debug prints (raw chapter number and object repr)
    # and the dead trailing `pass`.
    for chapter_num in chapList:
        # getChapter() is the subclass hook that fetches and parses one chapter.
        chap = self.getChapter(chapter_num)
        chap.createFile(self.dir + '/')
def getChapter(self,chapter_num) ->Chapter:
"""return the subclass chapter type"""
pass

def updatePerDate(self,html):
    """Compare local files against online chapters to detect outdated ones.

    Abstract hook: the base implementation is a no-op returning None;
    subclasses implement the site-specific date comparison.
    """
    pass

class SyosetuNovel(Novel):
def __init__(self, Novel):
    """Wrap an existing Novel descriptor as a Syosetu (ncode) novel.

    Parameters:
        Novel: existing novel descriptor providing code, titre and
            keep_text_format.

    The site root is set *before* delegating to the base constructor,
    because Novel.__init__ calls setUrl()/setDir() which read self.site.
    The request headers are assigned *after* the super() call: the base
    constructor resets self.headers to '', which previously clobbered
    the user-agent dict assigned here before super() ran.
    """
    self.site = 'https://ncode.syosetu.com/'
    super().__init__(Novel.code, Novel.titre, Novel.keep_text_format)
    # Syosetu rejects requests without a browser-like user agent.
    self.headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"}

def setUrl(self):
    """Compose the novel's TOC URL from the site root and the ncode."""
    self.url = "{}{}/".format(self.site, self.code)
Expand Down Expand Up @@ -267,28 +268,12 @@ def updatePerDate(self, html):
print("fin update")

def parseOnlineChapterList(self, html='') -> list:
    """Return the list of chapter identifiers found in the TOC page.

    Parameters:
        html: TOC page markup; falls back to self.html when empty.
    Returns:
        list of chapter path segments as strings (e.g. ['1', '2', ...]);
        empty when no chapter links are found.
    """
    if html == '':
        html = self.html
    # Chapter links look like <a href="/<code>/<n>/">title</a>.
    # re.escape guards against regex metacharacters in the novel code.
    online_chapter_list = re.findall(
        r'<a href="/' + re.escape(self.code) + '/' + '(.*?)' + '/">.*?</a>',
        html, re.S)
    # re.findall never returns None (the old `is None` check was dead code);
    # an empty result usually means the novel page no longer lists chapters.
    if len(online_chapter_list) == 0:
        print("the novel has most likely been terminated\n")
    return online_chapter_list

def fetchTOCPage(self):
Expand All @@ -304,7 +289,7 @@ def fetchTOCPage(self):

def parseTocResume(self, html):
soup = BeautifulSoup(html, 'html.parser')
resume = soup.find_all("div", id="novel_ex")
resume = soup.find("div", id="novel_ex")
# resume=re.findall('<div id="novel_ex">'+'(.*?)'+'</div>',html,re.S)[0]
if (resume is None):
print("the novel has most likely been terminated")
Expand All @@ -318,8 +303,6 @@ def getChapter(self, chapter_num):
chapter = SyosetuChapter(self.code, chapter_num)
chapter.processChapter(self.headers)
return chapter
# self.createFile(i,chapter_title,chapter_content)
# self.setLastChapter(i)

def cleanText(self, chapter_content):
chapter_content = chapter_content.replace('</p>', '\r\n')
Expand All @@ -340,19 +323,8 @@ def validateTitle(self, title):
return new_title

def getNovelTitle(self,html):
    """Extract the novel title from the TOC page markup.

    Parameters:
        html: full HTML of the novel's table-of-contents page.
    Returns:
        Text of the first <p class="novel_title"> element.
    Raises:
        IndexError: when no title paragraph is present (e.g. the novel
            page was removed) — same failure mode as before, now documented.
    """
    # Removed the commented-out fetch code; the caller supplies the HTML.
    # Renamed the misleading local `writer`: it holds title candidates.
    titles = re.findall(r'<p class="novel_title">(.*?)</p>', html, re.S)
    print('title = ' + str(titles))
    return titles[0]


Expand Down Expand Up @@ -390,16 +362,14 @@ def parseTitle(self, TocHTML):

def parseOnlineChapterList(self, html) -> list:
soup = BeautifulSoup(html, 'html.parser')
online_chap_list = []
print(soup.find_all("a", "widget-toc-chapter"))
print("end")
# print(soup.find_all("a", "widget-toc-chapter"))
# print("end")
soup = soup.find('div', "widget-toc-main")
regex = str(self.code) + "/episodes/"
# regex = '/episodes/">(?P<num>.*?)</a>'
chapList = []
if (soup is not None):
chapList = soup.find_all(href=re.compile(regex))[
self.getLastChapter():]
chapList = soup.find_all(href=re.compile(regex))

for i in range(0, len(chapList)):
# list should contain links and not number because can't be found from relative way
Expand All @@ -420,22 +390,34 @@ def getChapter(self,chapter_num) ->Chapter:



class N18SyosetuNovel(SyosetuNovel, Novel):
class N18SyosetuNovel(Novel):

def __init__(self, novel):
novel.setCode(novel.code[3:])
super(N18SyosetuNovel, self).__init__(novel)

novel.setCode(novel.code[3:])
self.headers = {
"user-agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"}
self.site = 'https://novel18.syosetu.com'
# SyosetuNovel.__init__(self,novel)
super().__init__(novel.code, novel.titre, novel.keep_text_format)
# self.cookie={'autologin':getCookies()}

def processNovel(self):
import http.cookiejar as cookielib
import mechanize
import sys
print("sysosetu novel " + self.titre)
print('last chapter: ' + str(self.getLastChapter()))

url = self.site + '/%s/' % self.code
print('accessing: ' + url)
print()
html = self.connectViaMechanize(url)
try:
html = self.connectViaMechanize(url)
except (mechanize.HTTPError,mechanize.URLError) as e:
print('novel has been stopped')
return ''


if (self.getLastChapter() == 0):
self.processTocResume(html)
Expand Down Expand Up @@ -490,7 +472,6 @@ def __createFile__(self, chapterNumber, chapter_title, chapter_content):

def connectViaMechanize(self, url):
import http.cookiejar as cookielib
from bs4 import BeautifulSoup
import mechanize

print('beginning server cracking beep boop')
Expand Down
1 change: 1 addition & 0 deletions src/main_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ def archiveUpdate(dirList=[],keep_text_format=False):

#let's update the archive
novel.setDir('./novel_list/'+novel_folder)
print(type(novel))
novel.processNovel()


Expand Down

0 comments on commit 4a07d36

Please sign in to comment.