
Commit

fix n18n parsers
safirex committed Dec 30, 2024
1 parent a66d1d3 commit e160e97
Showing 2 changed files with 13 additions and 13 deletions.
src/Chapters.py (17 changes: 9 additions & 8 deletions)
@@ -151,18 +151,19 @@ def setUrl(self):
         self.url='https://novel18.syosetu.com/%s/%s/'%(self.novelNum,self.num)
 
     def parseContent(self,html):
-        chapter_content=re.findall(r'<div class="novel_view" id="novel_honbun">(.*?)</div>',html,re.S)[0]
-        replacething=re.findall(r'<p id=' + '.*?' + '>', chapter_content)
-        for y in replacething:
-            chapter_content=chapter_content.replace(y,'')
-        chapter_content=self.cleanText(chapter_content)
+        content = BeautifulSoup(html,'html.parser').find('div','p-novel__body')
+        if content:
+            chapter_content = content.text
+        else:
+            raise Exception("failed to parse the chapter content")
+
         self.setContent(chapter_content)
         return chapter_content
 
 
     def parseTitle(self, html) -> str:
         soup = BeautifulSoup(html, 'html.parser')
-        title = soup.find("p","novel_subtitle").text
+        title = soup.find("h1").text
         return title
 
     def createFile(self, path):
@@ -185,7 +186,7 @@ def setUrl(self,url):
         self.url=url
 
     def getTitle(self,html):
-        soup = BeautifulSoup(html)
+        soup = BeautifulSoup(html,"html.parser")
         title=''
         for h in soup.find_all('title'):
             title=h.string
@@ -201,7 +202,7 @@ def getTitle(self,html):
     def getContent(self,html):
 
         #can be made better with soup.id["chapter-content"]
-        soup = BeautifulSoup(html)
+        soup = BeautifulSoup(html, "html.parser")
         chapter_content=''
         for div in soup.find_all('div'):
             id=div.get("id")
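For context: the rewrite drops the regex scrape of the old novel_honbun container in favor of a BeautifulSoup lookup of p-novel__body, presumably because syosetu's markup changed, and it passes an explicit "html.parser" so BeautifulSoup stops guessing (and warning about) the parser. A minimal sketch of what the reworked parsing does, against made-up markup shaped like the new selectors:

from bs4 import BeautifulSoup

# Hypothetical markup mirroring the selectors the new code targets.
html = '''
<h1>Chapter 1</h1>
<div class="p-novel__body"><p>First line.</p><p>Second line.</p></div>
'''

soup = BeautifulSoup(html, 'html.parser')  # explicit parser, no guessing warning

# find('div', 'p-novel__body') filters on the class attribute,
# replacing the old id="novel_honbun" regex scrape.
body = soup.find('div', 'p-novel__body')
if body:
    print(body.text)             # First line.Second line.
print(soup.find('h1').text)      # Chapter 1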
src/Downloaders.py (9 changes: 4 additions & 5 deletions)
@@ -333,7 +333,7 @@ def parseOnlineChapterList(self, html='') -> list:
         if html == '':
             html = self.html
         while not done:
-            soup = BeautifulSoup(html)
+            soup = BeautifulSoup(html,"html.parser")
             online_chapter_list += soup.findAll(href=re.compile('/' + self.code + '/(?!\?p=)\d' ))
             nextPage = soup.findAll(href=re.compile('/' + self.code + '/\?p=.' ))
             nextPage = list(filter( lambda x: "c-pager__item--next" in x['class'], nextPage))
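As a side note, the (?!\?p=) lookahead in the unchanged chapter-link pattern above is what keeps pager links out of the list; a quick illustration with hypothetical hrefs:

import re

code = 'n1234x'  # hypothetical novel code
hrefs = ['/n1234x/1/', '/n1234x/12/', '/n1234x/?p=2']

# The segment after the code must start with a digit, so the
# '?p=' pager href is rejected.
pattern = re.compile('/' + code + r'/(?!\?p=)\d')
print([h for h in hrefs if pattern.search(h)])  # ['/n1234x/1/', '/n1234x/12/']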
@@ -527,10 +527,8 @@ def processNovel(self):
         if (self.getLastChapter() == 0):
             self.processTocResume(html)
         # get the number of chapters (solely for user feedback)
-        online_chapter_list = re.findall(
-            '<a href="/' + self.code + '/' + '(.*?)' + '/">', html, re.DOTALL)
+        online_chapter_list = re.findall('href="/' + self.code + '/(\d+)/"', html)
 
-        print('<href="/' + self.code + '/' + '(.*?)' + '/">')
         lastDL = self.getLastChapter()
         online_chapter_list = online_chapter_list[lastDL:]
         print("there are %d chapters to update" % len(online_chapter_list))
@@ -555,7 +553,8 @@ def processTocResume(self, html):
     def processChapter(self, chapter_num):
         chapter = N18SyosetuChapter(self.code, chapter_num)
         chapter_html = self.connectViaMechanize(
-            '%s/%s/%s/' % (self.site, self.code, chapter_num))
+            '%s/%s/%s/' % (self.site, self.code, chapter_num)
+        )
         chapter.setTitle(chapter.parseTitle(chapter_html))
         chapter.setContent(chapter.parseContent(chapter_html))
         return chapter
