
Commit

fix n18n parsers
safirex committed Dec 30, 2024
1 parent a66d1d3 commit e160e97
Showing 2 changed files with 13 additions and 13 deletions.
src/Chapters.py (17 changes: 9 additions & 8 deletions)
@@ -151,18 +151,19 @@ def setUrl(self):
         self.url='https://novel18.syosetu.com/%s/%s/'%(self.novelNum,self.num)
 
     def parseContent(self,html):
-        chapter_content=re.findall(r'<div class="novel_view" id="novel_honbun">(.*?)</div>',html,re.S)[0]
-        replacething=re.findall(r'<p id=' + '.*?' + '>', chapter_content)
-        for y in replacething:
-            chapter_content=chapter_content.replace(y,'')
-        chapter_content=self.cleanText(chapter_content)
+        content = BeautifulSoup(html,'html.parser').find('div','p-novel__body')
+        if content:
+            chapter_content = content.text
+        else:
+            raise Exception("failed to parse the chapter content")
+
         self.setContent(chapter_content)
         return chapter_content
 
 
     def parseTitle(self, html) -> str:
         soup = BeautifulSoup(html, 'html.parser')
-        title = soup.find("p","novel_subtitle").text
+        title = soup.find("h1").text
         return title
 
     def createFile(self, path):
@@ -185,7 +186,7 @@ def setUrl(self,url):
         self.url=url
 
     def getTitle(self,html):
-        soup = BeautifulSoup(html)
+        soup = BeautifulSoup(html,"html.parser")
         title=''
         for h in soup.find_all('title'):
             title=h.string
@@ -201,7 +202,7 @@ def getTitle(self,html):
     def getContent(self,html):
 
         #can be made better with soup.id["chapter-content"]
-        soup = BeautifulSoup(html)
+        soup = BeautifulSoup(html, "html.parser")
         chapter_content=''
         for div in soup.find_all('div'):
             id=div.get("id")
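For context: the rewrite drops the regex scrape of the old novel_honbun container in favor of a BeautifulSoup lookup of p-novel__body, presumably because syosetu's markup changed, and it passes an explicit "html.parser" so BeautifulSoup stops guessing (and warning about) the parser. A minimal sketch of what the reworked parsing does, against made-up markup shaped like the new selectors:

from bs4 import BeautifulSoup

# Hypothetical markup mirroring the selectors the new code targets.
html = '''
<h1>Chapter 1</h1>
<div class="p-novel__body"><p>First line.</p><p>Second line.</p></div>
'''

soup = BeautifulSoup(html, 'html.parser')  # explicit parser, no guessing warning

# find('div', 'p-novel__body') filters on the class attribute,
# replacing the old id="novel_honbun" regex scrape.
body = soup.find('div', 'p-novel__body')
if body:
    print(body.text)             # First line.Second line.
print(soup.find('h1').text)      # Chapter 1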
src/Downloaders.py (9 changes: 4 additions & 5 deletions)
@@ -333,7 +333,7 @@ def parseOnlineChapterList(self, html='') -> list:
         if html == '':
             html = self.html
         while not done:
-            soup = BeautifulSoup(html)
+            soup = BeautifulSoup(html,"html.parser")
             online_chapter_list += soup.findAll(href=re.compile('/' + self.code + '/(?!\?p=)\d' ))
             nextPage = soup.findAll(href=re.compile('/' + self.code + '/\?p=.' ))
             nextPage = list(filter( lambda x: "c-pager__item--next" in x['class'], nextPage))
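As a side note, the (?!\?p=) lookahead in the unchanged chapter-link pattern above is what keeps pager links out of the list; a quick illustration with hypothetical hrefs:

import re

code = 'n1234x'  # hypothetical novel code
hrefs = ['/n1234x/1/', '/n1234x/12/', '/n1234x/?p=2']

# The segment after the code must start with a digit, so the
# '?p=' pager href is rejected.
pattern = re.compile('/' + code + r'/(?!\?p=)\d')
print([h for h in hrefs if pattern.search(h)])  # ['/n1234x/1/', '/n1234x/12/']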
@@ -527,10 +527,8 @@ def processNovel(self):
         if (self.getLastChapter() == 0):
             self.processTocResume(html)
         # get the number of chapters (solely for user feedback)
-        online_chapter_list = re.findall(
-            '<a href="/' + self.code + '/' + '(.*?)' + '/">', html, re.DOTALL)
+        online_chapter_list = re.findall('href="/' + self.code + '/(\d+)/"', html)
 
-        print('<href="/' + self.code + '/' + '(.*?)' + '/">')
         lastDL = self.getLastChapter()
         online_chapter_list = online_chapter_list[lastDL:]
         print("there are %d chapters to update" % len(online_chapter_list))
@@ -555,7 +553,8 @@ def processTocResume(self, html):
     def processChapter(self, chapter_num):
         chapter = N18SyosetuChapter(self.code, chapter_num)
         chapter_html = self.connectViaMechanize(
-            '%s/%s/%s/' % (self.site, self.code, chapter_num))
+            '%s/%s/%s/' % (self.site, self.code, chapter_num)
+        )
         chapter.setTitle(chapter.parseTitle(chapter_html))
         chapter.setContent(chapter.parseContent(chapter_html))
         return chapter
