fix syosetu novel title parser
safirex committed Dec 30, 2024
1 parent e160e97 commit f39fb2f
Showing 6 changed files with 177 additions and 54 deletions.
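
For context: the heart of this commit is the Syosetu title parser. The old parseTitle matched <p class="novel_title"> with re.match and returned the raw match object rather than the title text; the new version takes the text of the page's first <h1> heading via BeautifulSoup. A minimal standalone sketch of the new approach (the function name and the "html.parser" argument are illustrative, not copied from the diff):

    from bs4 import BeautifulSoup

    def parse_title(toc_html: str) -> str:
        # Take the first <h1> of the TOC page; fall back to an empty string
        # when the heading is missing (e.g. a removed or terminated novel).
        heading = BeautifulSoup(toc_html, "html.parser").find("h1")
        return heading.text if heading else ""
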
98 changes: 53 additions & 45 deletions src/Downloaders.py
@@ -230,7 +230,7 @@ def parseOnlineChapterList(self, html) -> list:
"""parse the list of chapters from the HTML of the TOC page"""
raise("parseOnlineChapterList method is not defined")

def parseTocResume(self, html=''):
def parseTocResume(self, html='') -> str:
""" format and interpret the content of the home page of the novel """
warnings.warn("This class doesn't have a method to parse the table of content's resume.")

@@ -246,7 +246,10 @@ def processNovel(self):
online_chapter_list = self.parseOnlineChapterList(html)
if (self.getLastChapter() == 0):
resumeContent = self.parseTocResume(html)
# self.save("0_TOC",resumeContent)
print('content of resume', resumeContent)

if resumeContent:
self.createFile(0,"TOC", resumeContent)
if (len(online_chapter_list) >= 1):

# get the chapters url
@@ -258,9 +261,10 @@ def updatePerDate(self, html):
# will add new files for every revised chapters
self.updatePerDate(html)
else:
print("No chapters were found")
print("this web novel has most likely been terminated")

def processChapter(self, chapList):
def processChapter(self, chapList:list):
""" download every chapter of the list """
for chapter_num in chapList:
chap = self.getChapter(chapter_num)
@@ -328,21 +332,21 @@ def updatePerDate(self, html):
print("fin update")

def parseOnlineChapterList(self, html='') -> list:
online_chapter_list = []
chap_num_list = []
done = False
if html == '':
html = self.html
while not done:
soup = BeautifulSoup(html,"html.parser")
online_chapter_list += soup.findAll(href=re.compile('/' + self.code + '/(?!\?p=)\d' ))
chap_num_list += re.findall('href="/' + self.code + '/(\d+)/"' ,html)
nextPage = soup.findAll(href=re.compile('/' + self.code + '/\?p=.' ))
nextPage = list(filter( lambda x: "c-pager__item--next" in x['class'], nextPage))
if nextPage :
nextPageNum = re.split('\?p=',nextPage[0]['href'])[1]
html = self.fetchTOCPage(nextPageNum)
else:
done = True
chap_num_list = list(map(lambda x: re.split('/',x['href'][:-1])[-1], online_chapter_list))

if (chap_num_list is None or len(chap_num_list) == 0):
print("the novel has most likely been terminated\n")
return chap_num_list
@@ -354,23 +358,29 @@ def fetchTOCPage(self, page=0):
headers = {
"user-agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
}
print("fetching ",url)
rep = requests.get(url, headers=headers)
rep.encoding = 'utf-8'
html = rep.text
self.html = html
return html

def parseTocResume(self, html):
soup = BeautifulSoup(html, 'html.parser')
resume = soup.find("div", id="novel_ex")
# resume=re.findall('<div id="novel_ex">'+'(.*?)'+'</div>',html,re.S)[0]
def processTocResume(self,html):
resume = self.parseTocResume(html)
if (resume is None):
print("the novel has most likely been terminated")
else:
# self.cleanText(resume)
string = 'novel title= ' + self.getNovelTitle(html) + '\n\n'
resume.insert(0, string)
self.createFile(0, 'TOC', resume)

def parseTocResume(self, html):
soup = BeautifulSoup(html, 'html.parser').find("h1")
print("soup ",soup.text)
return soup.text if soup else ""
# resume=re.findall('<div id="novel_ex">'+'(.*?)'+'</div>',html,re.S)[0]


def getChapter(self, chapter_num):
chapter = SyosetuChapter(self.code, chapter_num)
@@ -396,33 +406,8 @@ def validateTitle(self, title):
return new_title

def parseTitle(self, TocHTML) -> str:
# testTitle = BeautifulSoup(TocHTML, 'html').find('h1')
writer = re.match(r'<p class="novel_title">(.*?)</p>', TocHTML, re.S)
title = writer if writer else ''
return title




def test():
import os

x = Novel('n6912eh', 'My Skills Are Too Strong to Be a Heroine')

x = x.updateObject()
x.setLastChapter(0)
print(x)
name = x.titre
print(name)
path = './novel_list/' + x.code + ' ' + name
print(path)

print("dir= " + path)
# dir='./novel_list/'+code+' '+name
x.setDir(path)
x.setLastChapter(145)
x.processNovel()

testTitle = BeautifulSoup(TocHTML, 'html').find('h1')
return testTitle.text

class KakuyomuNovel(Novel):
def __init__(self, code, title, keep_text_format):
@@ -527,7 +512,7 @@ def processNovel(self):
if (self.getLastChapter() == 0):
self.processTocResume(html)
# get the number of chapters (solely for user feedback)
online_chapter_list = re.findall('href="/' + self.code + '/(\d+)/"', html)
online_chapter_list = self.parseOnlineChapterList(html)

lastDL = self.getLastChapter()
online_chapter_list = online_chapter_list[lastDL:]
@@ -538,9 +523,25 @@ def processTocResume(self, html):
chap = self.processChapter(int(chapter_num))
chap.createFile(self.dir + '/')

def parseOnlineChapterList(self, html):
online_chapter_list = []
done = False
if html == '':
html = self.html
while not done:
soup = BeautifulSoup(html,"html.parser")
online_chapter_list += re.findall('href="/' + self.code + '/(\d+)/"', html)
nextPage = soup.findAll(href=re.compile('/' + self.code + '/\?p=.' ))
nextPage = list(filter( lambda x: "c-pager__item--next" in x['class'], nextPage))
if nextPage :
nextPageNum = re.split('\?p=',nextPage[0]['href'])[1]
html = self.fetchTOCPage(int(nextPageNum))
else:
done = True
return online_chapter_list

def processTocResume(self, html):
soup = BeautifulSoup(html, 'html.parser')
resume = soup.find("div", id="novel_ex")
resume = self.parseTocResume(html)
# resume=re.findall('<div id="novel_ex">'+'(.*?)'+'</div>',html,re.S)[0]
if (resume is None):
print("the novel has most likely been terminated")
Expand All @@ -549,7 +550,11 @@ def processTocResume(self, html):
string = 'novel title= ' + self.getNovelTitle(html) + '\n\n'
resume.insert(0, string)
self.createFile(0, 'TOC', resume)


def parseTocResume(self, html ):
soup = BeautifulSoup(html, 'html.parser')
return soup.find("div", id="novel_ex")

def processChapter(self, chapter_num):
chapter = N18SyosetuChapter(self.code, chapter_num)
chapter_html = self.connectViaMechanize(
@@ -571,9 +576,9 @@ def getNovelTitle(self, html=''):

if (html == ''):
html = self.connectViaMechanize(url)
writer = re.findall(r'<p class="novel_title">(.*?)</p>', html)
writer = BeautifulSoup( html,"html.parser").find('h1')
# print(writer)
return writer[0]
return writer.text if writer else ''

def __createFile__(self, chapterNumber, chapter_title, chapter_content):
chapter_title = checkFileName(chapter_title)
@@ -586,8 +591,11 @@ def __createFile__(self, chapterNumber, chapter_title, chapter_content):
print('\n\n')


def fetchTOCPage(self):
return self.connectViaMechanize(self.url)
def fetchTOCPage(self, page=0):
url = self.url
if page > 0:
url += "?p="+ str(page)
return self.connectViaMechanize(url)

def connectViaMechanize(self, url):
import http.cookiejar as cookielib
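Beyond the title fix, the Downloaders.py changes move both Syosetu downloaders to a paginated table-of-contents walk: fetchTOCPage now takes a page number and appends ?p=<n> to the URL, and parseOnlineChapterList keeps following the c-pager__item--next link until no further page exists. A rough standalone sketch of that loop, assuming the usual https://ncode.syosetu.com/<code>/ layout (the base URL, headers, and function name are assumptions, not lifted from the diff):

    import re
    import requests
    from bs4 import BeautifulSoup

    def collect_chapter_numbers(code: str) -> list:
        # Walk the TOC pages (?p=2, ?p=3, ...) while a "next" pager link exists
        # and collect every chapter number linked as /<code>/<number>/.
        headers = {"user-agent": "Mozilla/5.0"}
        base_url = "https://ncode.syosetu.com/" + code + "/"  # assumed base URL
        chapter_numbers, next_page = [], ""
        while True:
            html = requests.get(base_url + next_page, headers=headers).text
            chapter_numbers += re.findall(r'href="/' + code + r'/(\d+)/"', html)
            soup = BeautifulSoup(html, "html.parser")
            pager = [a for a in soup.find_all(href=re.compile("/" + code + r"/\?p=."))
                     if "c-pager__item--next" in (a.get("class") or [])]
            if not pager:
                return chapter_numbers
            next_page = "?p=" + re.split(r"\?p=", pager[0]["href"])[1]

Following the pager link instead of hard-coding a page count keeps the loop correct however many TOC pages the site splits a long novel into.
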
13 changes: 4 additions & 9 deletions src/main_functions.py
@@ -2,7 +2,7 @@
import re
import logging
from typing import List

import traceback
from src.Downloaders import *
import zipfile

@@ -18,8 +18,7 @@
def archiveUpdate(dirList=[],keep_text_format=False):
if not dirList:
dirList=os.listdir('./novel_list')
print("list=")
print(dirList)
print("novel folders found =", dirList)

for novel_folder in dirList:
try:
@@ -49,8 +48,8 @@ def archiveUpdate(dirList=[],keep_text_format=False):
novel.setDir('./novel_list/'+novel_folder)
print(type(novel))
novel.processNovel()
except BaseException as error:
print(error)
except BaseException:
print(traceback.format_exc())
print('An error happened while updating the folder', novel_folder)


@@ -108,8 +107,6 @@ def archiveFullUpdate(dirList=[],force=False):
#let's update the archive
novel.processNovel()



def getInputFile() -> List[str]:
"""return code and novel name from input.txt"""
inputfile=open('input.txt','r+', encoding='utf-8')
@@ -149,7 +146,6 @@ def download(keep_text_format=False):
title=novel_info[1]
#print('i '+title)

print('Working on:', code, 'Title:', title, 'Keep Format:', keep_text_format)
#novel=Novel(code,name,keep_text_format)
#novel=novel.updateObject()
novel = factory.getNovel(code, title, keep_text_format)
@@ -195,7 +191,6 @@ def download(keep_text_format=False):
print("novel ",code," hasn't been downloaded" )
raise(err)


def download_cli(userInput:str):
novel_info = userInput.strip().split(';')
if(len(novel_info)<2):
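One more small but useful change in main_functions.py: the bare print(error) in archiveUpdate is replaced with traceback.format_exc(), so a failed folder update now prints the full stack trace before moving on to the next folder. A minimal illustration of the pattern (the function body here is a placeholder, not code from the repository):

    import traceback

    def update_folder(novel_folder: str) -> None:
        try:
            raise RuntimeError("simulated update failure")  # placeholder work
        except BaseException:
            # Print the whole stack trace rather than only the exception message,
            # then let the caller continue with the next folder.
            print(traceback.format_exc())
            print('An error happened while updating the folder', novel_folder)
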
22 changes: 22 additions & 0 deletions tests/tests.py
@@ -0,0 +1,22 @@
import os
import sys
import unittest

sys.path.insert(0, '..')
sys.path.append('..\src')
sys.path.append('src')

from main_functions import findNovel
from Downloaders import *
from archive_updater import check_env

class TestMainFunctions(unittest.TestCase):

def test_find_novel_folder(self):
self.assertTrue(len(findNovel("")) == len(os.listdir('novel_list')))

if __name__ == '__main__':
os.mkdir('novel_list')
unittest.main()
# runner = unittest.TextTestRunner()
# runner.run(suite())
33 changes: 33 additions & 0 deletions tests/tests_builder.py
@@ -0,0 +1,33 @@
import os
import sys
import unittest

sys.path.insert(0, '..')
sys.path.append('..\src')
sys.path.append('src')

from main_functions import findNovel
from Downloaders import *
from archive_updater import check_env


class Test_factory(unittest.TestCase):

def setUp(self):
global factory
factory= NovelFactory()
factory.registerObject(SyosetuNovel)
factory.registerObject(N18SyosetuNovel)
factory.registerObject(KakuyomuNovel)

def test_builder_n18(self):
x= factory.getNovel('n18n2935bp', 'memory rewrite ')
self.assertTrue(x.__class__ == N18SyosetuNovel )

def test_builder_syosetu(self):
novel=factory.getNovel('n5080fi')
self.assertTrue(novel.__class__ == SyosetuNovel)

def test_builder_kakuyomu(self):
novel=factory.getNovel('16816452220453312822')
self.assertTrue(novel.__class__ == KakuyomuNovel)
35 changes: 35 additions & 0 deletions tests/tests_n18.py
@@ -0,0 +1,35 @@
import os
import unittest
from Downloaders import *

class Test_n18(unittest.TestCase):

def setUp(self):
global novel
novel=N18SyosetuNovel('n18n6426w',"test", False)

def test_parse_TOC(self):
html = novel.fetchTOCPage()
self.assertTrue(html != '',"failed to fetch the html page")
toc = novel.parseTocResume(html)
self.assertTrue(toc != '',"failed to parse the TOC resume")

def test_parse_chapter_list(self):
html = novel.fetchTOCPage()
chapter_list = novel.parseOnlineChapterList(html)
self.assertTrue(len(chapter_list) != 0)
for chap_num in chapter_list:
with self.subTest(i=chap_num):
self.assertIsNotNone(int(chap_num))
self.assertTrue(int(chap_num) > 0 )

def test_parse_chapter_content(self):
chapter = novel.processChapter(1)
self.assertTrue(chapter.content !="")
self.assertTrue(chapter.title != "")

if __name__ == '__main__':
os.mkdir('novel_list')
unittest.main()
# runner = unittest.TextTestRunner()
# runner.run(suite())
30 changes: 30 additions & 0 deletions tests/tests_syosetu.py
@@ -0,0 +1,30 @@
import unittest
from Downloaders import *

class Tests_syosetu(unittest.TestCase):


def setUp(self):
global novel
novel= SyosetuNovel('n7671do', "test", False)

def test_parse_TOC(self):
html = novel.fetchTOCPage()
self.assertTrue(html != '',"failed to fetch the html page")
toc_resume = novel.parseTocResume(html)
self.assertTrue(toc_resume != '',"failed to parse the TOC resume")
print("resume = ",toc_resume)

def test_parse_chapter_list(self):
html = novel.fetchTOCPage()
chapter_list = novel.parseOnlineChapterList(html)
self.assertTrue(len(chapter_list) != 0)
for chap_num in chapter_list:
with self.subTest(i=chap_num):
self.assertIsNotNone(int(chap_num))
self.assertTrue(int(chap_num) > 0 )

# def test_parse_chapter_content(self):
# chapter = novel.parse([1])
# self.assertTrue(chapter.content !="")
# self.assertTrue(chapter.title != "")
