Skip to content

Commit

Permalink
fix syosetu parser for pagination & catch exception on update
Browse files Browse the repository at this point in the history
  • Loading branch information
safirex committed Dec 29, 2024
1 parent c4a8b1d commit a66d1d3
Show file tree
Hide file tree
Showing 4 changed files with 69 additions and 48 deletions.
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
note: ripping is bad, don't do it
note: ripping is bad, don't do it
<span style="color:orange"> the kakuyomu parser doesn't work anymore since the website update of 2024
</span>
# WNovelArchiver
A simple python script to easily download and keep up to date raw web-novels on syosetu and kakuyomu
If you have another WN site (JP/CN/KR/...) which you would like to be usable, feel free to put an issue.
Expand Down
19 changes: 10 additions & 9 deletions src/Chapters.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,7 @@ def cleanText(self,chapter_content):

def createFile(self, path):
chapter_title=checkFileName(self.title)
print("titre"+chapter_title)
print('saving '+str(self.num)+' '+chapter_title)
print('Saving chapter', self.num, chapter_title)

file = open('%s/%s_%s.txt'%(path,self.num,chapter_title), 'w+', encoding='utf-8')
file.write(chapter_title+'\n')
Expand Down Expand Up @@ -126,18 +125,20 @@ def __init__(self,novelNum,num):

def setUrl(self):
    """Build and store the chapter URL from the novel code and chapter number."""
    self.url = 'https://ncode.syosetu.com/{}/{}/'.format(self.novelNum, self.num)

def parseTitle(self, html) -> str:
    """Parse the chapter title from a syosetu chapter page.

    The 2024 site redesign moved the chapter subtitle from
    <p class="novel_subtitle"> into the page's <h1> element.

    :param html: raw HTML of the chapter page
    :return: the title text, or '' when no <h1> is present
    """
    soup = BeautifulSoup(html, 'html.parser')
    title_node = soup.find("h1")
    # Fall back to an empty string instead of crashing on pages
    # that have no <h1> (e.g. error pages).
    return title_node.text if title_node else ""

def parseContent(self, html):
    """Extract the chapter body text from a syosetu chapter page.

    Looks for the post-2024-redesign container <div class="p-novel__body">.

    :param html: raw HTML of the chapter page
    :return: the chapter text (also stored via setContent)
    :raises ValueError: when the body container is missing, so the
        per-novel error handler can log and skip instead of saving an
        empty file.  ValueError (instead of BaseException) keeps the
        raise catchable without also masking KeyboardInterrupt/SystemExit.
    """
    soup = BeautifulSoup(html, 'html.parser')
    content_div = soup.find('div', "p-novel__body")
    if content_div is None:
        raise ValueError("couldn't retrieve the content of the chapter")
    chapter_content = content_div.text
    self.setContent(chapter_content)
    return chapter_content

Expand Down
40 changes: 27 additions & 13 deletions src/Downloaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,7 +238,7 @@ def processNovel(self):
print("novel " + self.titre)
print('last chapter: ' + str(self.getLastChapter()))
try:
html = self.fetchTOCPage();
html = self.fetchTOCPage()
except requests.HTTPError :
print("can't acces the novel TOC page")
return ''
Expand All @@ -263,8 +263,8 @@ def processNovel(self):
def processChapter(self, chapList):
    """Download every chapter of the list and save each one to self.dir.

    :param chapList: iterable of chapter numbers to fetch
    """
    # Stray trailing `pass` removed; it was dead code after the loop.
    for chapter_num in chapList:
        chapter = self.getChapter(chapter_num)
        chapter.createFile(self.dir + '/')

def getChapter(self,chapter_num) ->Chapter:
Expand Down Expand Up @@ -328,16 +328,29 @@ def updatePerDate(self, html):
print("fin update")

def parseOnlineChapterList(self, html='') -> list:
    """Collect every chapter number from the (possibly paginated) TOC.

    Walks the pager by following the link whose class contains
    "c-pager__item--next" until no next page exists, accumulating all
    chapter links along the way.

    :param html: TOC HTML to parse; defaults to self.html when empty
    :return: list of chapter-number strings (empty when the novel has
        no chapters, e.g. it was terminated)
    """
    if html == '':
        html = self.html
    # Raw strings avoid invalid-escape SyntaxWarnings ('\?', '\d');
    # compiled once here instead of on every pagination loop.
    chapter_href = re.compile(r'/' + self.code + r'/(?!\?p=)\d')
    pager_href = re.compile(r'/' + self.code + r'/\?p=.')
    chapter_links = []
    done = False
    while not done:
        # Explicit parser keeps behavior identical across platforms
        # and silences the "no parser specified" warning.
        soup = BeautifulSoup(html, 'html.parser')
        chapter_links += soup.find_all(href=chapter_href)
        next_links = [a for a in soup.find_all(href=pager_href)
                      if "c-pager__item--next" in a['class']]
        if next_links:
            next_page_num = re.split(r'\?p=', next_links[0]['href'])[1]
            html = self.fetchTOCPage(next_page_num)
        else:
            done = True
    # '/code/123/' -> strip trailing slash, keep the last path segment.
    chap_num_list = [re.split('/', a['href'][:-1])[-1] for a in chapter_links]
    if not chap_num_list:
        print("the novel has most likely been terminated\n")
    return chap_num_list
def fetchTOCPage(self, page=0):
url = self.url
if page !=0:
url += "?p="+page
headers = {
"user-agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
}
Expand Down Expand Up @@ -381,11 +394,12 @@ def validateTitle(self, title):
rstr = r"[\/\\\:\*\?\"\<\>\|]"
new_title = re.sub(rstr, "_", title)
return new_title

def parseTitle(self, TocHTML) -> str:
    """Extract the novel title from the table-of-contents HTML.

    Bug fix: the previous code used re.match, which anchors at the very
    start of the string, so it never matched a tag in the middle of the
    page; it also returned the Match object itself instead of the
    captured text.  re.search + group(1) returns the actual title.

    :param TocHTML: raw HTML of the novel's TOC page
    :return: the title text, or '' when no title paragraph is found
    """
    match = re.search(r'<p class="novel_title">(.*?)</p>', TocHTML, re.S)
    title = match.group(1) if match else ''
    print('title = ' + str(title))
    return title



Expand Down
54 changes: 29 additions & 25 deletions src/main_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,32 +22,36 @@ def archiveUpdate(dirList=[],keep_text_format=False):
print(dirList)

for novel_folder in dirList:
print()
novelInfo=getNovelInfoFromFolderName(novel_folder)
#change the fetching process following the site it's hosted on
novel = factory.getNovel(novelInfo[1],novelInfo[0], keep_text_format)
#novel=Novel(novelInfo[1],novelInfo[0],keep_text_format)
#novel=novel.updateObject()
if(novel==0):
print(novel_folder+' couldnt be updated because the code doesnt match known formats')
continue

#now we fetch the local chapters and determine the last chapter stored
chapter_list=os.listdir('./novel_list/%s'%novel_folder)
last_downloaded=0
for chap in chapter_list:
n=chap.find('_')
tmp=chap[:n]
tmp=int(tmp)
if(last_downloaded<tmp):
last_downloaded=tmp
novel.setLastChapter(last_downloaded)
#now that we have the number of the last chapter and the novel code
try:
print()
novelInfo=getNovelInfoFromFolderName(novel_folder)
#change the fetching process following the site it's hosted on
novel = factory.getNovel(novelInfo[1],novelInfo[0], keep_text_format)
#novel=Novel(novelInfo[1],novelInfo[0],keep_text_format)
#novel=novel.updateObject()
if(novel==0):
print(novel_folder+' couldnt be updated because the code doesnt match known formats')
continue

#let's update the archive
novel.setDir('./novel_list/'+novel_folder)
print(type(novel))
novel.processNovel()
#now we fetch the local chapters and determine the last chapter stored
chapter_list=os.listdir('./novel_list/%s'%novel_folder)
last_downloaded=0
for chap in chapter_list:
n=chap.find('_')
tmp=chap[:n]
tmp=int(tmp)
if(last_downloaded<tmp):
last_downloaded=tmp
novel.setLastChapter(last_downloaded)
#now that we have the number of the last chapter and the novel code

#let's update the archive
novel.setDir('./novel_list/'+novel_folder)
print(type(novel))
novel.processNovel()
except BaseException as error:
print(error)
print('An error happened while updating the folder', novel_folder)


def archiveFullUpdate(dirList=[],force=False):
Expand Down

0 comments on commit a66d1d3

Please sign in to comment.