Skip to content

Commit

Permalink
fix syosetu parser for pagination & catch exception on update
Browse files Browse the repository at this point in the history
  • Loading branch information
safirex committed Dec 29, 2024
1 parent c4a8b1d commit a66d1d3
Show file tree
Hide file tree
Showing 4 changed files with 69 additions and 48 deletions.
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
note: ripping is bad, don't do it
note: ripping is bad, don't do it
<span style="color:orange"> the kakuyomu parser doesn't work anymore since the website update of 2024
</span>
# WNovelArchiver
A simple python script to easily download and keep up to date raw web-novels on syosetu and kakuyomu
If you have another WN site (JP/CN/KR/...) which you would like to be usable, feel free to put an issue.
Expand Down
19 changes: 10 additions & 9 deletions src/Chapters.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,7 @@ def cleanText(self,chapter_content):

def createFile(self, path):
chapter_title=checkFileName(self.title)
print("titre"+chapter_title)
print('saving '+str(self.num)+' '+chapter_title)
print('Saving chapter', self.num, chapter_title)

file = open('%s/%s_%s.txt'%(path,self.num,chapter_title), 'w+', encoding='utf-8')
file.write(chapter_title+'\n')
Expand Down Expand Up @@ -126,18 +125,20 @@ def __init__(self,novelNum,num):

def setUrl(self):
    """Build and store the chapter URL from the novel code and chapter number."""
    self.url = 'https://ncode.syosetu.com/{}/{}/'.format(self.novelNum, self.num)

def parseTitle(self, html) -> str:
    """Parse the chapter title from a syosetu chapter page.

    The 2024 site redesign moved the chapter subtitle from
    <p class="novel_subtitle"> into the page's <h1> element.

    :param html: raw HTML of the chapter page
    :return: the title text, or '' when no <h1> is present
    """
    soup = BeautifulSoup(html, 'html.parser')
    title_node = soup.find("h1")
    # Fall back to an empty string instead of crashing on pages
    # that have no <h1> (e.g. error pages).
    return title_node.text if title_node else ""

def parseContent(self, html):
    """Extract the chapter body text from a syosetu chapter page.

    Looks for the post-2024-redesign container <div class="p-novel__body">.

    :param html: raw HTML of the chapter page
    :return: the chapter text (also stored via setContent)
    :raises ValueError: when the body container is missing, so the
        per-novel error handler can log and skip instead of saving an
        empty file.  ValueError (instead of BaseException) keeps the
        raise catchable without also masking KeyboardInterrupt/SystemExit.
    """
    soup = BeautifulSoup(html, 'html.parser')
    content_div = soup.find('div', "p-novel__body")
    if content_div is None:
        raise ValueError("couldn't retrieve the content of the chapter")
    chapter_content = content_div.text
    self.setContent(chapter_content)
    return chapter_content

Expand Down
40 changes: 27 additions & 13 deletions src/Downloaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,7 +238,7 @@ def processNovel(self):
print("novel " + self.titre)
print('last chapter: ' + str(self.getLastChapter()))
try:
html = self.fetchTOCPage();
html = self.fetchTOCPage()
except requests.HTTPError :
print("can't acces the novel TOC page")
return ''
Expand All @@ -263,8 +263,8 @@ def processNovel(self):
def processChapter(self, chapList):
    """Download every chapter of the list and save each one to self.dir.

    :param chapList: iterable of chapter numbers to fetch
    """
    # Stray trailing `pass` removed; it was dead code after the loop.
    for chapter_num in chapList:
        chapter = self.getChapter(chapter_num)
        chapter.createFile(self.dir + '/')

def getChapter(self,chapter_num) ->Chapter:
Expand Down Expand Up @@ -328,16 +328,29 @@ def updatePerDate(self, html):
print("fin update")

def parseOnlineChapterList(self, html='') -> list:
    """Collect every chapter number from the (possibly paginated) TOC.

    Walks the pager by following the link whose class contains
    "c-pager__item--next" until no next page exists, accumulating all
    chapter links along the way.

    :param html: TOC HTML to parse; defaults to self.html when empty
    :return: list of chapter-number strings (empty when the novel has
        no chapters, e.g. it was terminated)
    """
    if html == '':
        html = self.html
    # Raw strings avoid invalid-escape SyntaxWarnings ('\?', '\d');
    # compiled once here instead of on every pagination loop.
    chapter_href = re.compile(r'/' + self.code + r'/(?!\?p=)\d')
    pager_href = re.compile(r'/' + self.code + r'/\?p=.')
    chapter_links = []
    done = False
    while not done:
        # Explicit parser keeps behavior identical across platforms
        # and silences the "no parser specified" warning.
        soup = BeautifulSoup(html, 'html.parser')
        chapter_links += soup.find_all(href=chapter_href)
        next_links = [a for a in soup.find_all(href=pager_href)
                      if "c-pager__item--next" in a['class']]
        if next_links:
            next_page_num = re.split(r'\?p=', next_links[0]['href'])[1]
            html = self.fetchTOCPage(next_page_num)
        else:
            done = True
    # '/code/123/' -> strip trailing slash, keep the last path segment.
    chap_num_list = [re.split('/', a['href'][:-1])[-1] for a in chapter_links]
    if not chap_num_list:
        print("the novel has most likely been terminated\n")
    return chap_num_list
def fetchTOCPage(self, page=0):
url = self.url
if page !=0:
url += "?p="+page
headers = {
"user-agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
}
Expand Down Expand Up @@ -381,11 +394,12 @@ def validateTitle(self, title):
rstr = r"[\/\\\:\*\?\"\<\>\|]"
new_title = re.sub(rstr, "_", title)
return new_title

def parseTitle(self, TocHTML) -> str:
    """Extract the novel title from the table-of-contents HTML.

    Bug fix: the previous code used re.match, which anchors at the very
    start of the string, so it never matched a tag in the middle of the
    page; it also returned the Match object itself instead of the
    captured text.  re.search + group(1) returns the actual title.

    :param TocHTML: raw HTML of the novel's TOC page
    :return: the title text, or '' when no title paragraph is found
    """
    match = re.search(r'<p class="novel_title">(.*?)</p>', TocHTML, re.S)
    title = match.group(1) if match else ''
    print('title = ' + str(title))
    return title



Expand Down
54 changes: 29 additions & 25 deletions src/main_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,32 +22,36 @@ def archiveUpdate(dirList=[],keep_text_format=False):
print(dirList)

for novel_folder in dirList:
print()
novelInfo=getNovelInfoFromFolderName(novel_folder)
#change the fetching process following the site it's hosted on
novel = factory.getNovel(novelInfo[1],novelInfo[0], keep_text_format)
#novel=Novel(novelInfo[1],novelInfo[0],keep_text_format)
#novel=novel.updateObject()
if(novel==0):
print(novel_folder+' couldnt be updated because the code doesnt match known formats')
continue

#now we fetch the local chapters and determine the last chapter stored
chapter_list=os.listdir('./novel_list/%s'%novel_folder)
last_downloaded=0
for chap in chapter_list:
n=chap.find('_')
tmp=chap[:n]
tmp=int(tmp)
if(last_downloaded<tmp):
last_downloaded=tmp
novel.setLastChapter(last_downloaded)
#now that we have the number of the last chapter and the novel code
try:
print()
novelInfo=getNovelInfoFromFolderName(novel_folder)
#change the fetching process following the site it's hosted on
novel = factory.getNovel(novelInfo[1],novelInfo[0], keep_text_format)
#novel=Novel(novelInfo[1],novelInfo[0],keep_text_format)
#novel=novel.updateObject()
if(novel==0):
print(novel_folder+' couldnt be updated because the code doesnt match known formats')
continue

#let's update the archive
novel.setDir('./novel_list/'+novel_folder)
print(type(novel))
novel.processNovel()
#now we fetch the local chapters and determine the last chapter stored
chapter_list=os.listdir('./novel_list/%s'%novel_folder)
last_downloaded=0
for chap in chapter_list:
n=chap.find('_')
tmp=chap[:n]
tmp=int(tmp)
if(last_downloaded<tmp):
last_downloaded=tmp
novel.setLastChapter(last_downloaded)
#now that we have the number of the last chapter and the novel code

#let's update the archive
novel.setDir('./novel_list/'+novel_folder)
print(type(novel))
novel.processNovel()
except BaseException as error:
print(error)
print('An error happened while updating the folder', novel_folder)


def archiveFullUpdate(dirList=[],force=False):
Expand Down

0 comments on commit a66d1d3

Please sign in to comment.