fix syosetu novel title parser
safirex committed Dec 30, 2024
1 parent e160e97 commit f39fb2f
Showing 6 changed files with 177 additions and 54 deletions.
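
For context: the heart of this commit is the Syosetu title parser. The old parseTitle matched <p class="novel_title"> with re.match and returned the raw match object rather than the title text; the new version takes the text of the page's first <h1> heading via BeautifulSoup. A minimal standalone sketch of the new approach (the function name and the "html.parser" argument are illustrative, not copied from the diff):

    from bs4 import BeautifulSoup

    def parse_title(toc_html: str) -> str:
        # Take the first <h1> of the TOC page; fall back to an empty string
        # when the heading is missing (e.g. a removed or terminated novel).
        heading = BeautifulSoup(toc_html, "html.parser").find("h1")
        return heading.text if heading else ""
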
98 changes: 53 additions & 45 deletions src/Downloaders.py
@@ -230,7 +230,7 @@ def parseOnlineChapterList(self, html) -> list:
"""parse the list of chapters from the HTML of the TOC page"""
raise("parseOnlineChapterList method is not defined")

def parseTocResume(self, html=''):
def parseTocResume(self, html='') -> str:
""" format and interpret the content of the home page of the novel """
warnings.warn("This class doesn't have a method to parse the table of content's resume.")

@@ -246,7 +246,10 @@ def processNovel(self):
online_chapter_list = self.parseOnlineChapterList(html)
if (self.getLastChapter() == 0):
resumeContent = self.parseTocResume(html)
# self.save("0_TOC",resumeContent)
print('content of resume', resumeContent)

if resumeContent:
self.createFile(0,"TOC", resumeContent)
if (len(online_chapter_list) >= 1):

# get the chapters url
@@ -258,9 +261,10 @@ def updatePerDate(self, html):
# will add new files for every revised chapters
self.updatePerDate(html)
else:
print("No chapters were found")
print("this web novel has most likely been terminated")

def processChapter(self, chapList):
def processChapter(self, chapList:list):
""" download every chapter of the list """
for chapter_num in chapList:
chap = self.getChapter(chapter_num)
@@ -328,21 +332,21 @@ def updatePerDate(self, html):
print("fin update")

def parseOnlineChapterList(self, html='') -> list:
online_chapter_list = []
chap_num_list = []
done = False
if html == '':
html = self.html
while not done:
soup = BeautifulSoup(html,"html.parser")
online_chapter_list += soup.findAll(href=re.compile('/' + self.code + '/(?!\?p=)\d' ))
chap_num_list += re.findall('href="/' + self.code + '/(\d+)/"' ,html)
nextPage = soup.findAll(href=re.compile('/' + self.code + '/\?p=.' ))
nextPage = list(filter( lambda x: "c-pager__item--next" in x['class'], nextPage))
if nextPage :
nextPageNum = re.split('\?p=',nextPage[0]['href'])[1]
html = self.fetchTOCPage(nextPageNum)
else:
done = True
chap_num_list = list(map(lambda x: re.split('/',x['href'][:-1])[-1], online_chapter_list))

if (chap_num_list is None or len(chap_num_list) == 0):
print("the novel has most likely been terminated\n")
return chap_num_list
@@ -354,23 +358,29 @@ def fetchTOCPage(self, page=0):
headers = {
"user-agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
}
print("fetching ",url)
rep = requests.get(url, headers=headers)
rep.encoding = 'utf-8'
html = rep.text
self.html = html
return html

def parseTocResume(self, html):
soup = BeautifulSoup(html, 'html.parser')
resume = soup.find("div", id="novel_ex")
# resume=re.findall('<div id="novel_ex">'+'(.*?)'+'</div>',html,re.S)[0]
def processTocResume(self,html):
resume = self.parseTocResume(html)
if (resume is None):
print("the novel has most likely been terminated")
else:
# self.cleanText(resume)
string = 'novel title= ' + self.getNovelTitle(html) + '\n\n'
resume.insert(0, string)
self.createFile(0, 'TOC', resume)

def parseTocResume(self, html):
soup = BeautifulSoup(html, 'html.parser').find("h1")
print("soup ",soup.text)
return soup.text if soup else ""
# resume=re.findall('<div id="novel_ex">'+'(.*?)'+'</div>',html,re.S)[0]


def getChapter(self, chapter_num):
chapter = SyosetuChapter(self.code, chapter_num)
@@ -396,33 +406,8 @@ def validateTitle(self, title):
return new_title

def parseTitle(self, TocHTML) -> str:
# testTitle = BeautifulSoup(TocHTML, 'html').find('h1')
writer = re.match(r'<p class="novel_title">(.*?)</p>', TocHTML, re.S)
title = writer if writer else ''
return title




def test():
import os

x = Novel('n6912eh', 'My Skills Are Too Strong to Be a Heroine')

x = x.updateObject()
x.setLastChapter(0)
print(x)
name = x.titre
print(name)
path = './novel_list/' + x.code + ' ' + name
print(path)

print("dir= " + path)
# dir='./novel_list/'+code+' '+name
x.setDir(path)
x.setLastChapter(145)
x.processNovel()

testTitle = BeautifulSoup(TocHTML, 'html').find('h1')
return testTitle.text

class KakuyomuNovel(Novel):
def __init__(self, code, title, keep_text_format):
@@ -527,7 +512,7 @@ def processNovel(self):
if (self.getLastChapter() == 0):
self.processTocResume(html)
# get the number of chapters (solely for user feedback)
online_chapter_list = re.findall('href="/' + self.code + '/(\d+)/"', html)
online_chapter_list = self.parseOnlineChapterList(html)

lastDL = self.getLastChapter()
online_chapter_list = online_chapter_list[lastDL:]
@@ -538,9 +523,25 @@ def processTocResume(self, html):
chap = self.processChapter(int(chapter_num))
chap.createFile(self.dir + '/')

def parseOnlineChapterList(self, html):
online_chapter_list = []
done = False
if html == '':
html = self.html
while not done:
soup = BeautifulSoup(html,"html.parser")
online_chapter_list += re.findall('href="/' + self.code + '/(\d+)/"', html)
nextPage = soup.findAll(href=re.compile('/' + self.code + '/\?p=.' ))
nextPage = list(filter( lambda x: "c-pager__item--next" in x['class'], nextPage))
if nextPage :
nextPageNum = re.split('\?p=',nextPage[0]['href'])[1]
html = self.fetchTOCPage(int(nextPageNum))
else:
done = True
return online_chapter_list

def processTocResume(self, html):
soup = BeautifulSoup(html, 'html.parser')
resume = soup.find("div", id="novel_ex")
resume = self.parseTocResume(html)
# resume=re.findall('<div id="novel_ex">'+'(.*?)'+'</div>',html,re.S)[0]
if (resume is None):
print("the novel has most likely been terminated")
Expand All @@ -549,7 +550,11 @@ def processTocResume(self, html):
string = 'novel title= ' + self.getNovelTitle(html) + '\n\n'
resume.insert(0, string)
self.createFile(0, 'TOC', resume)


def parseTocResume(self, html ):
soup = BeautifulSoup(html, 'html.parser')
return soup.find("div", id="novel_ex")

def processChapter(self, chapter_num):
chapter = N18SyosetuChapter(self.code, chapter_num)
chapter_html = self.connectViaMechanize(
@@ -571,9 +576,9 @@ def getNovelTitle(self, html=''):

if (html == ''):
html = self.connectViaMechanize(url)
writer = re.findall(r'<p class="novel_title">(.*?)</p>', html)
writer = BeautifulSoup( html,"html.parser").find('h1')
# print(writer)
return writer[0]
return writer.text if writer else ''

def __createFile__(self, chapterNumber, chapter_title, chapter_content):
chapter_title = checkFileName(chapter_title)
@@ -586,8 +591,11 @@ def __createFile__(self, chapterNumber, chapter_title, chapter_content):
print('\n\n')


def fetchTOCPage(self):
return self.connectViaMechanize(self.url)
def fetchTOCPage(self, page=0):
url = self.url
if page > 0:
url += "?p="+ str(page)
return self.connectViaMechanize(url)

def connectViaMechanize(self, url):
import http.cookiejar as cookielib
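Beyond the title fix, the Downloaders.py changes move both Syosetu downloaders to a paginated table-of-contents walk: fetchTOCPage now takes a page number and appends ?p=<n> to the URL, and parseOnlineChapterList keeps following the c-pager__item--next link until no further page exists. A rough standalone sketch of that loop, assuming the usual https://ncode.syosetu.com/<code>/ layout (the base URL, headers, and function name are assumptions, not lifted from the diff):

    import re
    import requests
    from bs4 import BeautifulSoup

    def collect_chapter_numbers(code: str) -> list:
        # Walk the TOC pages (?p=2, ?p=3, ...) while a "next" pager link exists
        # and collect every chapter number linked as /<code>/<number>/.
        headers = {"user-agent": "Mozilla/5.0"}
        base_url = "https://ncode.syosetu.com/" + code + "/"  # assumed base URL
        chapter_numbers, next_page = [], ""
        while True:
            html = requests.get(base_url + next_page, headers=headers).text
            chapter_numbers += re.findall(r'href="/' + code + r'/(\d+)/"', html)
            soup = BeautifulSoup(html, "html.parser")
            pager = [a for a in soup.find_all(href=re.compile("/" + code + r"/\?p=."))
                     if "c-pager__item--next" in (a.get("class") or [])]
            if not pager:
                return chapter_numbers
            next_page = "?p=" + re.split(r"\?p=", pager[0]["href"])[1]

Following the pager link instead of hard-coding a page count keeps the loop correct however many TOC pages the site splits a long novel into.
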
13 changes: 4 additions & 9 deletions src/main_functions.py
@@ -2,7 +2,7 @@
import re
import logging
from typing import List

import traceback
from src.Downloaders import *
import zipfile

@@ -18,8 +18,7 @@
def archiveUpdate(dirList=[],keep_text_format=False):
if not dirList:
dirList=os.listdir('./novel_list')
print("list=")
print(dirList)
print("novel folders found =", dirList)

for novel_folder in dirList:
try:
@@ -49,8 +48,8 @@ def archiveUpdate(dirList=[],keep_text_format=False):
novel.setDir('./novel_list/'+novel_folder)
print(type(novel))
novel.processNovel()
except BaseException as error:
print(error)
except BaseException:
print(traceback.format_exc())
print('An error happened while updating the folder', novel_folder)


@@ -108,8 +107,6 @@ def archiveFullUpdate(dirList=[],force=False):
#let's update the archive
novel.processNovel()



def getInputFile() -> List[str]:
"""return code and novel name from input.txt"""
inputfile=open('input.txt','r+', encoding='utf-8')
@@ -149,7 +146,6 @@ def download(keep_text_format=False):
title=novel_info[1]
#print('i '+title)

print('Working on:', code, 'Title:', title, 'Keep Format:', keep_text_format)
#novel=Novel(code,name,keep_text_format)
#novel=novel.updateObject()
novel = factory.getNovel(code, title, keep_text_format)
@@ -195,7 +191,6 @@ def download(keep_text_format=False):
print("novel ",code," hasn't been downloaded" )
raise(err)


def download_cli(userInput:str):
novel_info = userInput.strip().split(';')
if(len(novel_info)<2):
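One more small but useful change in main_functions.py: the bare print(error) in archiveUpdate is replaced with traceback.format_exc(), so a failed folder update now prints the full stack trace before moving on to the next folder. A minimal illustration of the pattern (the function body here is a placeholder, not code from the repository):

    import traceback

    def update_folder(novel_folder: str) -> None:
        try:
            raise RuntimeError("simulated update failure")  # placeholder work
        except BaseException:
            # Print the whole stack trace rather than only the exception message,
            # then let the caller continue with the next folder.
            print(traceback.format_exc())
            print('An error happened while updating the folder', novel_folder)
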
22 changes: 22 additions & 0 deletions tests/tests.py
@@ -0,0 +1,22 @@
import os
import sys
import unittest

sys.path.insert(0, '..')
sys.path.append('..\src')
sys.path.append('src')

from main_functions import findNovel
from Downloaders import *
from archive_updater import check_env

class TestMainFunctions(unittest.TestCase):

def test_find_novel_folder(self):
self.assertTrue(len(findNovel("")) == len(os.listdir('novel_list')))

if __name__ == '__main__':
os.mkdir('novel_list')
unittest.main()
# runner = unittest.TextTestRunner()
# runner.run(suite())
33 changes: 33 additions & 0 deletions tests/tests_builder.py
@@ -0,0 +1,33 @@
import os
import sys
import unittest

sys.path.insert(0, '..')
sys.path.append('..\src')
sys.path.append('src')

from main_functions import findNovel
from Downloaders import *
from archive_updater import check_env


class Test_factory(unittest.TestCase):

def setUp(self):
global factory
factory= NovelFactory()
factory.registerObject(SyosetuNovel)
factory.registerObject(N18SyosetuNovel)
factory.registerObject(KakuyomuNovel)

def test_builder_n18(self):
x= factory.getNovel('n18n2935bp', 'memory rewrite ')
self.assertTrue(x.__class__ == N18SyosetuNovel )

def test_builder_syosetu(self):
novel=factory.getNovel('n5080fi')
self.assertTrue(novel.__class__ == SyosetuNovel)

def test_builder_kakuyomu(self):
novel=factory.getNovel('16816452220453312822')
self.assertTrue(novel.__class__ == KakuyomuNovel)
35 changes: 35 additions & 0 deletions tests/tests_n18.py
@@ -0,0 +1,35 @@
import os
import unittest
from Downloaders import *

class Test_n18(unittest.TestCase):

def setUp(self):
global novel
novel=N18SyosetuNovel('n18n6426w',"test", False)

def test_parse_TOC(self):
html = novel.fetchTOCPage()
self.assertTrue(html != '',"failed to fetch the html page")
toc = novel.parseTocResume(html)
self.assertTrue(toc != '',"failed to parse the TOC resume")

def test_parse_chapter_list(self):
html = novel.fetchTOCPage()
chapter_list = novel.parseOnlineChapterList(html)
self.assertTrue(len(chapter_list) != 0)
for chap_num in chapter_list:
with self.subTest(i=chap_num):
self.assertIsNotNone(int(chap_num))
self.assertTrue(int(chap_num) > 0 )

def test_parse_chapter_content(self):
chapter = novel.processChapter(1)
self.assertTrue(chapter.content !="")
self.assertTrue(chapter.title != "")

if __name__ == '__main__':
os.mkdir('novel_list')
unittest.main()
# runner = unittest.TextTestRunner()
# runner.run(suite())
30 changes: 30 additions & 0 deletions tests/tests_syosetu.py
@@ -0,0 +1,30 @@
import unittest
from Downloaders import *

class Tests_syosetu(unittest.TestCase):


def setUp(self):
global novel
novel= SyosetuNovel('n7671do', "test", False)

def test_parse_TOC(self):
html = novel.fetchTOCPage()
self.assertTrue(html != '',"failed to fetch the html page")
toc_resume = novel.parseTocResume(html)
self.assertTrue(toc_resume != '',"failed to parse the TOC resume")
print("resume = ",toc_resume)

def test_parse_chapter_list(self):
html = novel.fetchTOCPage()
chapter_list = novel.parseOnlineChapterList(html)
self.assertTrue(len(chapter_list) != 0)
for chap_num in chapter_list:
with self.subTest(i=chap_num):
self.assertIsNotNone(int(chap_num))
self.assertTrue(int(chap_num) > 0 )

# def test_parse_chapter_content(self):
# chapter = novel.parse([1])
# self.assertTrue(chapter.content !="")
# self.assertTrue(chapter.title != "")
