Load data fix and table of contents generation #76

Open · wants to merge 1 commit into base: main
4 changes: 4 additions & 0 deletions README.md
@@ -12,6 +12,10 @@ Please watch the [demonstration video](https://youtu.be/X4msqCulOYk).

You'll need to find the *authToken*, *bookId* and *reCaptchaToken* by analyzing the browser/websocket traffic and replace the corresponding constants in downloader.py.
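
The constants live near the top of downloader.py. AUTH_TOKEN is the name the script actually uses in its websocket payloads; the other two names below are illustrative placeholders, so match whatever the script defines:

```python
# downloader.py: fill these in from the captured browser/websocket traffic.
AUTH_TOKEN = "<authToken from the captured traffic>"
BOOK_ID = "<bookId from the captured traffic>"                  # placeholder name
RECAPTCHA_TOKEN = "<reCaptchaToken from the captured traffic>"  # placeholder name
```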

UPDATE:

You'll also need to capture the "data" value from the "loadData" event in the browser/websocket traffic for every chapter and add each one to TOKEN_LIST in downloader.py, in reading order.
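
Each TOKEN_LIST entry is the raw "data" value copied verbatim from one captured loadData frame, one entry per chapter. A minimal sketch with placeholder values only (the real shape of each value depends on what your capture shows):

```python
# downloader.py: placeholders only; paste the exact "data" values from your own capture.
TOKEN_LIST = [
    "<data value captured for chapter 1>",
    "<data value captured for chapter 2>",
    # one entry per chapter, in reading order
]
```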

## Run!
>$ python3 downloader.py

70 changes: 62 additions & 8 deletions downloader.py
@@ -10,7 +10,7 @@
import ssl
import re
import os

import fitz  # PyMuPDF
import requests
import websocket
from pyppeteer import launch
@@ -22,6 +22,15 @@

PUPPETEER_THREADS = 50


TOKEN_LIST = [
    "CHAPTER_1_Token",
    "CHAPTER_2_Token",
    # Add the captured "data" payload for every chapter here, in reading order
]

tokenIndex = 1  # next TOKEN_LIST entry to send (entry 0 is sent first)

def init_book_delivery():
    while True:
        try:
@@ -110,7 +119,7 @@ def __init__(self):
else:
    raise Exception(f'unknown book format ({book_format})!')

ws.send(json.dumps({"action":"loadPage","data":{"authToken": AUTH_TOKEN, "pageId": list(chapters)[0], "bookType": book_format, "windowWidth":1792, "mergedChapterPartIndex":0}}))
ws.send(json.dumps({"action":"loadPage","data":TOKEN_LIST[0]}))


elif 'pageChunk' in data['event']:
@@ -153,7 +162,11 @@ def __init__(self):
merged_chapter_part_idx += 1
next_page = page_id

ws.send(json.dumps({"action":"loadPage","data":{"authToken": AUTH_TOKEN, "pageId": str(next_page), "bookType": book_format, "windowWidth":1792, "mergedChapterPartIndex":merged_chapter_part_idx}}))
ws.send(json.dumps({"action":"loadPage","data":TOKEN_LIST[tokenIndex+1]}))
tokenIndex = tokenIndex + 1
if (tokenIndex+1 == len(TOKEN_LIST)):
break


break

@@ -209,8 +222,10 @@ async def render_page(chapter_no, semaphore):

# remove useless img (mess up with pdf gen)
if book_format == 'EPUB':
match = re.search('<img id="trigger" data-chapterid="[0-9]*?" src="" onerror="LoadChapter\(\'[0-9]*?\'\)" />', content).group(0)
if match: content = content.replace(match, '')
match = re.search(r'<img id="trigger" data-chapterid="[0-9]*?" src="" onerror="LoadChapter\(\'[0-9]*?\'\)" />', content)
if match:
    content = content.replace(match.group(0), '')


# reveal hidden images
imgs = re.findall("<img.*?>", content, re.S)
@@ -234,13 +249,12 @@ async def render_page(chapter_no, semaphore):
options['width'] = width
options['height'] = height
elif book_format == 'EPUB':
options['margin'] = {'top': '20', 'bottom': '20', 'left': '20', 'right': '20'}
options['margin'] = {'top': '20', 'bottom': '20', 'left': '40', 'right': '40'}

# build pdf
await page.pdf(options)
await page.close()

print(f"{chapter_no}.pdf created")

sem = asyncio.Semaphore(PUPPETEER_THREADS)
await asyncio.gather(*[render_page(chapter_no, sem) for chapter_no in contents if not os.path.exists(f'{cache_dir}/{chapter_no}.pdf')])
@@ -256,8 +270,48 @@ async def render_page(chapter_no, semaphore):
print('merging pdf pages...')
merger = PdfMerger()

for chapter_no in sorted(contents):
    chapter_pdf_path = os.path.join(cache_dir, f'{chapter_no}.pdf')

    # Derive a bookmark title from the first non-empty page of the chapter,
    # skipping a leading page number and falling back to a generic label.
    chapter_title = f"Chapter {chapter_no}"
    pdf_document = fitz.open(chapter_pdf_path)
    for page in pdf_document:
        lines = [line.strip() for line in page.get_text("text").split('\n') if line.strip()]
        if not lines:
            continue  # blank page, check the next one
        if lines[0].isdigit() and len(lines) > 1:
            chapter_title = lines[1]  # first line is just a page number
        elif not lines[0].isdigit():
            chapter_title = lines[0]
        break
    pdf_document.close()

    merger.append(chapter_pdf_path, bookmark=chapter_title)

merger.write(f"{book_title}.pdf")
merger.close()
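
As an optional sanity check (not part of the downloader itself), the generated table of contents can be read back with PyMuPDF, which the script already imports as fitz. The filename below is a placeholder for whatever f"{book_title}.pdf" produced on your run:

```python
import fitz  # PyMuPDF

doc = fitz.open("YourBookTitle.pdf")  # placeholder: use the actual merged file name
for level, title, page in doc.get_toc():
    print(f"{'  ' * (level - 1)}{title} (page {page})")
doc.close()
```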
1 change: 1 addition & 0 deletions requirements.txt
@@ -3,3 +3,4 @@ websocket-client==1.4.0
pyppeteer==1.0.2
PyPDF2==2.10.5
Pillow==8.4.0
PyMuPDF==1.21.1  # imported as "fitz"; the PyPI package named "fitz" is unrelated