Skip to content
This repository has been archived by the owner on Dec 1, 2022. It is now read-only.

Remove page and searchbar #22

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 32 additions & 8 deletions scribd_dl/scribd_dl.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
GreaterThanLastPageError,
RestrictedDocumentError
)
from selenium.webdriver.support import expected_conditions as EC


class ScribdDL(object):
Expand Down Expand Up @@ -91,7 +92,7 @@ def start_browser(self):
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--disable-infobars')
chrome_options.add_argument("--window-size=1600,2020")

if self.DRIVER_PATH: # search for chromedriver in assets
self.driver = webdriver.Chrome(executable_path=self.DRIVER_PATH, options=chrome_options)
else: # search for chromedriver in PATH
Expand Down Expand Up @@ -158,8 +159,9 @@ def _process_url(self, url):
except TimeoutException:
pass
try: # Refresh the page in case it could not retrieve the total_pages element
total_pages = self.driver.find_element_by_xpath("//span[@class='total_pages']/span[2]")
total_pages = total_pages.text.split()[1]
# changed the xpath of the total_pages element
total_pages = self.driver.find_element_by_xpath("/html[1]/body[1]/div[1]/div[1]/div[3]/div[1]/div[2]/section[1]/div[1]/div[1]/div[1]/main[1]/div[1]/div[1]/div[2]/div[1]/span[1]/span[2]")
total_pages = total_pages.text.split(' ')[1]
break
except NoSuchElementException: # total_pages element not available, try again
retries += 1
Expand Down Expand Up @@ -195,9 +197,10 @@ def _process_url(self, url):
self._scroll_pages(first_page, last_page, total_pages)

def _scroll_pages(self, first_page, last_page, total_pages):
# Enter full screen mode
fullscreen_xpath = "//button[@aria-label='Fullscreen']"
self.driver.find_element_by_xpath(fullscreen_xpath).click()
# Fullscreen mode is commented out so the document is captured without the pages bar

# fullscreen_xpath = "//button[@aria-label='Fullscreen']"
# self.driver.find_element_by_xpath(fullscreen_xpath).click()
Pages = [] # Holds the actual image bytes of each page
Sizes = [] # Holds the size in bytes (an integer) of each page
to_process = last_page - first_page + 1 # Total pages to process
Expand All @@ -219,18 +222,39 @@ def _scroll_pages(self, first_page, last_page, total_pages):
self.logger.debug('Processing page : %s of %s', counter, last_page, extra=self.extra)

time.sleep(sleep_time)

#
# Converting from RGBA mode to RGB
#
img = Image.open(BytesIO(self.driver.get_screenshot_as_png())) # Save screenshot in memory
newImg = img

bg_colour = (255, 255, 255)

#
# TODO: this conversion should be extracted into a function
#
if img.mode in ('RGBA', 'LA') or (img.mode == 'P' and 'transparency' in img.info):
# Need to convert to RGBA if LA format due to a bug in PIL (http://stackoverflow.com/a/1963146)
alpha = img.convert('RGBA').split()[-1]

# Create a new background image in our matte color.
bg = Image.new("RGB", img.size, bg_colour + (255,))
bg.paste(img, mask=alpha)
newImg = bg
else:
newImg = img

# Crop the image to the specified size
img = img.crop((
newImg = newImg.crop((
page.location['x'],
page.location['y'],
page.location['x'] + page.size['width'],
page.location['y'] + page.size['height']
))
# Append the byte array to List
imgByteArr = BytesIO()
img.save(imgByteArr, format='PNG')
newImg.save(imgByteArr, format='PNG')
Pages.append(imgByteArr.getvalue())

if processed == to_process: # If on the last page
Expand Down