From 2d68c20f7c81329e77e9be782a76f2a0b187e3b2 Mon Sep 17 00:00:00 2001 From: Facundo Rohr Date: Sat, 5 Jan 2019 12:57:26 -0300 Subject: [PATCH 1/2] code refactoring, removing logs --- scribd_dl/scribd_dl.py | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/scribd_dl/scribd_dl.py b/scribd_dl/scribd_dl.py index 81162e6..ae58647 100644 --- a/scribd_dl/scribd_dl.py +++ b/scribd_dl/scribd_dl.py @@ -23,6 +23,7 @@ GreaterThanLastPageError, RestrictedDocumentError ) +from selenium.webdriver.support import expected_conditions as EC class ScribdDL(object): @@ -91,7 +92,7 @@ def start_browser(self): chrome_options.add_argument('--disable-gpu') chrome_options.add_argument('--disable-infobars') chrome_options.add_argument("--window-size=1600,2020") - + if self.DRIVER_PATH: # search for chromedriver in assets self.driver = webdriver.Chrome(executable_path=self.DRIVER_PATH, options=chrome_options) else: # search for chromedriver in PATH @@ -158,8 +159,9 @@ def _process_url(self, url): except TimeoutException: pass try: # Refresh the page in case it could not retrieve the total_pages element - total_pages = self.driver.find_element_by_xpath("//span[@class='total_pages']/span[2]") - total_pages = total_pages.text.split()[1] + # changed the xpath of the total_pages element + total_pages = self.driver.find_element_by_xpath("/html[1]/body[1]/div[1]/div[1]/div[3]/div[1]/div[2]/section[1]/div[1]/div[1]/div[1]/main[1]/div[1]/div[1]/div[2]/div[1]/span[1]/span[2]") + total_pages = total_pages.text.split(' ')[1] break except NoSuchElementException: # total_pages element not available, try again retries += 1 @@ -219,10 +221,31 @@ def _scroll_pages(self, first_page, last_page, total_pages): self.logger.debug('Processing page : %s of %s', counter, last_page, extra=self.extra) time.sleep(sleep_time) + + # + # Converting from RGBA mode to RGB + # img = Image.open(BytesIO(self.driver.get_screenshot_as_png())) # Save screenshot in memory + newImg = img + + bg_colour = (255, 255, 255) + + # + # THIS SHOULD BE A FUNCTION + # + if img.mode in ('RGBA', 'LA') or (img.mode == 'P' and 'transparency' in img.info): + # Need to convert to RGBA if LA format due to a bug in PIL (http://stackoverflow.com/a/1963146) + alpha = img.convert('RGBA').split()[-1] + + # Create a new background image of our matt color. + bg = Image.new("RGB", img.size, bg_colour + (255,)) + bg.paste(img, mask=alpha) + newImg = bg + else: + newImg = img # Crop the image to the speified size - img = img.crop(( + newImg = newImg.crop(( page.location['x'], page.location['y'], page.location['x'] + page.size['width'], @@ -230,7 +253,7 @@ def _scroll_pages(self, first_page, last_page, total_pages): )) # Append the byte array to List imgByteArr = BytesIO() - img.save(imgByteArr, format='PNG') + newImg.save(imgByteArr, format='PNG') Pages.append(imgByteArr.getvalue()) if processed == to_process: # If on the last page From fe203916c63446e00bb1f723a874fb43d80de06d Mon Sep 17 00:00:00 2001 From: Facundo Rohr Date: Sun, 6 Jan 2019 22:55:05 -0300 Subject: [PATCH 2/2] remove scribd page and search bar from doc --- scribd_dl/scribd_dl.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/scribd_dl/scribd_dl.py b/scribd_dl/scribd_dl.py index ae58647..b9ac499 100644 --- a/scribd_dl/scribd_dl.py +++ b/scribd_dl/scribd_dl.py @@ -197,9 +197,10 @@ def _process_url(self, url): self._scroll_pages(first_page, last_page, total_pages) def _scroll_pages(self, first_page, last_page, total_pages): - # Enter full screen mode - fullscreen_xpath = "//button[@aria-label='Fullscreen']" - self.driver.find_element_by_xpath(fullscreen_xpath).click() + # Fullscreen mode commented to obtain document without the pages bar + + # fullscreen_xpath = "//button[@aria-label='Fullscreen']" + # self.driver.find_element_by_xpath(fullscreen_xpath).click() Pages = [] # Holds the actual image bytes of each page Sizes = [] # Holds the size in bytes (an integer) of each page to_process = last_page - first_page + 1 # Total pages to process