
Data Collection Pipeline

In this project I have built a data collection pipeline that scrapes book data from the Thrillers section of the Waterstones website with Selenium, stores the text and image data for each book locally, and verifies the scraper's behaviour with unit and integration tests.

Technologies used:

  • Python
  • Selenium - used to automate the web browser: loading pages, dismissing the cookie banner, scrolling, clicking buttons and locating page elements, as sketched below.
  • Requests - used in Milestone 4 to download each book's cover image.
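
A minimal sketch of that kind of interaction (using the same cookie-banner XPath the scraper below relies on):

from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get('https://www.waterstones.com/category/crime-thrillers-mystery/thrillers')
# Dismiss the cookie banner; after this the page is ready to interact with
driver.find_element(by=By.XPATH, value='//*[@id="onetrust-accept-btn-handler"]').click()
driver.quit()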

Milestone 1/2

I set up my GitHub repo and chose the website I would be scraping: Waterstones, focusing on the Thrillers section of the site.

Milestone 3

I have built the Scraper class, which allows the user to call get_website() and accept_cookies(). Following this, it scrolls to the bottom of the page, clicks the "See More" button and prints "There are 96 books on this page". It then copies the link for each book into list_of_links and prints the list. At this stage I have also extracted data from the first book in the list, such as its price, author and star rating, and plan to build on this in the next milestone.

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
import time

# Suppress noisy ChromeDriver logging on Windows
options = webdriver.ChromeOptions()
options.add_experimental_option("excludeSwitches", ["enable-logging"])

class Scraper:
    def __init__(self):
        self.URL = 'https://www.waterstones.com/category/crime-thrillers-mystery/thrillers/page/1'
        self.driver = webdriver.Chrome(options=options)
        self.one_book = ''

        # Only run the full pipeline when this module is executed directly
        if __name__ == "__main__":
            self.get_website()
            self.accept_cookies()
            self.get_link()
            self.get_price()
            self.get_author()
            self.get_rating()
            self.scroll_to_more_books()
            self.get_list_of_links()
            self.scroll()


    def get_website(self):
        self.driver.get(self.URL)

    def accept_cookies(self):
        time.sleep(2)
        try:
            accept_cookies_button = self.driver.find_element(by=By.XPATH, value='//*[@id="onetrust-accept-btn-handler"]')
            accept_cookies_button.click()
        except NoSuchElementException:
            # No cookie banner appeared, so there is nothing to dismiss
            pass

    def get_link(self):
        time.sleep(2)
        # Locate a single book card via its hard-coded product ID
        self.one_book = self.driver.find_element(by=By.XPATH, value='//*[@data-productid="11647634"]')
        a_tag = self.one_book.find_element(by=By.TAG_NAME, value='a')
        link = a_tag.get_attribute('href')
        print(link)

    def get_price(self):
        price = self.driver.find_element(by=By.XPATH, value='//*[@id="p_11647634"]/div/div[2]/div[2]/span[3]').text
        print(price)
    
    def get_author(self):
        author = self.driver.find_element(by=By.XPATH, value='//*[@id="p_11647634"]/div/div[2]/span/a/b').text
        print(author)

    def get_rating(self):
        rating = self.driver.find_element(by=By.XPATH, value='//*[@id="p_11647634"]/div/div[2]/div[3]').text #TODO find out how to make this work with coloured in stars
        print(rating)

    def scroll(self):
        self.driver.execute_script("window.scrollBy(0,document.body.scrollHeight);")

    def scroll_to_more_books(self):
        # Scroll several times so lazy-loaded content appears before clicking "See More"
        self.scroll()
        time.sleep(1)
        self.scroll()
        time.sleep(1)
        self.scroll()
        time.sleep(1)
        show_more_button = self.driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[2]/div[3]/div[3]/button')
        show_more_button.click()
        time.sleep(2)
        self.scroll()
        time.sleep(1)

    def get_list_of_links(self):
        book_shelf = self.driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[2]/div[3]/div[2]')
        book_list = book_shelf.find_elements(by=By.XPATH, value='./div')
        list_of_links = []

        for book in book_list:
            a_tag = book.find_element(by=By.TAG_NAME, value='a')
            link = a_tag.get_attribute('href')
            list_of_links.append(link)

        print(f'There are {len(list_of_links)} books on this page')
        print(list_of_links)
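
As written, this listing only defines the class; to actually run the Milestone 3 pipeline, an instance still has to be created at the bottom of the file (a minimal addition, since __init__ already gates the method calls behind the __name__ check):

scraper = Scraper()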

Milestone 4

In this milestone I used the Scraper() class from the previous milestone as a template to retrieve the text and image data from the page. I created a crawler to iterate through the URLs in my list_of_links and extract the data, storing it in a raw_data folder. Each book is saved in a folder named after its unique ISBN, and each folder contains a JSON file with the extracted text data plus a JPEG image of the book's cover, saved as {ISBN}.jpg. The JSON file contains a dictionary of the book's Title, Author, Rating, Synopsis, ISBN, Number of Pages and Price.
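
Each scraped book therefore produces a directory like the following (the ISBN shown is the one used later in the Milestone 5 tests):

raw_data/
└── 2928377082253/
    ├── data.json
    └── 2928377082253.jpg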

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
import json
import os
import requests
import time



options = webdriver.ChromeOptions() 
options.add_experimental_option("excludeSwitches", ["enable-logging"])

class Scraper:
    def __init__(self):
        self.driver = webdriver.Chrome(options=options)
        self.list_of_links = []
        self.isbn = ''


        if __name__ == "__main__":
            self.get_website()
            self.accept_cookies()
            self.scroll_to_more_books()
            self.get_list_of_links()


    def get_website(self):
        URL = 'https://www.waterstones.com/category/crime-thrillers-mystery/thrillers/format/17'
        self.driver.get(URL)

    def accept_cookies(self):
        time.sleep(2)
        try:
            accept_cookies_button = self.driver.find_element(by=By.XPATH, value='//*[@id="onetrust-accept-btn-handler"]')
            accept_cookies_button.click()
        except NoSuchElementException:
            # No cookie banner appeared, so there is nothing to dismiss
            pass
    
    def scroll(self):
        self.driver.execute_script("window.scrollBy(0,document.body.scrollHeight);")
        time.sleep(1)

    def click_show_more(self):
        show_more_button = self.driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[2]/div[3]/div[3]/button')
        show_more_button.click()
        time.sleep(2)

    def scroll_to_more_books(self):
        self.scroll()
        self.scroll()
        self.scroll()
        self.click_show_more()
        self.scroll()

    def get_list_of_links(self):
        book_shelf = self.driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[2]/div[3]/div[2]')
        book_list = book_shelf.find_elements(by=By.XPATH, value='./div')

        for book in book_list:
            a_tag = book.find_element(by=By.TAG_NAME, value='a')
            link = a_tag.get_attribute('href')
            self.list_of_links.append(link)

        print(f'There are {len(self.list_of_links)} books on this page')
        print(self.list_of_links)
        return self.list_of_links

    def get_title(self):
        title = self.driver.find_element(by=By.XPATH, value='//*[@itemprop="name"]').text
        return title

    def get_price(self):
        price = self.driver.find_element(by=By.XPATH, value='//*[@itemprop="price"]').text
        return price
    
    def get_author(self):
        author = self.driver.find_element(by=By.XPATH, value='//*[@itemprop="author"]').text
        return author

    def get_rating(self):
        # Count the filled-in star icons; a half star adds 0.5 to the rating
        full_stars = self.driver.find_elements(by=By.XPATH, value='//*[@class="star-icon full"]')
        half_star = []
        try:
            half_star.append(self.driver.find_element(by=By.XPATH, value='//*[@class="star-icon half"]'))
        except NoSuchElementException:
            pass
        rating = len(full_stars) + (len(half_star) / 2)
        return rating

    def get_isbn(self):
        isbn = self.driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[2]/div[2]/section[2]/div[2]/div[1]/div/p/i[2]/span').text
        return isbn

    def get_synopsis(self):
        # Concatenate every paragraph of the book description into one string
        synopsis = ''
        description = self.driver.find_element(by=By.XPATH, value='//*[@id="scope_book_description"]')
        list_of_paragraphs = description.find_elements(by=By.TAG_NAME, value='p')
        for paragraph in list_of_paragraphs:
            synopsis += paragraph.text
        return synopsis
        

    def get_number_of_pages(self):
        number_of_pages = self.driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[2]/div[2]/section[2]/div[2]/div[1]/div/p/i[3]/span').text
        return number_of_pages

    def get_all_text_data(self):
        title = self.get_title()
        author = self.get_author()
        rating = self.get_rating()
        synopsis = self.get_synopsis()
        self.isbn = self.get_isbn()
        price = self.get_price()
        number_of_pages = self.get_number_of_pages()
        self.create_product_folder()
        data = {
            'Title': title, 
            'Author': author, 
            'Rating': rating, 
            'Synopsis': synopsis, 
            'ISBN': self.isbn, 
            'Number of Pages': number_of_pages,
            'Price': price
        }
        self.create_json(data)

    def get_cover_image(self):
        img_tag = self.driver.find_element(by=By.XPATH, value='//*[@id="scope_book_image"]')
        image_url = img_tag.get_attribute('src')
        image_data = requests.get(image_url).content
        with open(f'D:/Documents/GitHub/data-collection-pipeline/raw_data/{self.isbn}/{self.isbn}.jpg', 'wb') as handler:
            handler.write(image_data)
        

    def create_raw_data_folder(self):
        if not os.path.exists('D:/Documents/GitHub/data-collection-pipeline/raw_data'):
            os.mkdir('D:/Documents/GitHub/data-collection-pipeline/raw_data')

    def create_product_folder(self):
        # makedirs with exist_ok avoids a crash if this book has been scraped before
        os.makedirs(f'D:/Documents/GitHub/data-collection-pipeline/raw_data/{self.isbn}', exist_ok=True)

    def create_json(self, data):
        with open(f'D:/Documents/GitHub/data-collection-pipeline/raw_data/{self.isbn}/data.json', 'w', encoding='utf-8') as f:
            json.dump(data, f)


def scrape():
    scraper = Scraper()
    for URL in scraper.list_of_links:
        scraper.driver.get(URL)
        time.sleep(1)
        scraper.create_raw_data_folder()
        scraper.get_all_text_data()
        scraper.get_cover_image()

if __name__ == "__main__":
    scrape()
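
For reference, the stored data.json ends up containing something like this (the Title, Author and ISBN are those of the book used in the Milestone 5 tests; the remaining values are illustrative):

{
    "Title": "No Plan B",
    "Author": "Lee Child",
    "Rating": 4.5,
    "Synopsis": "...",
    "ISBN": "2928377082253",
    "Number of Pages": "432",
    "Price": "£9.99"
}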

Milestone 5

In this milestone I refactored my code, added docstrings and created test_scraper.py, which contains a unit test and an integration test. Only one part of my scraper can run without the previous part completing, which is why there is only one unit test. The unit test checks that the Book class is initialised correctly by asserting that the ISBN and Author are correct for the chosen Waterstones link (https://www.waterstones.com/book/no-plan-b/lee-child/andrew-child/2928377082253) and that the price is not equal to 50. I chose these assertions because the ISBN and Author won't change, whereas the price may vary, meaning the test could fail because of a varying factor rather than because the scraper isn't performing. The integration test ensures firstly that the Book class is initialised, then that a product folder with the correct name is created, that the data gets stored as a JSON file, and lastly that the image is stored as a JPG.

from scraper import Scraper
from book import Book
from system import System
from selenium import webdriver
import unittest
from time import sleep
from os import path
from shutil import rmtree


options = webdriver.ChromeOptions() 
options.add_experimental_option("excludeSwitches", ["enable-logging"])

class ScraperTestCase(unittest.TestCase):

    def setUp(self):
        url = "https://www.waterstones.com/book/no-plan-b/lee-child/andrew-child/2928377082253"
        self.driver = driver = webdriver.Chrome(options=options)
        scraper = Scraper(driver)
        scraper.load_website(driver)
        sleep(2)
        driver.get(url)
        sleep(2)

        self.book = Book(driver)
        self.system = System()
        self.system.create_raw_data_folder()

    def test_website_creates_book(self):
        self.assertEqual(self.book.isbn, "2928377082253")
        self.assertNotEqual(self.book.price, 50)
        self.assertEqual(self.book.author, "Lee Child")

    def test_scraper(self):
        self.assertEqual(self.book.isbn, "2928377082253")
        self.assertNotEqual(self.book.price, 100)

        self.system.create_product_folder(self.book)
        dir_path = f"raw_data/{self.book.isbn}"
        self.assertTrue(path.exists(dir_path))

        self.book.store_data_to_json()
        dir_path = f"raw_data/{self.book.isbn}/data.json"
        self.assertTrue(path.exists(dir_path))

        self.book.store_cover_image(self.driver)
        dir_path = f"raw_data/{self.book.isbn}/{self.book.isbn}.jpg"
        self.assertTrue(path.exists(dir_path))

    def tearDown(self):
        self.driver.quit()
        self.remove_product_dir()
        del self.book
        del self.system

    def remove_product_dir(self):
        dir_path = f"raw_data/{self.book.isbn}"
        if path.exists(dir_path):
            rmtree(dir_path)
        
if __name__ == "__main__":
    unittest.main(argv=[''], verbosity=0, exit=False)
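
The suite runs with python test_scraper.py; passing argv=[''] and exit=False to unittest.main lets the tests run from an interactive session as well as the command line, since unittest neither parses the real command-line arguments nor raises SystemExit when it finishes.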
