diff --git a/Beginner_Projects/quotes.toscrape.com/output.xlsx b/Beginner_Projects/quotes.toscrape.com/output.xlsx new file mode 100644 index 0000000000..4ffa979ba2 Binary files /dev/null and b/Beginner_Projects/quotes.toscrape.com/output.xlsx differ diff --git a/Beginner_Projects/quotes.toscrape.com/readme.md b/Beginner_Projects/quotes.toscrape.com/readme.md new file mode 100644 index 0000000000..c08cb955db --- /dev/null +++ b/Beginner_Projects/quotes.toscrape.com/readme.md @@ -0,0 +1,53 @@ +A simple web scraping program using Python that retrieves data from a website. For this example, we will scrape quotes from [quotes.toscrape.com](http://quotes.toscrape.com/), which is a site specifically designed for practicing web scraping. + +### Simple Web Scraping Program + +#### Requirements +You'll need to install the `requests` and `BeautifulSoup` libraries. You can do this using pip: + +```bash +pip install requests beautifulsoup4 +``` + +### Explanation +1. **Import Libraries**: The program imports the `requests` library to handle HTTP requests and `BeautifulSoup` from `bs4` to parse HTML content. + +2. **Function Definition**: The `scrape_quotes()` function: + - Defines the URL of the site to scrape. + - Sends a GET request to fetch the webpage content. + - Checks if the response status code is 200 (OK). + - Parses the HTML content using BeautifulSoup. + - Finds all quote elements by searching for `div` tags with the class `quote`. + - Loops through each quote element, extracting the text and the author, and prints them. + +3. **Run the Scraper**: The last line calls the `scrape_quotes()` function to execute the scraping process. + +### How to Run the Program +1. Ensure you have Python installed on your machine. +2. Install the required libraries as mentioned above. +3. Copy the provided code into a Python file (e.g., `scrape_quotes.py`). +4. 
def scrape_quotes(url='http://quotes.toscrape.com/', output_path='quotes.xlsx'):
    """Scrape quotes and their authors from *url* and write them to an Excel file.

    Parameters
    ----------
    url : str
        Page to scrape. Defaults to the quotes.toscrape.com practice site.
    output_path : str
        Destination ``.xlsx`` file. Writing requires the ``openpyxl`` engine.

    Side effects: performs an HTTP GET, writes an Excel file on success, and
    prints a status message either way.
    """
    # A timeout prevents the request from hanging indefinitely on an
    # unresponsive host (requests has no default timeout).
    response = requests.get(url, timeout=10)

    if response.status_code == 200:
        # Parse the webpage content.
        soup = BeautifulSoup(response.text, 'html.parser')

        # Each quote lives in a <div class="quote"> element on this site.
        quotes = soup.find_all('div', class_='quote')

        # Collect text/author pairs in parallel lists.
        quotes_list = []
        authors_list = []
        for quote in quotes:
            text = quote.find('span', class_='text').text
            author = quote.find('small', class_='author').text
            quotes_list.append(text)
            authors_list.append(author)

        # Build a two-column DataFrame and export it.
        quotes_df = pd.DataFrame({
            'Quote': quotes_list,
            'Author': authors_list
        })
        quotes_df.to_excel(output_path, index=False)
        print(f"Quotes have been written to {output_path}")
    else:
        print(f'Failed to retrieve webpage. Status code: {response.status_code}')