Skip to content

Commit

Permalink
Merge branch 'main' into main
Browse files Browse the repository at this point in the history
  • Loading branch information
jotpalch authored Aug 4, 2024
2 parents bacbddc + a5696ad commit 1d02056
Show file tree
Hide file tree
Showing 5 changed files with 94 additions and 9 deletions.
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -166,4 +166,7 @@ cython_debug/
__MACOSX/

# vscode
.vscode/
.vscode/

# outputs
crawler/output/
44 changes: 41 additions & 3 deletions crawler/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,7 +1,45 @@
# NOTE(review): this span comes from a rendered diff whose +/- markers were lost.
# The alpine FROM below, the second `pip install` and the exec-form CMD near the
# end look like the pre-change lines of the hunk — confirm against the repository.
FROM python:3.10-alpine
# Description: Dockerfile for the selenium crawler

# Base image: official Python 3.10.14 image, slim variant on Debian bullseye
FROM python:3.10.14-slim-bullseye as base

# Builder stage: installs Chrome, ChromeDriver and the Python requirements on top of `base`
FROM base as builder

# Pin Chrome and ChromeDriver to one version; both values are interpolated into
# the download URLs below (presumably kept identical so browser and driver match)
ENV CHROME_VERSION=114.0.5735.90
ENV CHROMEDRIVER_VERSION=114.0.5735.90

# Install the helper packages (xvfb, gnupg, wget, curl, unzip), register Google's
# signing key and apt repository, then refresh the package index.
# NOTE(review): the package index is refreshed here but the Chrome .deb is installed
# in a later RUN, and /var/lib/apt/lists is never removed — consider merging these steps.
RUN apt-get update && \
apt-get install -y xvfb gnupg wget curl unzip --no-install-recommends && \
wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - && \
echo "deb http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list && \
apt-get update -y

# Download the pinned Chrome .deb from the mirror and install it from the local file
# NOTE(review): the downloaded .deb is not deleted afterwards, so it stays in the image.
RUN wget -q https://mirror.cs.uchicago.edu/google-chrome/pool/main/g/google-chrome-stable/google-chrome-stable_${CHROME_VERSION}-1_amd64.deb
RUN apt-get install -y ./google-chrome-stable_${CHROME_VERSION}-1_amd64.deb

# Download the pinned ChromeDriver zip into /chromedriver and unpack it in place
RUN wget -q --continue -P /chromedriver "https://chromedriver.storage.googleapis.com/${CHROMEDRIVER_VERSION}/chromedriver_linux64.zip"
RUN unzip /chromedriver/chromedriver* -d /chromedriver

# Make the chromedriver binary executable and move it to /usr/bin/chromedriver
RUN chmod +x /chromedriver/chromedriver
RUN mv /chromedriver/chromedriver /usr/bin/chromedriver

# Copy requirements.txt to /requirements.txt, install the listed packages without
# pip's cache, then delete the copied file.
# NOTE(review): the delete runs in its own layer, so the file still exists in the earlier COPY layer.
COPY requirements.txt /requirements.txt
RUN pip install --upgrade --no-cache-dir -r /requirements.txt
RUN rm /requirements.txt # Remove requirements file from container

# Final stage: starts from `builder`, so everything installed above is part of the resulting image
FROM builder

# Copy the build context (the crawler source code) into /app
COPY . /app

# Run the remaining instructions and the container command from /app
WORKDIR /app
# NOTE(review): the requirements were already installed from /requirements.txt in the
# builder stage; this second install looks like a leftover pre-change line — confirm.
RUN pip install -r requirements.txt

# Two CMD instructions follow; when a Dockerfile contains several, only the last one
# takes effect. The shell-form CMD prints the Chrome and ChromeDriver versions and
# then starts main.py.
CMD ["python", "main.py"]
CMD google-chrome --version && chromedriver --version && python main.py
4 changes: 2 additions & 2 deletions crawler/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,6 @@ Step2: Use the extracted voyage id to get the newest event which can be queried
### Run the demo with Docker

```bash
docker build -t crawler .
docker run --rm -v ${PWD}/output:/app/output crawler:latest
docker build --platform linux/amd64 -t crawler .
docker run --platform linux/amd64 --rm -v ${PWD}/output:/app/output crawler:latest
```
3 changes: 2 additions & 1 deletion crawler/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
requests==2.32.3
beautifulsoup4==4.12.3
pandas==2.2.2
psycopg2-binary==2.9.9
psycopg2-binary==2.9.9
selenium==4.10.0
47 changes: 45 additions & 2 deletions crawler/utils/fetch.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,13 @@
import time

import requests
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

def fetch_webpage(url: str) -> str:
"""
Expand All @@ -12,8 +21,42 @@ def fetch_webpage(url: str) -> str:
"""

response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})

# Set up the Chrome WebDriver to run in headless mode (in docker container)
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--window-size=1920x1080")

# Use the ChromeDriver binary at the fixed path /usr/bin/chromedriver (no automatic driver download happens here)
service = Service('/usr/bin/chromedriver')
driver = webdriver.Chrome(service=service, options=chrome_options)
driver.get(url)

# Start with the first page's HTML; the HTML of later pages is appended to it below
html = driver.page_source

button = driver.find_element(By.ID, 'ASPx_船舶即時動態_DXPagerBottom_PBN')
i = 0
while True:
if i%20 == 19:
button.click()
time.sleep(5)

html += driver.page_source

button = driver.find_element(By.ID, 'ASPx_船舶即時動態_DXPagerBottom_PBN')

if button.get_attribute('onclick') == None:
break

i = i+1

driver.close()
if response.status_code == 200:
return response.text
return html
else:
print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
return None
return None

0 comments on commit 1d02056

Please sign in to comment.