From 51408083f5763e705cc142894fa628ef5e6a12a8 Mon Sep 17 00:00:00 2001
From: Abdullah Alqahtani <0xqahta0@gmail.com>
Date: Thu, 16 Nov 2023 00:38:36 +0300
Subject: [PATCH] feat: add query param for choosing the language (ar & en)

---
 .gitignore                                    |   2 +
 functions/alerts/get_alerts.py                |  13 +-
 functions/crewler/crawle_alerts_ar.py         | 124 ++++++++++++++++++
 .../{crawle_alerts.py => crawle_alerts_en.py} |   0
 4 files changed, 135 insertions(+), 4 deletions(-)
 create mode 100644 functions/crewler/crawle_alerts_ar.py
 rename functions/crewler/{crawle_alerts.py => crawle_alerts_en.py} (100%)

diff --git a/.gitignore b/.gitignore
index e69de29..164eecd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,2 @@
+crawle_alerts_ar
+.env
\ No newline at end of file
diff --git a/functions/alerts/get_alerts.py b/functions/alerts/get_alerts.py
index 997a328..2da6af2 100644
--- a/functions/alerts/get_alerts.py
+++ b/functions/alerts/get_alerts.py
@@ -8,11 +8,13 @@ def lambda_handler(event, context):
 
     page = 1
     limit = 10
+    lang = 'en'
 
     query_params = event.get('queryStringParameters', {})
     if query_params:
         page = int(query_params.get('page', 1))
         limit = int(query_params.get('limit', 10))
+        lang = query_params.get('lang', 'en')
 
     if page < 1 or limit < 1:
         return {
@@ -22,13 +24,15 @@ def lambda_handler(event, context):
         }
 
     try:
-        alerts, total_alerts = get_alerts(page, limit)
+        alerts, total_alerts = get_alerts(
+            page, limit, lang)
 
         next_page_url = None
         total_pages = (total_alerts + limit - 1) // limit
 
         if page < total_pages:
-            next_page_params = urlencode({'page': page + 1, 'limit': limit})
+            next_page_params = urlencode(
+                {'page': page + 1, 'limit': limit, 'lang': lang})
             next_page_url = "https://1tozt5y6hl.execute-api.us-east-1.amazonaws.com/default/get_alerts?" + next_page_params
 
         return {
@@ -50,14 +54,15 @@ def lambda_handler(event, context):
         }
 
 
-def get_alerts(page, limit):
+def get_alerts(page, limit, lang):
     try:
         mongodb_uri = os.getenv('MONGO_URI')
         client = MongoClient(mongodb_uri, serverSelectionTimeoutMS=5000)
         client.server_info()
 
         db = client['alerts_database']
-        collection = db['alerts']
+        collection_name = 'alerts_ar' if lang == 'ar' else 'alerts'
+        collection = db[collection_name]
 
         skip = (page - 1) * limit
 
diff --git a/functions/crewler/crawle_alerts_ar.py b/functions/crewler/crawle_alerts_ar.py
new file mode 100644
index 0000000..76be879
--- /dev/null
+++ b/functions/crewler/crawle_alerts_ar.py
@@ -0,0 +1,124 @@
+import os
+import json
+import requests
+from pymongo import MongoClient
+from bs4 import BeautifulSoup
+
+
+def lambda_handler(event, context):
+    try:
+        from_page = int(event.get("from_page", 1))
+        to_page = int(event.get("to_page", 1)) + 1
+
+        all_alerts = []
+        for i in range(from_page, to_page):
+            data = scrape_page(i)
+            all_alerts.extend(data)
+        store_in_mongodb(all_alerts)
+        return {
+            'statusCode': 200,
+            'body': json.dumps(f'Successfully processed pages {from_page} to {to_page - 1}')
+        }
+    except Exception as e:
+        return {
+            'statusCode': 500,
+            'body': json.dumps(f'Error: {str(e)}')
+        }
+
+
+def store_in_mongodb(data):
+    mongodb_uri = os.getenv('MONGO_URI')
+    client = MongoClient(mongodb_uri)
+    db = client['alerts_database']
+    collection = db['alerts_ar']
+
+    for alert in data:
+        warning_number = alert['details'].get('warning_number')
+        if not collection.find_one({'details.warning_number': warning_number}):
+            collection.insert_one(alert)
+        else:
+            print(
+                f"Alert with warning number {warning_number} already exists, skipping.")
+
+
+def scrape_alert_details(alert_url):
+    response = requests.get(alert_url)
+    if response.status_code == 200:
+        soup = BeautifulSoup(response.content, 'html.parser')
+        alert_details_div = soup.find(
+            'div', class_='cert-body cert-gray-70 m-3')
+
+        details = {}
+
+        if alert_details_div:
+            columns = alert_details_div.find('div', class_='row pb-5')
+            if columns:
+                left_col = columns.find(
+                    'div', class_='col-5 col-md-auto cert-gray-50').find_all('p')
+                right_col = columns.find(
+                    'div', class_='col-7 col-md-9 vertical-line pl-4').find_all('p')
+
+                keys = ["warning_date", "severity_level",
+                        "warning_number", "target_sector"]
+                for key, value in zip(keys, right_col):
+                    details[key] = value.get_text(strip=True)
+
+            paragraph_count = 1
+            list_item_count = 1
+            for child in alert_details_div.find_all(['p', 'li', 'strong']):
+                if child.name == 'p' and child.find('a'):
+                    link_text = child.get_text(
+                        strip=True).split('click')[0].strip()
+                    details[f"link_{paragraph_count}"] = child.find(
+                        'a').get('href', '')
+                    paragraph_count += 1
+                elif child.name == 'li':
+                    details[f"i_{list_item_count}"] = child.get_text(
+                        strip=True)
+                    list_item_count += 1
+                elif child.name == 'p':
+                    details[f"p_{paragraph_count}"] = child.get_text(
+                        strip=True)
+                    paragraph_count += 1
+                elif child.name == 'strong':
+                    strong_text = child.get_text(strip=True)
+                    if strong_text:
+                        details[f"strong_{paragraph_count}"] = strong_text
+                        paragraph_count += 1
+
+        return details
+    else:
+        return f"Failed to retrieve alert details. Status code: {response.status_code}"
+
+
+def scrape_page(page_number):
+    url = f"https://cert.gov.sa/ar/security-warnings/?page={page_number}"
+    response = requests.get(url)
+
+    if response.status_code == 200:
+        soup = BeautifulSoup(response.content, 'html.parser')
+
+        alerts_severity = soup.find_all('div', class_='card-header')
+        alerts_title = soup.find_all('p', class_='cert-card-body-warning')
+        alert_images = soup.find_all(
+            'img', class_=['card-img-top', 'security-alerts-cover-image'])
+        alert_cards = soup.find_all(
+            'div', class_='card mb-4 light-gray-border')
+        alerts_data = []
+
+        for severity, title, image, card in zip(alerts_severity, alerts_title, alert_images, alert_cards):
+            alert_url = "https://cert.gov.sa" + card.find('a').get('href')
+            alert_details = scrape_alert_details(alert_url)
+
+            alert_info = {
+                "title": title.text.strip(),
+                "severity": severity.text.strip(),
+                "logo": "https://cert.gov.sa" + image.get('src'),
+                "alert_url": alert_url,
+                "details": alert_details
+            }
+            alerts_data.append(alert_info)
+
+        return alerts_data
+    else:
+        return f"Failed to retrieve data from page {page_number}. Status code: {response.status_code}"
diff --git a/functions/crewler/crawle_alerts.py b/functions/crewler/crawle_alerts_en.py
similarity index 100%
rename from functions/crewler/crawle_alerts.py
rename to functions/crewler/crawle_alerts_en.py
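
For a quick manual check of the new parameter, a sketch against the API Gateway URL hard-coded in lambda_handler above; that the response JSON exposes a next_page_url key is assumed from the handler's pagination logic, not confirmed by the patch:

    import requests

    BASE_URL = "https://1tozt5y6hl.execute-api.us-east-1.amazonaws.com/default/get_alerts"

    # lang=ar should route the query to the 'alerts_ar' collection;
    # omitting lang (or passing 'en') falls back to the 'alerts' collection.
    resp = requests.get(BASE_URL, params={"page": 1, "limit": 5, "lang": "ar"})
    resp.raise_for_status()
    body = resp.json()

    # Per the urlencode change, pagination links now carry lang forward.
    print(body.get("next_page_url"))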