diff --git a/html_tag_collector/RootURLCache.py b/html_tag_collector/RootURLCache.py
index 2c40edd4..0aadf91e 100644
--- a/html_tag_collector/RootURLCache.py
+++ b/html_tag_collector/RootURLCache.py
@@ -6,7 +6,7 @@
 import os
 import ssl
 
-from common import get_user_agent
+from html_tag_collector.common import get_user_agent
 
 DEBUG = False
 
diff --git a/html_tag_collector/collector.py b/html_tag_collector/collector.py
index 26767dc2..696ac0b4 100644
--- a/html_tag_collector/collector.py
+++ b/html_tag_collector/collector.py
@@ -24,6 +24,7 @@
 import requests
 from requests_html import AsyncHTMLSession
 import asyncio
+from from_root import from_root
 import pyppeteer
 from tqdm import tqdm
 from tqdm.asyncio import tqdm
@@ -32,9 +33,12 @@
 import polars as pl
 from urllib.parse import urlparse
 
-from RootURLCache import RootURLCache
-from common import get_user_agent
-from DataClassTags import Tags
+p = from_root(".gitignore").parent
+sys.path.insert(1, str(p))
+
+from html_tag_collector.RootURLCache import RootURLCache
+from html_tag_collector.common import get_user_agent
+from html_tag_collector.DataClassTags import Tags
 
 
 # Define the list of header tags we want to extract
diff --git a/requirements.txt b/requirements.txt
index 85b8ed8d..035ddf8b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -19,8 +19,10 @@ datasets>=2.17.1
 accelerate>=0.27.2
 numpy>=1.26.4
 multimodal-transformers>=0.3.1
-# html_tag_collector_only
+# html_tag_collector only
 requests_html>=0.10.0
 lxml~=5.1.0
+lxml_html_clean~=0.4.1
 pyppeteer>=2.0.0
 beautifulsoup4>=4.12.3
+from_root
diff --git a/source_collectors/ckan/ckan_identifier.py b/source_collectors/ckan/ckan_identifier.py
new file mode 100644
index 00000000..c79a695d
--- /dev/null
+++ b/source_collectors/ckan/ckan_identifier.py
@@ -0,0 +1,96 @@
+"""This program identifies if a given URL is a CKAN-hosted website"""
+
+import re
+import sys
+
+import asyncio
+from bs4 import BeautifulSoup
+from from_root import from_root
+import polars as pl
+from polars.dataframe.frame import DataFrame
+import requests
+from requests import Response
+
+p = from_root(".gitignore").parent
+sys.path.insert(1, str(p))
+
+from html_tag_collector.collector import run_get_response
+
+
+def get_responses(urls: list[str]) -> list[Response]:
+    """Uses the tag collector's run_get_response method to get response objects for each URL.
+
+    :param urls: The list of URLs.
+    :return: The list of resulting responses.
+    """
+    loop = asyncio.get_event_loop()
+    future = asyncio.ensure_future(run_get_response(urls))
+    loop.run_until_complete(future)
+    return future.result()
+
+
+def is_ckan_hosted(response: Response) -> bool:
+    """Checks if the response content contains the CKAN version tag.
+
+    :param response: The response object.
+    :return: True if the CKAN version tag is found, False otherwise.
+    """
+    if not response.ok:
+        return False
+
+    soup = BeautifulSoup(response.content, "lxml")
+
+    if soup.head is None:
+        return False
+
+    # Checks if the CKAN version tag is present, looks like this:
+    # <meta name="generator" content="ckan X.Y.Z" />
+    ckan_tag = soup.head.find(content=re.compile(r"ckan \d+\.\d+\.\d+"))
+    if ckan_tag is not None:
+        return True
+
+    return False
+
+
+def ckan_identifier(
+    urls: list[str] = None, write_output_csv: bool = False
+) -> DataFrame:
+    """Identifies if each URL in a list is hosted using CKAN.
+
+    :param urls: List of URLs to identify, defaults to None.
+        None will use a CSV file specified on the command line at runtime.
+    :param write_output_csv: Whether to output the results to a CSV file, defaults to False.
+    :return: Returns a DataFrame with URLs and their labels.
+    """
+    if urls is None:
+        file = sys.argv[1]
+        df = pl.read_csv(file)
+    else:
+        df = pl.DataFrame([pl.Series("url", urls)])
+
+    results = get_responses(urls=list(df["url"]))
+
+    results_df = pl.from_dicts(results)
+    urls_and_responses = pl.DataFrame(
+        [
+            pl.Series("url", df["url"]),
+            pl.Series("response", results_df["response"]),
+        ]
+    )
+
+    # Add a new column indicating if the URL contains the CKAN version tag
+    urls_and_responses = urls_and_responses.with_columns(
+        pl.col("response")
+        .map_elements(is_ckan_hosted, return_dtype=bool)
+        .alias("is_ckan_hosted")
+    )
+
+    output_columns = urls_and_responses.select(["url", "is_ckan_hosted"])
+    if write_output_csv is True:
+        output_columns.write_csv("output.csv")
+
+    return output_columns
+
+
+if __name__ == "__main__":
+    ckan_identifier(write_output_csv=True)
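
As a rough usage sketch of the new module (the import path, CSV file name, and URL below are illustrative assumptions, not taken from the diff), ckan_identifier can be driven from Python or from the command line:

    # Python: returns a polars DataFrame with "url" and "is_ckan_hosted" columns,
    # assuming source_collectors is importable from the repository root
    from source_collectors.ckan.ckan_identifier import ckan_identifier
    labels = ckan_identifier(urls=["https://data.example.org"])  # hypothetical URL

    # Command line: reads a CSV with a "url" column and writes output.csv
    # python source_collectors/ckan/ckan_identifier.py portal_urls.csv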