From b77d81d77944cf9a5334920a44a5b7164233d709 Mon Sep 17 00:00:00 2001 From: Kylie Date: Wed, 20 Nov 2024 17:07:10 -0700 Subject: [PATCH 1/6] Initial commit --- source_collectors/ckan/ckan_identifier.py | 27 +++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 source_collectors/ckan/ckan_identifier.py diff --git a/source_collectors/ckan/ckan_identifier.py b/source_collectors/ckan/ckan_identifier.py new file mode 100644 index 00000000..d900ce59 --- /dev/null +++ b/source_collectors/ckan/ckan_identifier.py @@ -0,0 +1,27 @@ +"""This program identifies if a given URL is a CKAN-hosted website""" +import re + +from bs4 import BeautifulSoup +from from_root import from_root +import requests + + +def is_ckan_hosted(url): + head = requests.get(url) + soup = BeautifulSoup(head.content, "lxml") + + ckan_tag = soup.head.find(content=re.compile("ckan \d+\.\d+\.\d+")) + if ckan_tag is not None: + return True + + return False + + +def main(): + url = "https://www.w3schools.com/python/python_regex.asp" + print(is_ckan_hosted(url)) + + +if __name__ == "__main__": + main() + \ No newline at end of file From 2a5026be0d1c2908f6df22b996d0b197559acb68 Mon Sep 17 00:00:00 2001 From: Kylie Date: Thu, 21 Nov 2024 18:23:01 -0700 Subject: [PATCH 2/6] Update collector with from_root --- html_tag_collector/RootURLCache.py | 2 +- html_tag_collector/collector.py | 10 +++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/html_tag_collector/RootURLCache.py b/html_tag_collector/RootURLCache.py index 2c40edd4..0aadf91e 100644 --- a/html_tag_collector/RootURLCache.py +++ b/html_tag_collector/RootURLCache.py @@ -6,7 +6,7 @@ import os import ssl -from common import get_user_agent +from html_tag_collector.common import get_user_agent DEBUG = False diff --git a/html_tag_collector/collector.py b/html_tag_collector/collector.py index 26767dc2..696ac0b4 100644 --- a/html_tag_collector/collector.py +++ b/html_tag_collector/collector.py @@ -24,6 +24,7 @@ import requests from requests_html import AsyncHTMLSession import asyncio +from from_root import from_root import pyppeteer from tqdm import tqdm from tqdm.asyncio import tqdm @@ -32,9 +33,12 @@ import polars as pl from urllib.parse import urlparse -from RootURLCache import RootURLCache -from common import get_user_agent -from DataClassTags import Tags +p = from_root(".gitignore").parent +sys.path.insert(1, str(p)) + +from html_tag_collector.RootURLCache import RootURLCache +from html_tag_collector.common import get_user_agent +from html_tag_collector.DataClassTags import Tags # Define the list of header tags we want to extract From c5a4f7a22c9b14c8fc4a035169a54ac971024c25 Mon Sep 17 00:00:00 2001 From: Kylie Date: Thu, 21 Nov 2024 18:23:30 -0700 Subject: [PATCH 3/6] Update requirements.txt --- requirements.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 85b8ed8d..035ddf8b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,8 +19,10 @@ datasets>=2.17.1 accelerate>=0.27.2 numpy>=1.26.4 multimodal-transformers>=0.3.1 -# html_tag_collector_only +# html_tag_collector only requests_html>=0.10.0 lxml~=5.1.0 +lxml_html_clean~=0.4.1 pyppeteer>=2.0.0 beautifulsoup4>=4.12.3 +from_root From 241c6ac9db0b17e38174cbeede1ab4731265f7ab Mon Sep 17 00:00:00 2001 From: Kylie Date: Thu, 21 Nov 2024 18:24:01 -0700 Subject: [PATCH 4/6] Create identifier --- source_collectors/ckan/ckan_identifier.py | 62 ++++++++++++++++++++--- 1 file changed, 54 insertions(+), 8 deletions(-) diff --git a/source_collectors/ckan/ckan_identifier.py b/source_collectors/ckan/ckan_identifier.py index d900ce59..a3aa9acb 100644 --- a/source_collectors/ckan/ckan_identifier.py +++ b/source_collectors/ckan/ckan_identifier.py @@ -1,27 +1,73 @@ """This program identifies if a given URL is a CKAN-hosted website""" + import re +import sys +import asyncio from bs4 import BeautifulSoup from from_root import from_root +import polars as pl import requests +from requests import Response + +p = from_root(".gitignore").parent +sys.path.insert(1, str(p)) + +from html_tag_collector.collector import run_get_response + + +def get_responses(urls: list[str]) -> list[Response]: + """Uses the tag collector's run_get_response method to get response objects for each url. + + :param urls: The list of urls. + :return: The list of resulting responses. + """ + loop = asyncio.get_event_loop() + future = asyncio.ensure_future(run_get_response(urls)) + loop.run_until_complete(future) + return future.result() + +def is_ckan_hosted(response: Response) -> bool: + """Checks if the response content contains the CKAN version tag. -def is_ckan_hosted(url): - head = requests.get(url) - soup = BeautifulSoup(head.content, "lxml") - + :param response: The response object. + :return: True if the CKAN version tag is found, False otherwise. + """ + soup = BeautifulSoup(response.content, "lxml") + + # Checks if the CKAN version tag is present, looks like this: + # ckan_tag = soup.head.find(content=re.compile("ckan \d+\.\d+\.\d+")) if ckan_tag is not None: return True - + return False def main(): - url = "https://www.w3schools.com/python/python_regex.asp" - print(is_ckan_hosted(url)) + file = sys.argv[1] + df = pl.read_csv(file) + + results = get_responses(urls=list(df["url"])) + + results_df = pl.from_dicts(results) + urls_and_responses = pl.DataFrame( + [ + pl.Series("url", df["url"]), + pl.Series("response", results_df["response"]), + ] + ) + + # Add a new column indicating if the URL contains the CKAN version tag + urls_and_responses = urls_and_responses.with_columns( + pl.col("response") + .map_elements(is_ckan_hosted, return_dtype=bool) + .alias("is_ckan_hosted") + ) + + urls_and_responses.select(["url", "is_ckan_hosted"]).write_csv("output.csv") if __name__ == "__main__": main() - \ No newline at end of file From 1b6ee2e621e3a54e63faffbfdf30e3b5d298ec5d Mon Sep 17 00:00:00 2001 From: Kylie Date: Thu, 21 Nov 2024 18:52:57 -0700 Subject: [PATCH 5/6] Modularize the identifier --- source_collectors/ckan/ckan_identifier.py | 31 ++++++++++++++++++----- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/source_collectors/ckan/ckan_identifier.py b/source_collectors/ckan/ckan_identifier.py index a3aa9acb..7b6a762d 100644 --- a/source_collectors/ckan/ckan_identifier.py +++ b/source_collectors/ckan/ckan_identifier.py @@ -7,6 +7,7 @@ from bs4 import BeautifulSoup from from_root import from_root import polars as pl +from polars.dataframe.frame import DataFrame import requests from requests import Response @@ -17,9 +18,9 @@ def get_responses(urls: list[str]) -> list[Response]: - """Uses the tag collector's run_get_response method to get response objects for each url. + """Uses the tag collector's run_get_response method to get response objects for each URL. - :param urls: The list of urls. + :param urls: The list of URLs. :return: The list of resulting responses. """ loop = asyncio.get_event_loop() @@ -45,9 +46,21 @@ def is_ckan_hosted(response: Response) -> bool: return False -def main(): - file = sys.argv[1] - df = pl.read_csv(file) +def ckan_identifier( + urls: list[str] = None, write_output_csv: bool = False +) -> DataFrame: + """Identifies if each URL in a list is hosted using ckan. + + :param urls: List of URLs to identify, defaults to None. + None will use a CSV file specified on the command line at runtime. + :param write_output_csv: Whether to output the results to a CSV file, defaults to False. + :return: Returns a DataFrame with URLs and their labels. + """ + if urls is None: + file = sys.argv[1] + df = pl.read_csv(file) + else: + df = pl.DataFrame([pl.Series("url", urls)]) results = get_responses(urls=list(df["url"])) @@ -66,8 +79,12 @@ def main(): .alias("is_ckan_hosted") ) - urls_and_responses.select(["url", "is_ckan_hosted"]).write_csv("output.csv") + output_columns = urls_and_responses.select(["url", "is_ckan_hosted"]) + if write_output_csv is True: + output_columns.write_csv("output.csv") + + return output_columns if __name__ == "__main__": - main() + ckan_identifier(write_output_csv=True) From 560637f410c183fb887f6f0b63821d1a19a5f6a8 Mon Sep 17 00:00:00 2001 From: Kylie Date: Thu, 21 Nov 2024 22:59:01 -0700 Subject: [PATCH 6/6] Bug fixes --- source_collectors/ckan/ckan_identifier.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/source_collectors/ckan/ckan_identifier.py b/source_collectors/ckan/ckan_identifier.py index 7b6a762d..c79a695d 100644 --- a/source_collectors/ckan/ckan_identifier.py +++ b/source_collectors/ckan/ckan_identifier.py @@ -35,8 +35,14 @@ def is_ckan_hosted(response: Response) -> bool: :param response: The response object. :return: True if the CKAN version tag is found, False otherwise. """ + if not response.ok: + return False + soup = BeautifulSoup(response.content, "lxml") + if soup.head is None: + return False + # Checks if the CKAN version tag is present, looks like this: # ckan_tag = soup.head.find(content=re.compile("ckan \d+\.\d+\.\d+"))