diff --git a/html_tag_collector/RootURLCache.py b/html_tag_collector/RootURLCache.py
index 2c40edd4..0aadf91e 100644
--- a/html_tag_collector/RootURLCache.py
+++ b/html_tag_collector/RootURLCache.py
@@ -6,7 +6,7 @@
import os
import ssl
-from common import get_user_agent
+from html_tag_collector.common import get_user_agent
DEBUG = False
diff --git a/html_tag_collector/collector.py b/html_tag_collector/collector.py
index 26767dc2..696ac0b4 100644
--- a/html_tag_collector/collector.py
+++ b/html_tag_collector/collector.py
@@ -24,6 +24,7 @@
import requests
from requests_html import AsyncHTMLSession
import asyncio
+from from_root import from_root
import pyppeteer
from tqdm import tqdm
from tqdm.asyncio import tqdm
@@ -32,9 +33,12 @@
import polars as pl
from urllib.parse import urlparse
-from RootURLCache import RootURLCache
-from common import get_user_agent
-from DataClassTags import Tags
+p = from_root(".gitignore").parent
+sys.path.insert(1, str(p))
+
+from html_tag_collector.RootURLCache import RootURLCache
+from html_tag_collector.common import get_user_agent
+from html_tag_collector.DataClassTags import Tags
# Define the list of header tags we want to extract
diff --git a/requirements.txt b/requirements.txt
index 85b8ed8d..035ddf8b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -19,8 +19,10 @@ datasets>=2.17.1
accelerate>=0.27.2
numpy>=1.26.4
multimodal-transformers>=0.3.1
-# html_tag_collector_only
+# html_tag_collector only
requests_html>=0.10.0
lxml~=5.1.0
+lxml_html_clean~=0.4.1
pyppeteer>=2.0.0
beautifulsoup4>=4.12.3
+from_root
diff --git a/source_collectors/ckan/ckan_identifier.py b/source_collectors/ckan/ckan_identifier.py
new file mode 100644
index 00000000..c79a695d
--- /dev/null
+++ b/source_collectors/ckan/ckan_identifier.py
@@ -0,0 +1,96 @@
+"""This program identifies if a given URL is a CKAN-hosted website"""
+
+import re
+import sys
+
+import asyncio
+from bs4 import BeautifulSoup
+from from_root import from_root
+import polars as pl
+from polars import DataFrame
+import requests
+from requests import Response
+
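+# Locate the repository root (the directory containing .gitignore) and add it to
+# sys.path so html_tag_collector can be imported regardless of the working directory.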
+p = from_root(".gitignore").parent
+sys.path.insert(1, str(p))
+
+from html_tag_collector.collector import run_get_response
+
+
+def get_responses(urls: list[str]) -> list[Response]:
+ """Uses the tag collector's run_get_response method to get response objects for each URL.
+
+ :param urls: The list of URLs.
+ :return: The list of resulting responses.
+ """
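+    # run_get_response is asynchronous; schedule it on the event loop and block
+    # until all responses have been collected.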
+ loop = asyncio.get_event_loop()
+ future = asyncio.ensure_future(run_get_response(urls))
+ loop.run_until_complete(future)
+ return future.result()
+
+
+def is_ckan_hosted(response: Response) -> bool:
+ """Checks if the response content contains the CKAN version tag.
+
+ :param response: The response object.
+ :return: True if the CKAN version tag is found, False otherwise.
+ """
+ if not response.ok:
+ return False
+
+ soup = BeautifulSoup(response.content, "lxml")
+
+ if soup.head is None:
+ return False
+
+    # Check whether the CKAN version tag is present; it looks like this:
+    # <meta name="generator" content="ckan X.Y.Z" />
+    ckan_tag = soup.head.find(content=re.compile(r"ckan \d+\.\d+\.\d+"))
+ if ckan_tag is not None:
+ return True
+
+ return False
+
+
+def ckan_identifier(
+    urls: list[str] | None = None, write_output_csv: bool = False
+) -> DataFrame:
+    """Identifies whether each URL in a list is hosted using CKAN.
+
+    :param urls: List of URLs to identify, defaults to None.
+        If None, the URLs are read from a CSV file specified on the command line at runtime.
+ :param write_output_csv: Whether to output the results to a CSV file, defaults to False.
+ :return: Returns a DataFrame with URLs and their labels.
+ """
+ if urls is None:
+ file = sys.argv[1]
+ df = pl.read_csv(file)
+ else:
+ df = pl.DataFrame([pl.Series("url", urls)])
+
+ results = get_responses(urls=list(df["url"]))
+
+ results_df = pl.from_dicts(results)
+ urls_and_responses = pl.DataFrame(
+ [
+ pl.Series("url", df["url"]),
+ pl.Series("response", results_df["response"]),
+ ]
+ )
+
+ # Add a new column indicating if the URL contains the CKAN version tag
+ urls_and_responses = urls_and_responses.with_columns(
+ pl.col("response")
+ .map_elements(is_ckan_hosted, return_dtype=bool)
+ .alias("is_ckan_hosted")
+ )
+
+ output_columns = urls_and_responses.select(["url", "is_ckan_hosted"])
+    if write_output_csv:
+ output_columns.write_csv("output.csv")
+
+ return output_columns
+
+
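+# Example usage (file name is illustrative): python ckan_identifier.py urls.csv
+# The CSV must contain a "url" column.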
+if __name__ == "__main__":
+ ckan_identifier(write_output_csv=True)