From b77d81d77944cf9a5334920a44a5b7164233d709 Mon Sep 17 00:00:00 2001
From: Kylie <github.a5cee@kyscott.aleeas.com>
Date: Wed, 20 Nov 2024 17:07:10 -0700
Subject: [PATCH 1/6] Initial commit

---
 source_collectors/ckan/ckan_identifier.py | 27 +++++++++++++++++++++++
 1 file changed, 27 insertions(+)
 create mode 100644 source_collectors/ckan/ckan_identifier.py

diff --git a/source_collectors/ckan/ckan_identifier.py b/source_collectors/ckan/ckan_identifier.py
new file mode 100644
index 00000000..d900ce59
--- /dev/null
+++ b/source_collectors/ckan/ckan_identifier.py
@@ -0,0 +1,27 @@
+"""This program identifies if a given URL is a CKAN-hosted website"""
+import re
+
+from bs4 import BeautifulSoup
+from from_root import from_root
+import requests
+
+
+def is_ckan_hosted(url):
+    head = requests.get(url)
+    soup = BeautifulSoup(head.content, "lxml")
+    
+    ckan_tag = soup.head.find(content=re.compile("ckan \d+\.\d+\.\d+"))
+    if ckan_tag is not None:
+        return True
+    
+    return False
+
+
+def main():
+    url = "https://www.w3schools.com/python/python_regex.asp"
+    print(is_ckan_hosted(url))
+
+
+if __name__ == "__main__":
+    main()
+    
\ No newline at end of file

From 2a5026be0d1c2908f6df22b996d0b197559acb68 Mon Sep 17 00:00:00 2001
From: Kylie <github.a5cee@kyscott.aleeas.com>
Date: Thu, 21 Nov 2024 18:23:01 -0700
Subject: [PATCH 2/6] Update collector with from_root

---
 html_tag_collector/RootURLCache.py |  2 +-
 html_tag_collector/collector.py    | 10 +++++++---
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/html_tag_collector/RootURLCache.py b/html_tag_collector/RootURLCache.py
index 2c40edd4..0aadf91e 100644
--- a/html_tag_collector/RootURLCache.py
+++ b/html_tag_collector/RootURLCache.py
@@ -6,7 +6,7 @@
 import os
 import ssl
 
-from common import get_user_agent
+from html_tag_collector.common import get_user_agent
 
 DEBUG = False
 
diff --git a/html_tag_collector/collector.py b/html_tag_collector/collector.py
index 26767dc2..696ac0b4 100644
--- a/html_tag_collector/collector.py
+++ b/html_tag_collector/collector.py
@@ -24,6 +24,7 @@
 import requests
 from requests_html import AsyncHTMLSession
 import asyncio
+from from_root import from_root
 import pyppeteer
 from tqdm import tqdm
 from tqdm.asyncio import tqdm
@@ -32,9 +33,12 @@
 import polars as pl
 from urllib.parse import urlparse
 
-from RootURLCache import RootURLCache
-from common import get_user_agent
-from DataClassTags import Tags
+p = from_root(".gitignore").parent
+sys.path.insert(1, str(p))
+
+from html_tag_collector.RootURLCache import RootURLCache
+from html_tag_collector.common import get_user_agent
+from html_tag_collector.DataClassTags import Tags
 
 
 # Define the list of header tags we want to extract

From c5a4f7a22c9b14c8fc4a035169a54ac971024c25 Mon Sep 17 00:00:00 2001
From: Kylie <github.a5cee@kyscott.aleeas.com>
Date: Thu, 21 Nov 2024 18:23:30 -0700
Subject: [PATCH 3/6] Update requirements.txt

---
 requirements.txt | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 85b8ed8d..035ddf8b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -19,8 +19,10 @@ datasets>=2.17.1
 accelerate>=0.27.2
 numpy>=1.26.4
 multimodal-transformers>=0.3.1
-# html_tag_collector_only
+# html_tag_collector only
 requests_html>=0.10.0
 lxml~=5.1.0
+lxml_html_clean~=0.4.1
 pyppeteer>=2.0.0
 beautifulsoup4>=4.12.3
+from_root

From 241c6ac9db0b17e38174cbeede1ab4731265f7ab Mon Sep 17 00:00:00 2001
From: Kylie <github.a5cee@kyscott.aleeas.com>
Date: Thu, 21 Nov 2024 18:24:01 -0700
Subject: [PATCH 4/6] Create identifier

---
 source_collectors/ckan/ckan_identifier.py | 62 ++++++++++++++++++++---
 1 file changed, 54 insertions(+), 8 deletions(-)

diff --git a/source_collectors/ckan/ckan_identifier.py b/source_collectors/ckan/ckan_identifier.py
index d900ce59..a3aa9acb 100644
--- a/source_collectors/ckan/ckan_identifier.py
+++ b/source_collectors/ckan/ckan_identifier.py
@@ -1,27 +1,73 @@
 """This program identifies if a given URL is a CKAN-hosted website"""
+
 import re
+import sys
 
+import asyncio
 from bs4 import BeautifulSoup
 from from_root import from_root
+import polars as pl
 import requests
+from requests import Response
+
+p = from_root(".gitignore").parent
+sys.path.insert(1, str(p))
+
+from html_tag_collector.collector import run_get_response
+
+
+def get_responses(urls: list[str]) -> list[Response]:
+    """Uses the tag collector's run_get_response method to get response objects for each url.
+
+    :param urls: The list of urls.
+    :return: The list of resulting responses.
+    """
+    loop = asyncio.get_event_loop()
+    future = asyncio.ensure_future(run_get_response(urls))
+    loop.run_until_complete(future)
+    return future.result()
+
 
+def is_ckan_hosted(response: Response) -> bool:
+    """Checks if the response content contains the CKAN version tag.
 
-def is_ckan_hosted(url):
-    head = requests.get(url)
-    soup = BeautifulSoup(head.content, "lxml")
-    
+    :param response: The response object.
+    :return: True if the CKAN version tag is found, False otherwise.
+    """
+    soup = BeautifulSoup(response.content, "lxml")
+
+    # Checks if the CKAN version tag is present, looks like this:
+    # <meta name="generator" content="ckan 2.10.5">
     ckan_tag = soup.head.find(content=re.compile("ckan \d+\.\d+\.\d+"))
     if ckan_tag is not None:
         return True
-    
+
     return False
 
 
 def main():
-    url = "https://www.w3schools.com/python/python_regex.asp"
-    print(is_ckan_hosted(url))
+    file = sys.argv[1]
+    df = pl.read_csv(file)
+
+    results = get_responses(urls=list(df["url"]))
+
+    results_df = pl.from_dicts(results)
+    urls_and_responses = pl.DataFrame(
+        [
+            pl.Series("url", df["url"]),
+            pl.Series("response", results_df["response"]),
+        ]
+    )
+
+    # Add a new column indicating if the URL contains the CKAN version tag
+    urls_and_responses = urls_and_responses.with_columns(
+        pl.col("response")
+        .map_elements(is_ckan_hosted, return_dtype=bool)
+        .alias("is_ckan_hosted")
+    )
+
+    urls_and_responses.select(["url", "is_ckan_hosted"]).write_csv("output.csv")
 
 
 if __name__ == "__main__":
     main()
-    
\ No newline at end of file

From 1b6ee2e621e3a54e63faffbfdf30e3b5d298ec5d Mon Sep 17 00:00:00 2001
From: Kylie <github.a5cee@kyscott.aleeas.com>
Date: Thu, 21 Nov 2024 18:52:57 -0700
Subject: [PATCH 5/6] Modularize the identifier

---
 source_collectors/ckan/ckan_identifier.py | 31 ++++++++++++++++++-----
 1 file changed, 24 insertions(+), 7 deletions(-)

diff --git a/source_collectors/ckan/ckan_identifier.py b/source_collectors/ckan/ckan_identifier.py
index a3aa9acb..7b6a762d 100644
--- a/source_collectors/ckan/ckan_identifier.py
+++ b/source_collectors/ckan/ckan_identifier.py
@@ -7,6 +7,7 @@
 from bs4 import BeautifulSoup
 from from_root import from_root
 import polars as pl
+from polars.dataframe.frame import DataFrame
 import requests
 from requests import Response
 
@@ -17,9 +18,9 @@
 
 
 def get_responses(urls: list[str]) -> list[Response]:
-    """Uses the tag collector's run_get_response method to get response objects for each url.
+    """Uses the tag collector's run_get_response method to get response objects for each URL.
 
-    :param urls: The list of urls.
+    :param urls: The list of URLs.
     :return: The list of resulting responses.
     """
     loop = asyncio.get_event_loop()
@@ -45,9 +46,21 @@ def is_ckan_hosted(response: Response) -> bool:
     return False
 
 
-def main():
-    file = sys.argv[1]
-    df = pl.read_csv(file)
+def ckan_identifier(
+    urls: list[str] = None, write_output_csv: bool = False
+) -> DataFrame:
+    """Identifies whether each URL in a list is hosted on CKAN.
+
+    :param urls: List of URLs to check, defaults to None. If None, the URLs are
+        read from the "url" column of the CSV file named by the first command-line argument.
+    :param write_output_csv: Whether to also write the results to "output.csv", defaults to False.
+    :return: A DataFrame with "url" and "is_ckan_hosted" columns.
+    """
+    if urls is None:
+        file = sys.argv[1]
+        df = pl.read_csv(file)
+    else:
+        df = pl.DataFrame([pl.Series("url", urls)])
 
     results = get_responses(urls=list(df["url"]))
 
@@ -66,8 +79,12 @@ def main():
         .alias("is_ckan_hosted")
     )
 
-    urls_and_responses.select(["url", "is_ckan_hosted"]).write_csv("output.csv")
+    output_columns = urls_and_responses.select(["url", "is_ckan_hosted"])
+    if write_output_csv:
+        output_columns.write_csv("output.csv")
+
+    return output_columns
 
 
 if __name__ == "__main__":
-    main()
+    ckan_identifier(write_output_csv=True)

From 560637f410c183fb887f6f0b63821d1a19a5f6a8 Mon Sep 17 00:00:00 2001
From: Kylie <github.a5cee@kyscott.aleeas.com>
Date: Thu, 21 Nov 2024 22:59:01 -0700
Subject: [PATCH 6/6] Handle failed responses and missing <head> in CKAN identifier

---
 source_collectors/ckan/ckan_identifier.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/source_collectors/ckan/ckan_identifier.py b/source_collectors/ckan/ckan_identifier.py
index 7b6a762d..c79a695d 100644
--- a/source_collectors/ckan/ckan_identifier.py
+++ b/source_collectors/ckan/ckan_identifier.py
@@ -35,11 +35,17 @@ def is_ckan_hosted(response: Response) -> bool:
     :param response: The response object.
     :return: True if the CKAN version tag is found, False otherwise.
     """
+    if not response.ok:
+        return False
+
     soup = BeautifulSoup(response.content, "lxml")
 
+    if soup.head is None:
+        return False
+
     # Checks if the CKAN version tag is present, looks like this:
     # <meta name="generator" content="ckan 2.10.5">
-    ckan_tag = soup.head.find(content=re.compile("ckan \d+\.\d+\.\d+"))
+    ckan_tag = soup.head.find(content=re.compile(r"ckan \d+\.\d+\.\d+"))
     if ckan_tag is not None:
         return True
 