Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CKAN URL Identifier #107

Draft
wants to merge 6 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion html_tag_collector/RootURLCache.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import os
import ssl

from common import get_user_agent
from html_tag_collector.common import get_user_agent

DEBUG = False

Expand Down
10 changes: 7 additions & 3 deletions html_tag_collector/collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import requests
from requests_html import AsyncHTMLSession
import asyncio
from from_root import from_root
import pyppeteer
from tqdm import tqdm
from tqdm.asyncio import tqdm
Expand All @@ -32,9 +33,12 @@
import polars as pl
from urllib.parse import urlparse

from RootURLCache import RootURLCache
from common import get_user_agent
from DataClassTags import Tags
p = from_root(".gitignore").parent
sys.path.insert(1, str(p))

from html_tag_collector.RootURLCache import RootURLCache
from html_tag_collector.common import get_user_agent
from html_tag_collector.DataClassTags import Tags


# Define the list of header tags we want to extract
Expand Down
4 changes: 3 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,10 @@ datasets>=2.17.1
accelerate>=0.27.2
numpy>=1.26.4
multimodal-transformers>=0.3.1
# html_tag_collector_only
# html_tag_collector only
requests_html>=0.10.0
lxml~=5.1.0
lxml_html_clean~=0.4.1
pyppeteer>=2.0.0
beautifulsoup4>=4.12.3
from_root
96 changes: 96 additions & 0 deletions source_collectors/ckan/ckan_identifier.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
"""This program identifies if a given URL is a CKAN-hosted website"""

import re
import sys

import asyncio
from bs4 import BeautifulSoup
from from_root import from_root
import polars as pl
from polars.dataframe.frame import DataFrame
import requests

Check warning on line 11 in source_collectors/ckan/ckan_identifier.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] source_collectors/ckan/ckan_identifier.py#L11 <401>

'requests' imported but unused
Raw output
./source_collectors/ckan/ckan_identifier.py:11:1: F401 'requests' imported but unused
from requests import Response

p = from_root(".gitignore").parent
sys.path.insert(1, str(p))

from html_tag_collector.collector import run_get_response


def get_responses(urls: list[str]) -> list[Response]:
    """Uses the tag collector's run_get_response method to get response objects for each URL.

    :param urls: The list of URLs.
    :return: The list of resulting responses.
    """
    # asyncio.run replaces the deprecated get_event_loop/ensure_future/
    # run_until_complete pattern, which raises on Python 3.12+ when no
    # event loop is already running in the main thread.
    return asyncio.run(run_get_response(urls))


def is_ckan_hosted(response: Response) -> bool:
    """Checks if the response content contains the CKAN version tag.

    :param response: The response object.
    :return: True if the CKAN version tag is found, False otherwise.
    """
    # Non-2xx/3xx responses cannot be trusted to contain real page markup.
    if not response.ok:
        return False

    soup = BeautifulSoup(response.content, "lxml")

    # A page without a <head> cannot carry the generator meta tag.
    if soup.head is None:
        return False

    # CKAN sites advertise themselves with a generator meta tag like:
    # <meta name="generator" content="ckan 2.10.5">
    # Raw string fixes the W605 invalid-escape-sequence warnings.
    ckan_tag = soup.head.find(content=re.compile(r"ckan \d+\.\d+\.\d+"))

    return ckan_tag is not None


def ckan_identifier(
    urls: list[str] | None = None, write_output_csv: bool = False
) -> DataFrame:
    """Identifies if each URL in a list is hosted using ckan.

    :param urls: List of URLs to identify, defaults to None.
        None will use a CSV file specified on the command line at runtime.
    :param write_output_csv: Whether to output the results to a CSV file, defaults to False.
    :return: Returns a DataFrame with URLs and their labels.
    """
    if urls is None:
        # Fall back to a CSV file path given as the first CLI argument.
        file = sys.argv[1]
        df = pl.read_csv(file)
    else:
        df = pl.DataFrame([pl.Series("url", urls)])

    results = get_responses(urls=list(df["url"]))

    # results is expected to be a list of dicts (pl.from_dicts) with a
    # "response" entry per URL; pair each response with its source URL.
    results_df = pl.from_dicts(results)
    urls_and_responses = pl.DataFrame(
        [
            pl.Series("url", df["url"]),
            pl.Series("response", results_df["response"]),
        ]
    )

    # Add a new column indicating if the URL contains the CKAN version tag
    urls_and_responses = urls_and_responses.with_columns(
        pl.col("response")
        .map_elements(is_ckan_hosted, return_dtype=bool)
        .alias("is_ckan_hosted")
    )

    output_columns = urls_and_responses.select(["url", "is_ckan_hosted"])
    if write_output_csv:
        output_columns.write_csv("output.csv")

    return output_columns


if __name__ == "__main__":
ckan_identifier(write_output_csv=True)