Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

htmldate and courlan: update setup and tests #444

Merged
6 commits merged on Nov 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def get_long_description():
"brotli",
"cchardet >= 2.1.7; python_version < '3.11'", # build issue
"faust-cchardet >= 2.1.18; python_version >= '3.11'", # fix for build
"htmldate[speed] >= 1.5.1",
"htmldate[speed] >= 1.6.0",
"py3langid >= 0.2.2",
"pycurl >= 7.45.2",
],
Expand Down Expand Up @@ -111,8 +111,8 @@ def get_long_description():
"certifi",
"charset_normalizer >= 3.0.1; python_version < '3.7'",
"charset_normalizer >= 3.2.0; python_version >= '3.7'",
"courlan >= 0.9.4",
"htmldate >= 1.5.1",
"courlan >= 0.9.5",
"htmldate >= 1.6.0",
"justext >= 3.0.0",
"lxml >= 4.9.3 ; platform_system != 'Darwin'",
"lxml == 4.9.2 ; platform_system == 'Darwin'",
Expand Down
2 changes: 1 addition & 1 deletion tests/metadata_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ def test_dates():
metadata = extract_metadata(mystring, fastmode=False)
assert metadata.date == '2017-09-01'
metadata = extract_metadata(mystring, fastmode=True)
assert metadata.date is None
assert metadata.date == '2017-09-01'


def test_sitename():
Expand Down
3 changes: 2 additions & 1 deletion tests/realworld_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -700,8 +700,9 @@ def test_pages():
assert metadata.url == url

url = 'https://www.ndr.de/nachrichten/info/16-Coronavirus-Update-Wir-brauchen-Abkuerzungen-bei-der-Impfstoffzulassung,podcastcoronavirus140.html'
corrected_url = 'https://www.ndr.de/nachrichten/info/16-Coronavirus-Update-Wir-brauchen-Abkuerzungen-bei-der-Impfstoffzulassung,podcastcoronavirus140.html'
metadata = extract_metadata(load_mock_page_meta(url), default_url=url)
assert metadata.url == url
assert metadata.url == corrected_url
assert 'Korinna Hennig' in metadata.author
assert 'Ältere Menschen' in str(metadata.tags)

Expand Down
4 changes: 2 additions & 2 deletions trafilatura/feeds.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
filter_urls,
fix_relative_urls,
get_hostinfo,
validate_url,
is_valid_url,
)

from .downloads import fetch_url
Expand Down Expand Up @@ -186,7 +186,7 @@ def determine_feed(htmlstring: str, params: FeedParameters) -> List[str]:
for link in uniquify_list(feed_urls):
link = fix_relative_urls(params.base, link)
link = clean_url(link)
if link is None or link == params.ref or validate_url(link)[0] is False:
if link is None or link == params.ref or not is_valid_url(link):
continue
if BLACKLIST.search(link):
continue
Expand Down
6 changes: 3 additions & 3 deletions trafilatura/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import re
from copy import deepcopy

from courlan import extract_domain, get_base_url, normalize_url, validate_url
from courlan import extract_domain, get_base_url, is_valid_url, normalize_url, validate_url
from htmldate import find_date
from lxml.html import tostring

Expand Down Expand Up @@ -173,7 +173,7 @@ def extract_opengraph(tree):
title = elem.get('content')
# orig URL
elif elem.get('property') == 'og:url':
if validate_url(elem.get('content'))[0] is True:
if is_valid_url(elem.get('content')):
url = elem.get('content')
# description
elif elem.get('property') == 'og:description':
Expand Down Expand Up @@ -250,7 +250,7 @@ def examine_meta(tree):
backup_sitename = content_attr
# url
elif name_attr == 'twitter:url':
if url is None and validate_url(content_attr)[0] is True:
if url is None and is_valid_url(content_attr):
url = content_attr
# keywords
elif name_attr in METANAME_TAG: # 'page-topic'
Expand Down
Loading