From ba185a774eb0127e286da3657ea3d5f009a9d6dc Mon Sep 17 00:00:00 2001 From: Andrey Rakhmatullin Date: Tue, 9 Apr 2024 13:24:02 +0500 Subject: [PATCH 1/4] Add a flag for skipping universal patterns. --- tests/test_matcher.py | 13 +++++++++++++ url_matcher/matcher.py | 17 ++++++++++++----- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/tests/test_matcher.py b/tests/test_matcher.py index f0bbaec..9333df4 100644 --- a/tests/test_matcher.py +++ b/tests/test_matcher.py @@ -161,3 +161,16 @@ def test_match_all(): assert list(matcher.match_all("http://example.com/products")) == [1] assert list(matcher.match_all("http://foo.example.com/products")) == [2, 1] assert list(matcher.match_all("http://bar.example.com/products")) == [3, 4, 1] + + +def test_include_universal(): + matcher = URLMatcher() + matcher.add_or_update(1, Patterns(include=["example.com"])) + matcher.add_or_update(2, Patterns(include=[])) + matcher.add_or_update(3, Patterns(include=["foo.example.com"])) + assert list(matcher.match_all("http://example.com")) == [1, 2] + assert list(matcher.match_all("http://example.com", include_universal=False)) == [1] + assert list(matcher.match_all("http://foo.example.com")) == [3, 1, 2] + assert list(matcher.match_all("http://foo.example.com", include_universal=False)) == [3, 1] + assert list(matcher.match_all("http://example.net")) == [2] + assert list(matcher.match_all("http://example.net", include_universal=False)) == [2] diff --git a/url_matcher/matcher.py b/url_matcher/matcher.py index 99a3544..daae028 100644 --- a/url_matcher/matcher.py +++ b/url_matcher/matcher.py @@ -3,7 +3,6 @@ """ from dataclasses import dataclass, field -from itertools import chain from typing import Any, Dict, Iterable, Iterator, List, Mapping, Optional, Tuple, Union from url_matcher.patterns import PatternMatcher, get_pattern_domain, hierarchical_str @@ -151,14 +150,22 @@ def remove(self, identifier: Any): def get(self, identifier: Any) -> Optional[Patterns]: return self.patterns.get(identifier) - def match(self, url: str) -> Optional[Any]: - return next(self.match_all(url), None) + def match(self, url: str, *, include_universal=True) -> Optional[Any]: + return next(self.match_all(url, include_universal=include_universal), None) - def match_all(self, url: str) -> Iterator[Any]: + def match_all(self, url: str, *, include_universal=True) -> Iterator[Any]: domain = get_domain(url) - for matcher in chain(self.matchers_by_domain.get(domain) or [], self.matchers_by_domain.get("") or []): + domain_matchers = self.matchers_by_domain.get(domain) or [] + domain_match = False + for matcher in domain_matchers: if matcher.match(url): + domain_match = True yield matcher.identifier + if include_universal or not domain_match: + universal_matchers = self.matchers_by_domain.get("") or [] + for matcher in universal_matchers: + if matcher.match(url): + yield matcher.identifier def _sort_domain(self, domain: str): """ From 4bd19b7e2b57eb59387b629ab0bee534248c74a7 Mon Sep 17 00:00:00 2001 From: Andrey Rakhmatullin Date: Wed, 10 Apr 2024 17:13:32 +0500 Subject: [PATCH 2/4] More tests. --- tests/test_matcher.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/test_matcher.py b/tests/test_matcher.py index 9333df4..0c42368 100644 --- a/tests/test_matcher.py +++ b/tests/test_matcher.py @@ -168,9 +168,10 @@ def test_include_universal(): matcher.add_or_update(1, Patterns(include=["example.com"])) matcher.add_or_update(2, Patterns(include=[])) matcher.add_or_update(3, Patterns(include=["foo.example.com"])) - assert list(matcher.match_all("http://example.com")) == [1, 2] + matcher.add_or_update(4, Patterns(include=[""])) + assert list(matcher.match_all("http://example.com")) == [1, 4, 2] assert list(matcher.match_all("http://example.com", include_universal=False)) == [1] - assert list(matcher.match_all("http://foo.example.com")) == [3, 1, 2] + assert list(matcher.match_all("http://foo.example.com")) == [3, 1, 4, 2] assert list(matcher.match_all("http://foo.example.com", include_universal=False)) == [3, 1] - assert list(matcher.match_all("http://example.net")) == [2] - assert list(matcher.match_all("http://example.net", include_universal=False)) == [2] + assert list(matcher.match_all("http://example.net")) == [4, 2] + assert list(matcher.match_all("http://example.net", include_universal=False)) == [4, 2] From d3d9815e486263cb6babdda51a528b01101686f2 Mon Sep 17 00:00:00 2001 From: Andrey Rakhmatullin Date: Thu, 11 Apr 2024 18:52:45 +0500 Subject: [PATCH 3/4] Add URLMatcher.match_universal(). --- tests/test_matcher.py | 13 +++++++++++-- url_matcher/matcher.py | 25 ++++++++++++++++--------- 2 files changed, 27 insertions(+), 11 deletions(-) diff --git a/tests/test_matcher.py b/tests/test_matcher.py index 0c42368..a70ee5d 100644 --- a/tests/test_matcher.py +++ b/tests/test_matcher.py @@ -163,7 +163,7 @@ def test_match_all(): assert list(matcher.match_all("http://bar.example.com/products")) == [3, 4, 1] -def test_include_universal(): +def test_match_all_include_universal(): matcher = URLMatcher() matcher.add_or_update(1, Patterns(include=["example.com"])) matcher.add_or_update(2, Patterns(include=[])) @@ -174,4 +174,13 @@ def test_include_universal(): assert list(matcher.match_all("http://foo.example.com")) == [3, 1, 4, 2] assert list(matcher.match_all("http://foo.example.com", include_universal=False)) == [3, 1] assert list(matcher.match_all("http://example.net")) == [4, 2] - assert list(matcher.match_all("http://example.net", include_universal=False)) == [4, 2] + assert list(matcher.match_all("http://example.net", include_universal=False)) == [] + + +def test_match_universal(): + matcher = URLMatcher() + matcher.add_or_update(1, Patterns(include=["example.com"])) + matcher.add_or_update(2, Patterns(include=[])) + matcher.add_or_update(3, Patterns(include=["foo.example.com"])) + matcher.add_or_update(4, Patterns(include=[""])) + assert list(matcher.match_universal()) == [4, 2] diff --git a/url_matcher/matcher.py b/url_matcher/matcher.py index daae028..2c8c269 100644 --- a/url_matcher/matcher.py +++ b/url_matcher/matcher.py @@ -3,6 +3,7 @@ """ from dataclasses import dataclass, field +from itertools import chain from typing import Any, Dict, Iterable, Iterator, List, Mapping, Optional, Tuple, Union from url_matcher.patterns import PatternMatcher, get_pattern_domain, hierarchical_str @@ -106,6 +107,7 @@ def __init__(self, data: Union[Mapping[Any, Patterns], Iterable[Tuple[Any, Patte initialize the object from """ self.matchers_by_domain: Dict[str, List[PatternsMatcher]] = {} + self.matchers_universal: List[PatternsMatcher] = [] self.patterns: Dict[Any, Patterns] = {} if data: @@ -155,17 +157,15 @@ def match(self, url: str, *, include_universal=True) -> Optional[Any]: def match_all(self, url: str, *, include_universal=True) -> Iterator[Any]: domain = get_domain(url) - domain_matchers = self.matchers_by_domain.get(domain) or [] - domain_match = False - for matcher in domain_matchers: + matchers: Iterable[PatternsMatcher] = self.matchers_by_domain.get(domain) or [] + if include_universal: + matchers = chain(matchers, self.matchers_universal) + for matcher in matchers: if matcher.match(url): - domain_match = True yield matcher.identifier - if include_universal or not domain_match: - universal_matchers = self.matchers_by_domain.get("") or [] - for matcher in universal_matchers: - if matcher.match(url): - yield matcher.identifier + + def match_universal(self) -> Iterator[Any]: + return (m.identifier for m in self.matchers_universal) def _sort_domain(self, domain: str): """ @@ -186,6 +186,7 @@ def sort_key(matcher: PatternsMatcher) -> Tuple: return (matcher.patterns.priority, sorted_includes, matcher.identifier) self.matchers_by_domain[domain].sort(key=sort_key, reverse=True) + self.matchers_universal.sort(key=sort_key, reverse=True) def _del_matcher(self, domain: str, identifier: Any): matchers = self.matchers_by_domain[domain] @@ -195,10 +196,16 @@ def _del_matcher(self, domain: str, identifier: Any): break if not matchers: del self.matchers_by_domain[domain] + for idx in range(len(self.matchers_universal)): + if self.matchers_universal[idx].identifier == identifier: + del self.matchers_universal[idx] + break def _add_matcher(self, domain: str, matcher: PatternsMatcher): # FIXME: This can be made much more efficient if we insert the data directly in order instead of resorting. # The bisect module could be used for this purpose. # I'm leaving it for the future as insertion time is not critical. self.matchers_by_domain.setdefault(domain, []).append(matcher) + if domain == "": + self.matchers_universal.append(matcher) self._sort_domain(domain) From b7107a85a0d97ced799780ff22b78ab7d7fe3e30 Mon Sep 17 00:00:00 2001 From: Andrey Rakhmatullin Date: Fri, 12 Apr 2024 18:14:29 +0500 Subject: [PATCH 4/4] Check universal matchers in removal tests. --- tests/test_matcher.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test_matcher.py b/tests/test_matcher.py index a70ee5d..56fdc7e 100644 --- a/tests/test_matcher.py +++ b/tests/test_matcher.py @@ -74,6 +74,7 @@ def test_matcher_add_remove_get(): matcher.add_or_update(1, patterns) assert matcher.match("http://example.com") == 1 assert matcher.get(1) is patterns + assert list(matcher.match_universal()) == [] patterns_3 = Patterns(["example.com/articles"]) matcher.add_or_update(3, patterns_3) @@ -93,12 +94,14 @@ def test_matcher_add_remove_get(): assert matcher.match("http://example.com") == 2 assert matcher.match("http://example.com/products") == 1 assert matcher.get(2) is univ_patterns + assert list(matcher.match_universal()) == [2] # Removing a universal pattern matcher.remove(2) assert matcher.match("http://example.com") is None assert matcher.match("http://example.com/products") == 1 assert matcher.get(2) is None + assert list(matcher.match_universal()) == [] # Removing regular patterns matcher.remove(3)