@dataclass
class UrlSwapper:
    """Tool to swap (and reverse swap) links with a numbered replacement link.

    Every distinct URL gets its own replacement link. The mapping from real URL to
    replacement link is kept on the instance, so a URL that was already seen is always
    swapped to the same replacement link and ``reverse_swap_urls`` can restore it later.

    Args:
        token_extractor: The sentence token extractor to be used.
        url_pattern: The pattern to use for replacement. One ``{}`` marks the place where to put the number.
    """

    token_extractor: "TokenExtractor"
    url_pattern: str = "https://link-{}.com"
    _url_map: Dict[str, str] = field(init=False, repr=False)  # map from real url to swapped url

    def __post_init__(self):
        """Do post init."""
        self._url_map = {}

    @staticmethod
    def _replace_all(text: str, replacements: Dict[str, str]) -> str:
        """Replace all keys of ``replacements`` that occur in ``text`` with their values in one pass.

        Doing all substitutions in a single pass (instead of one ``str.replace`` per URL)
        guarantees that text inserted by one substitution is never touched by another one.
        """
        import re  # local import: only needed by this helper

        if not replacements:
            return text
        # Longest alternatives first: a URL that is a prefix of another URL must not shadow it.
        alternatives = sorted(replacements, key=len, reverse=True)
        pattern = re.compile("|".join(re.escape(alternative) for alternative in alternatives))
        # A callable replacement is inserted literally (no backslash / group processing).
        return pattern.sub(lambda match: replacements[match.group(0)], text)

    def swap_urls(self, text: str) -> str:
        """Swap the urls of the text.

        Args:
            text: The text in which the URLs should be swapped.

        Returns:
            The text with every URL found by the token extractor replaced by its numbered replacement link.
        """
        url_set = self.token_extractor.extract_url_set(text)
        # Sorted iteration makes the numbering independent of the (arbitrary) set order.
        for url in sorted(url_set):
            if url not in self._url_map:  # if url is unknown: add it
                self._url_map[url] = self.url_pattern.format(len(self._url_map) + 1)
        return self._replace_all(text, {url: self._url_map[url] for url in url_set})

    def reverse_swap_urls(self, text: str) -> Tuple[str, Set[str]]:
        """Revert the url swap.

        Args:
            text: The text in which swapped URLs should be replaced by the real URLs again.

        Returns:
            The reverted text and a ``set`` of URLs that were unknown by the ``UrlSwapper``.
        """
        reverse_url_map = {v: k for k, v in self._url_map.items()}  # map from swapped url to real url
        url_set = self.token_extractor.extract_url_set(text)
        known_urls = {url: reverse_url_map[url] for url in url_set if url in reverse_url_map}
        no_reverse_swap_urls = {url for url in url_set if url not in reverse_url_map}
        return self._replace_all(text, known_urls), no_reverse_swap_urls
def test_UrlSwapper_swap_urls():
    """A single URL in the text is replaced by the first numbered replacement link."""
    url_swapper = UrlSwapper(TokenExtractor("de_CMC"))
    swapped_text = url_swapper.swap_urls("This is a text with URL: http://may.la.")
    assert swapped_text == "This is a text with URL: https://link-1.com."


@pytest.mark.parametrize(
    "text_with_url",
    [
        "This is a text with URL: http://may.la.",
        "This is a text with Markdown URL: [Philip May](http://may.la).",
        "2 MD URL s: [Philip May](http://may.la). + [other link](https://github.com/telekom/mltb2#installation)",
    ],
)
def test_UrlSwapper__is_reversible(text_with_url: str):
    """Swapping the URLs and reverting the swap gives back the original text."""
    url_swapper = UrlSwapper(TokenExtractor("de_CMC"))
    swapped_text = url_swapper.swap_urls(text_with_url)
    restored_text, unknown_urls = url_swapper.reverse_swap_urls(swapped_text)
    assert restored_text == text_with_url
    assert not unknown_urls


def test_UrlSwapper__no_reverse_swap_urls():
    """A URL that was never swapped is reported as unknown when reverting."""
    url_swapper = UrlSwapper(TokenExtractor("de_CMC"))
    additional_url = "http://other-url.org"
    swapped_text = url_swapper.swap_urls("This is a text with URL: http://may.la.")
    text_with_unknown_url = f"{swapped_text} {additional_url}"
    _, unknown_urls = url_swapper.reverse_swap_urls(text_with_unknown_url)
    assert len(unknown_urls) == 1
    assert additional_url in unknown_urls