Skip to content

Commit

Permalink
Add URL swap tool. (#91)
Browse files Browse the repository at this point in the history
* add UrlSwapper

* fix typo

* add test
  • Loading branch information
PhilipMay authored Sep 4, 2023
1 parent 198fe30 commit db18e32
Show file tree
Hide file tree
Showing 2 changed files with 91 additions and 2 deletions.
45 changes: 44 additions & 1 deletion mltb2/somajo.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

from abc import ABC
from dataclasses import dataclass, field
from typing import Container, Iterable, List, Optional, Set, Union
from typing import Container, Dict, Iterable, List, Optional, Set, Tuple, Union

from somajo import SoMaJo
from tqdm import tqdm
Expand Down Expand Up @@ -188,3 +188,46 @@ def extract_url_set(self, text: Union[Iterable, str]) -> Set[str]:
sentences = self.somajo.tokenize_text(text)
result = extract_token_class_set(sentences, keep_token_classes="URL")
return result


@dataclass
class UrlSwapper:
    """Tool to swap (and reverse swap) links with a numbered replacement link.

    Real URLs are remembered across calls, so the same URL always maps to the
    same replacement link for the lifetime of one ``UrlSwapper`` instance.

    Args:
        token_extractor: The sentence token extractor to be used.
        url_pattern: The pattern to use for replacement. One ``{}`` marks the place where to put the number.
    """

    token_extractor: TokenExtractor
    url_pattern: str = "https://link-{}.com"
    _url_map: Dict[str, str] = field(init=False, repr=False)  # map from real url to swapped url

    def __post_init__(self):
        """Do post init."""
        self._url_map = {}

    @staticmethod
    def _replace_all(text: str, replacements: Dict[str, str]) -> str:
        """Replace every occurrence of each key of ``replacements`` in one single pass.

        A single pass (instead of one ``str.replace`` call per key) guarantees that
        text inserted by one replacement is never touched by another one, and that a
        key which is a substring of a longer key can not corrupt the longer key.

        Args:
            text: The text to replace in.
            replacements: Map from the string to search for to its replacement.

        Returns:
            The text with all replacements applied.
        """
        import re  # local import: keeps this block independent of the module level import list

        if not replacements:
            # an empty alternation would match the empty string at every position
            return text

        # The regex engine takes the first alternative that matches at a position,
        # so the longest candidates must come first.
        candidates = sorted(replacements, key=len, reverse=True)
        pattern = re.compile("|".join(re.escape(candidate) for candidate in candidates))
        return pattern.sub(lambda match: replacements[match.group(0)], text)

    def swap_urls(self, text: str) -> str:
        """Swap the urls of the text.

        Unknown URLs get the next free number. New numbers are given in the order
        of the first position of the URL in the text, which makes the result
        independent of the iteration order of the extracted URL set.

        Args:
            text: The text which contains the real URLs.

        Returns:
            The text where the URLs are replaced by the numbered replacement links.
        """
        url_set = self.token_extractor.extract_url_set(text)
        for url in sorted(url_set, key=lambda candidate: (text.find(candidate), candidate)):
            if url not in self._url_map:  # if url is unknown: add it
                self._url_map[url] = self.url_pattern.format(len(self._url_map) + 1)
        return self._replace_all(text, {url: self._url_map[url] for url in url_set})

    def reverse_swap_urls(self, text: str) -> Tuple[str, Set[str]]:
        """Revert the url swap.

        Args:
            text: The text which contains the swapped URLs.

        Returns:
            The reverted text and a ``set`` of URLs that were unknown by the ``UrlSwapper``.
        """
        reverse_url_map = {v: k for k, v in self._url_map.items()}  # map from swapped url to real url
        url_set = self.token_extractor.extract_url_set(text)
        known_urls = {url: reverse_url_map[url] for url in url_set if url in reverse_url_map}
        no_reverse_swap_urls = {url for url in url_set if url not in reverse_url_map}
        return self._replace_all(text, known_urls), no_reverse_swap_urls
48 changes: 47 additions & 1 deletion tests/test_somajo.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,17 @@

from math import isclose

import pytest
from somajo import SoMaJo

from mltb2.somajo import JaccardSimilarity, SoMaJoSentenceSplitter, TokenExtractor, detokenize, extract_token_class_set
from mltb2.somajo import (
JaccardSimilarity,
SoMaJoSentenceSplitter,
TokenExtractor,
UrlSwapper,
detokenize,
extract_token_class_set,
)


def test_SoMaJoSentenceSplitter_call() -> None:
Expand Down Expand Up @@ -127,3 +135,41 @@ def test_detokenize():

assert isinstance(result, str)
assert result == "Das ist ein Satz."


def test_UrlSwapper_swap_urls():
    """Check that a single URL is replaced by the first numbered replacement link."""
    swapper = UrlSwapper(TokenExtractor("de_CMC"))
    expected_text = "This is a text with URL: https://link-1.com."
    swapped_text = swapper.swap_urls("This is a text with URL: http://may.la.")
    assert swapped_text == expected_text


@pytest.mark.parametrize(
    "text_with_url",
    [
        "This is a text with URL: http://may.la.",
        "This is a text with Markdown URL: [Philip May](http://may.la).",
        "2 MD URL s: [Philip May](http://may.la). [other link](https://github.com/telekom/mltb2#installation)",
    ],
)
def test_UrlSwapper__is_reversible(text_with_url: str):
    """Check that a swap followed by a reverse swap gives back the original text."""
    swapper = UrlSwapper(TokenExtractor("de_CMC"))
    swapped_text = swapper.swap_urls(text_with_url)
    restored_text, unknown_urls = swapper.reverse_swap_urls(swapped_text)
    assert restored_text == text_with_url
    assert len(unknown_urls) == 0


def test_UrlSwapper__no_reverse_swap_urls():
    """Check that a URL which was never swapped is reported by the reverse swap."""
    swapper = UrlSwapper(TokenExtractor("de_CMC"))
    additional_url = "http://other-url.org"
    swapped_text = swapper.swap_urls("This is a text with URL: http://may.la.")
    # append a URL the swapper has never seen
    text_with_unknown_url = f"{swapped_text} {additional_url}"
    _, unknown_urls = swapper.reverse_swap_urls(text_with_unknown_url)
    assert len(unknown_urls) == 1
    assert additional_url in unknown_urls

0 comments on commit db18e32

Please sign in to comment.