Skip to content

Commit 0d5a461

Browse files
feat: Always expand URLs, to avoid losing some links (#70)
Co-authored-by: Daniel Martin Gonzalez <[email protected]>
1 parent 21fcd54 commit 0d5a461

File tree

2 files changed

+14
-25
lines changed

2 files changed

+14
-25
lines changed

botaffiumeiro.py

+3-16
Original file line numberDiff line numberDiff line change
@@ -21,11 +21,9 @@
2121
load_configuration,
2222
)
2323

24-
SHORT_URL_DOMAINS = ["amzn.to", "amzn.eu", "s.click.aliexpress.com", "bit.ly", "tinyurl.com"]
2524
DOMAIN_PATTERNS = {
2625
"aliexpress": ALIEXPRESS_PATTERN,
2726
}
28-
# "aliexpress_short_url_pattern": r"https?://s\.click\.aliexpress\.com/e/[\w\d_]+",
2927

3028
logging.basicConfig(
3129
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
@@ -87,14 +85,6 @@ def extract_embedded_url(query_params):
8785
return embedded_domains
8886

8987

90-
def is_short_url(url: str) -> bool:
91-
"""
92-
Checks if the given URL belongs to a known short URL domain.
93-
"""
94-
parsed_url = urlparse(url)
95-
return parsed_url.netloc in SHORT_URL_DOMAINS
96-
97-
9888
def extract_domains_from_message(message_text: str) -> Tuple[set, str]:
9989
"""
10090
Extracts domains from a message using domain patterns and searches for embedded URLs.
@@ -115,12 +105,9 @@ def extract_domains_from_message(message_text: str) -> Tuple[set, str]:
115105

116106
for url in urls_in_message:
117107
# Expand every URL, not only those from known short-URL domains
118-
if is_short_url(url):
119-
expanded_url = expand_shortened_url(url)
120-
# Replace the short URL with the expanded URL in the message text
121-
message_text = message_text.replace(url, expanded_url)
122-
else:
123-
expanded_url = url
108+
expanded_url = expand_shortened_url(url)
109+
# Replace the short URL with the expanded URL in the message text
110+
message_text = message_text.replace(url, expanded_url)
124111

125112
# Now extract the domain from the expanded URL
126113
parsed_url = urlparse(expanded_url)

tests/test_botaffiumeiro.py

+11-9
Original file line numberDiff line numberDiff line change
@@ -370,6 +370,7 @@ def test_mixed_full_and_shortened_urls(self, mock_expand):
370370

371371
# Simulate the expansion of the shortened URLs
372372
mock_expand.side_effect = [
373+
"https://www.amazon.com/dp/product123", # Long URL link, expands as itself
373374
"https://www.amazon.com/dp/product456", # Expanded URL for amzn.to
374375
"https://www.aliexpress.com/item/1005001234567890.html", # Expanded URL for aliexpress shortened link
375376
]
@@ -378,6 +379,7 @@ def test_mixed_full_and_shortened_urls(self, mock_expand):
378379
domains, modified_message = extract_domains_from_message(message_text)
379380

380381
# Check that expand_shortened_url was called with each of the three URLs (long and shortened)
382+
mock_expand.assert_any_call("https://www.amazon.com/dp/product123")
381383
mock_expand.assert_any_call("https://amzn.to/abc123")
382384
mock_expand.assert_any_call("https://s.click.aliexpress.com/e/buyproduct")
383385

@@ -501,24 +503,24 @@ def test_extract_domains_with_long_urls(self):
501503
"""
502504
Test: Extract domains from long Amazon and AliExpress URLs.
503505
"""
504-
# Texto con URLs largas ya expandidas
506+
# Text with long URLs already expanded
505507
message_text = (
506508
"Check out this Amazon deal: https://www.amazon.com/dp/B08XYZ123 "
507-
"and this AliExpress: https://www.aliexpress.com/item/12345.html"
509+
"and this AliExpress: https://es.aliexpress.com/item/12345.html"  # We use a localized URL because always expanding can turn a generic URL into a localized one
508510
)
509511

510-
# Llama a la función que procesa el mensaje
512+
# Call the function that processes the message
511513
domains, modified_message = extract_domains_from_message(message_text)
512514

513-
# Verifica que los dominios correctos fueron extraídos
514-
self.assertIn("amazon.com", domains) # Debería encontrar amazon.com
515-
self.assertIn("aliexpress.com", domains) # Debería encontrar aliexpress.com
515+
# Verify that the correct domains were extracted
516+
self.assertIn("amazon.com", domains) # Should find amazon.com
517+
self.assertIn("aliexpress.com", domains) # Should find aliexpress.com
516518

517-
# Verifica que las URLs completas estén presentes en el mensaje modificado
519+
# Verify that the full URLs are present in the modified message
518520
self.assertIn("https://www.amazon.com/dp/B08XYZ123", modified_message)
519-
self.assertIn("https://www.aliexpress.com/item/12345.html", modified_message)
521+
self.assertIn("aliexpress.com/item/12345.html", modified_message) # Should find aliexpress.com (not checking exact subdomain, as it may expand to different regions)
520522

521-
# Asegúrate de que no hubo modificaciones innecesarias
523+
# Ensure there were no unnecessary modifications
522524
self.assertEqual(message_text, modified_message)
523525

524526

0 commit comments

Comments
 (0)