-
Notifications
You must be signed in to change notification settings - Fork 65
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #588 from TomCan/url-transformer
Add UrlTransformerService and integrate in LinkifyExtension
- Loading branch information
Showing
4 changed files
with
218 additions
and
15 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,153 @@ | ||
<?php | ||
|
||
namespace App\Service; | ||
|
||
/**
 * Extracts URLs from text and rewrites URLs of known web shops into affiliate links.
 *
 * Partner ids are read once, at construction time, from $_ENV entries whose name
 * starts with "PARTNER_" (e.g. PARTNER_AMAZON, PARTNER_BOL, PARTNER_AWIN).
 */
class UrlTransformerService
{
    /**
     * Known host formats.
     *
     * Key is a regex matched against the (lower-cased) host portion of a URL as returned by parse_url().
     * Value is "<type>[,parameters...]" where type denotes the affiliate program that selects the
     * transformation logic in transformUrl().
     *
     * @var array<string, string>
     */
    private array $hostFormats = [
        '/^(www\.)?amazon\.(com|co\.(jp|uk|za)|com\.(au|be|br|mx|tr)|ae|ca|cn|de|eg|es|fr|ie|in|it|nl|pl|sa|se|sg)$/' => 'amazon',
        '/^(www\.)?bol\.com$/' => 'bol',
        '/^(www\.)?coolblue\.be$/' => 'awin,85165',
    ];

    /**
     * Partner ids per affiliate program, keyed by the lower-cased program name.
     *
     * @var array<string, list<string>>
     */
    private array $partnerIds = [];

    /**
     * Collects all partner ids from $_ENV.
     *
     * Every "PARTNER_<PROGRAM>" entry is stored under strtolower(<PROGRAM>).
     * Multiple ids are supported by separating them with spaces.
     */
    public function __construct()
    {
        foreach ($_ENV as $key => $value) {
            $key = (string) $key;
            if (0 !== strpos($key, 'PARTNER_') || !is_scalar($value)) {
                continue;
            }

            // array_filter() preserves keys, so leading or repeated spaces would leave index 0 unset;
            // array_values() re-indexes the result into a proper list.
            $this->partnerIds[strtolower(substr($key, 8))] = array_values(
                array_filter(explode(' ', (string) $value))
            );
        }
    }

    /**
     * Finds all distinct http(s) URLs in a text.
     *
     * @return list<string> unique URLs in order of first appearance
     */
    public function extractUrls(string $text): array
    {
        $pattern = '#\bhttps?://[^,\s()<>]+(?:\([\w\d]+\)|([^,[:punct:]\s]|/))#';
        if (!preg_match_all($pattern, $text, $matches)) {
            return [];
        }

        // array_unique() keeps the original keys; re-index so callers get a gap-free list
        return array_values(array_unique($matches[0]));
    }

    /**
     * Replace all urls in input array (as key) with its replacements (values).
     *
     * str_replace() is not suitable because it could replace occurrences that result from previous
     * replacements, or urls that are child/parent urls of other urls that need to be replaced.
     * strtr() with an array scans the text once, always prefers the longest matching key and never
     * revisits text it has already replaced, which is exactly the required behaviour.
     *
     * @param array<string, string> $urls original url => replacement
     */
    public function replaceUrls(string $text, array $urls): string
    {
        // an empty search string can never be a meaningful url and is not accepted by strtr()
        unset($urls['']);

        if ([] === $urls) {
            return $text;
        }

        return strtr($text, $urls);
    }

    /**
     * Rewrites a URL into an affiliate link when its host belongs to a known program
     * and a partner id is configured for that program.
     *
     * When several partner ids are configured, one is picked at random per call.
     *
     * @return string the affiliate URL, or the unchanged input when no transformation applies
     */
    public function transformUrl(string $url): string
    {
        // parse URL into parts; malformed URLs and URLs without a host are left untouched
        $urlParts = parse_url($url);
        if (!is_array($urlParts) || !isset($urlParts['host'])) {
            return $url;
        }

        // host names are case-insensitive, the patterns are written in lower case
        $matchedFormat = $this->findHostFormat(strtolower($urlParts['host']));
        if (null === $matchedFormat) {
            return $url;
        }

        // split by comma. Shift first element off and use as key to identify type of link.
        $params = explode(',', $matchedFormat);
        $key = array_shift($params);

        // have we configured a partner id for this program?
        $partnerId = $this->pickPartnerId($key);
        if (null === $partnerId) {
            return $url;
        }

        switch ($key) {
            case 'amazon':
                // append id as tag parameter
                return $this->appendQueryParameter($url, 'tag', $partnerId);

            case 'bol':
                // generate text link to partner program and append original URL encoded
                return 'https://partner.bol.com/click/click?p=1&t=url&s='.$partnerId.'&f=TXL&url='.urlencode($url);

            case 'tradetracker':
                // params[0] should contain campaignid, append path (and query) of the original URL encoded
                $target = $urlParts['path'] ?? '';
                if (isset($urlParts['query'])) {
                    $target .= '?'.$urlParts['query'];
                }

                return 'https://tc.tradetracker.net/?c='.($params[0] ?? '').'&m=12&a='.$partnerId.'&r=&u='.urlencode($target);

            case 'awin':
                // params[0] should contain merchantid, append original URL encoded
                return 'https://www.awin1.com/cread.php?awinmid='.($params[0] ?? '').'&awinaffid='.$partnerId.'&ued='.urlencode($url);

            default:
                // No matching format
                return $url;
        }
    }

    /**
     * Returns the format string of the first host pattern matching $host, or null when none matches.
     */
    private function findHostFormat(string $host): ?string
    {
        foreach ($this->hostFormats as $pattern => $format) {
            if (1 === preg_match($pattern, $host)) {
                return $format;
            }
        }

        return null;
    }

    /**
     * Returns one of the configured partner ids for $program (random when several), or null when none is configured.
     */
    private function pickPartnerId(string $program): ?string
    {
        $ids = $this->partnerIds[$program] ?? [];
        if ([] === $ids) {
            return null;
        }

        return (string) $ids[array_rand($ids)];
    }

    /**
     * Appends "name=value" to the query string of $url, keeping a trailing #fragment at the end.
     */
    private function appendQueryParameter(string $url, string $name, string $value): string
    {
        // the fragment must stay last, otherwise the parameter would become part of it
        $fragment = '';
        $hashPosition = strpos($url, '#');
        if (false !== $hashPosition) {
            $fragment = substr($url, $hashPosition);
            $url = substr($url, 0, $hashPosition);
        }

        if (false === strpos($url, '?')) {
            $separator = '?';
        } elseif ('?' === substr($url, -1) || '&' === substr($url, -1)) {
            // query already ends with a delimiter
            $separator = '';
        } else {
            $separator = '&';
        }

        return $url.$separator.$name.'='.$value.$fragment;
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
<?php | ||
|
||
namespace App\Tests\Unit\Service; | ||
|
||
use App\Service\UrlTransformerService; | ||
use PHPUnit\Framework\TestCase; | ||
|
||
/**
 * Unit tests for UrlTransformerService: url extraction, affiliate transformation and url replacement.
 */
class UrlTransformerTest extends TestCase
{
    /**
     * A single url in the text is extracted as-is.
     */
    public function testUrlExtraction(): void
    {
        $urlTransformer = new UrlTransformerService();

        $urls = $urlTransformer->extractUrls('https://www.test.com');
        $this->assertCount(1, $urls);
        $this->assertSame('https://www.test.com', $urls[0]);
    }

    /**
     * Amazon urls get the configured partner id appended as "tag" parameter,
     * using "?" or "&" depending on whether a query string is already present.
     */
    public function testAmazonUrlExtraction(): void
    {
        // The service reads $_ENV in its constructor; remember the current value so that
        // this test does not leak configuration into other tests.
        $previous = $_ENV['PARTNER_AMAZON'] ?? null;
        $_ENV['PARTNER_AMAZON'] = 'abc-123';

        try {
            $urlTransformer = new UrlTransformerService();

            $url = $urlTransformer->transformUrl('https://www.amazon.com/Zmart-Funny-Christmas-Coworkers-Secret/dp/B0CC1S12S3');
            $this->assertSame('https://www.amazon.com/Zmart-Funny-Christmas-Coworkers-Secret/dp/B0CC1S12S3?tag=abc-123', $url);

            $url = $urlTransformer->transformUrl('https://www.amazon.com/Zmart-Funny-Christmas-Coworkers-Secret/dp/B0CC1S12S3?crid=123456789');
            $this->assertSame('https://www.amazon.com/Zmart-Funny-Christmas-Coworkers-Secret/dp/B0CC1S12S3?crid=123456789&tag=abc-123', $url);
        } finally {
            if (null === $previous) {
                unset($_ENV['PARTNER_AMAZON']);
            } else {
                $_ENV['PARTNER_AMAZON'] = $previous;
            }
        }
    }

    /**
     * Replacements must honour parent/child urls and must not touch text inserted by earlier replacements.
     */
    public function testurlReplacement(): void
    {
        $urlTransformer = new UrlTransformerService();

        $html = 'Multiple links to https://www.tom.be, again https://www.tom.be and a child https://www.tom.be/zeb and a child https://www.tom.be/arne';
        $replacements = [
            'https://www.tom.be' => 'https://www.tom.be/zeb',
            'https://www.tom.be/zeb' => 'https://www.tom.be/Zeb',
            'https://www.tom.be/arne' => 'https://www.tom.be/Arne',
        ];
        // It should not replace the /zeb instances that are the result of the replacements of https://www.tom.be.
        // It should also not replace https://www.tom.be in any of the other child urls, but instead replace them with the capital names.
        $expectedHtml = 'Multiple links to https://www.tom.be/zeb, again https://www.tom.be/zeb and a child https://www.tom.be/Zeb and a child https://www.tom.be/Arne';

        $actual = $urlTransformer->replaceUrls($html, $replacements);
        $this->assertSame($expectedHtml, $actual);
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters