Skip to content

Commit

Permalink
Merge pull request #749 from CamKem/fix/guzzle-issues
Browse files Browse the repository at this point in the history
Fix: Parse URLs via Tokenization
  • Loading branch information
nunomaduro authored Dec 26, 2024
2 parents 9573e05 + c0b3a72 commit 29126de
Show file tree
Hide file tree
Showing 22 changed files with 188 additions and 52 deletions.
15 changes: 9 additions & 6 deletions app/Services/MetaData.php
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,10 @@
namespace App\Services;

use DOMDocument;
use GuzzleHttp\Exception\RequestException;
use GuzzleHttp\Exception\TransferException;
use GuzzleHttp\Psr7\Exception\MalformedUriException;
use Illuminate\Http\Client\ConnectionException;
use Illuminate\Http\Client\HttpClientException;
use Illuminate\Support\Collection;
use Illuminate\Support\Facades\Cache;
use Illuminate\Support\Facades\Http;
Expand Down Expand Up @@ -97,12 +99,13 @@ private function getData(): Collection
$response = Http::get($this->url);

if ($response->ok()) {
$data = $this->parse($response->body());
$data = $this->parse(
$response->body()
);
}
} catch (ConnectionException) {
// Catch but not capture the exception
} catch (RequestException) {
// Catch but not capture the exception
} catch (HttpClientException|MalformedUriException|TransferException) {
// Catch but not capture all base exceptions for:
// Laravel Http Client, Guzzle, and PSR-7
}

return $data;
Expand Down
159 changes: 115 additions & 44 deletions app/Services/ParsableContentProviders/LinkProviderParsable.php
Original file line number Diff line number Diff line change
Expand Up @@ -15,50 +15,121 @@
*/
public function parse(string $content): string
{
return (string) preg_replace_callback(
'/(<(a|code|pre)\s+[^>]*>.*?<\/\2>)|(?<!src=")((https?:\/\/)?((localhost)|((?:\d{1,3}\.){3}\d{1,3})|[\w\-._@:%\+~#=]{1,256}(\.[a-zA-Z]{2,})+)(:\d+)?(\/[\w\-._@:%\+~#=\/]*)?(\?[\w\-._@:%\+~#=\/&]*)?)(?<!\.)((?![^<]*>|[^<>]*<\/))/is',
function (array $matches): string {
if ($matches[1] !== '') {
return $matches[1];
}

$humanUrl = Str::of($matches[0])
->replaceMatches('/^https?:\/\//', '')
->rtrim('/')
->toString();

$isMail = (bool) preg_match('/^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$/', $humanUrl);
$isHttp = Str::startsWith($matches[0], ['http://', 'https://']);

if ((! $isMail) && (! $isHttp)) {
return $matches[0];
}

$url = $isHttp ? $matches[0] : 'https://'.$matches[0];

$url = $isMail ? 'mailto:'.$humanUrl : $url;

$linkHtml = '<a data-navigate-ignore="true" class="text-blue-500 hover:underline hover:text-blue-700 cursor-pointer" target="_blank" href="'.$url.'">'.$humanUrl.'</a>';

if (! $isMail && $url) {
$service = new MetaData($url);
$metadata = $service->fetch();

if ($metadata->isNotEmpty() && ($metadata->has('image') || $metadata->has('html'))) {
$trimmed = trim(
view('components.link-preview-card', [
'data' => $metadata,
'url' => $url,
])->render()
);

return $linkHtml.' '.preg_replace('/<!--(.|\s)*?-->/', '', $trimmed);
}
}

return $linkHtml;
},
str_replace('&amp;', '&', $content)
$tokens = $this->tokenize($content);

if ($tokens === false) {
return $content;
}

$processedTokens = array_map(
fn (string $token): string => $this->processToken($token),
$tokens);

return implode('', $processedTokens);
}

/**
 * Break the content into tokens, cutting at every single whitespace
 * character and at every literal "<br>" tag.
 *
 * The separators are captured, so each one is kept in the result as a
 * token of its own and concatenating the list reproduces the input.
 *
 * @return list<string>|false The tokens in their original order, or false when preg_split() fails.
 */
private function tokenize(string $content): array|false
{
    // The capture group is what makes PREG_SPLIT_DELIM_CAPTURE keep the separators.
    $separatorPattern = '/(\s|<br>)/';

    return preg_split($separatorPattern, $content, flags: PREG_SPLIT_DELIM_CAPTURE);
}

/**
 * Process a single token and convert an email address or a valid
 * http(s) URL into an HTML link.
 *
 * Punctuation attached to either end of the token (brackets, commas,
 * full stops, ...) is kept outside of the generated anchor. Tokens that
 * are neither an email address nor a valid URL are returned untouched.
 * For URLs, a link preview card is appended after the anchor when the
 * fetched metadata contains an "image" or "html" entry.
 *
 * @param  string  $token  One token of the content being parsed.
 * @return string The original token, or the token with its link part replaced by HTML.
 */
private function processToken(string $token): string
{
    $allowableAttachedCharacters = '{([<!,.?;:>)]}';

    $trimmedToken = trim($token, $allowableAttachedCharacters);

    // Nothing left once the punctuation is removed (or the literal "0"): not a link.
    if ($trimmedToken === '' || $trimmedToken === '0') {
        return $token;
    }

    $isMail = filter_var($trimmedToken, FILTER_VALIDATE_EMAIL) !== false;

    if ($isMail) {
        $trimmedToken = "mailto:{$trimmedToken}";
    } elseif (! $this->isValidUrl($trimmedToken)) {
        return $token;
    }

    // The visible label drops the scheme and any trailing slash.
    $humanUrl = Str::of($trimmedToken)
        ->replaceMatches('/^(https?:\/\/|mailto:)/', '')
        ->rtrim('/')
        ->toString();

    // NOTE(review): the token is interpolated into the markup as-is; this presumably relies on
    // the content having been HTML-escaped before it reaches this provider - confirm.
    $linkHtml = "<a data-navigate-ignore=\"true\" class=\"text-blue-500 hover:underline hover:text-blue-700 cursor-pointer\" target=\"_blank\" href=\"{$trimmedToken}\">{$humanUrl}</a>";

    // A "mailto:" link has no page to preview, so the metadata lookup
    // is only performed for real http(s) URLs.
    if (! $isMail) {
        $service = new MetaData($trimmedToken);
        $metadata = $service->fetch();

        if ($metadata->isNotEmpty() && ($metadata->has('image') || $metadata->has('html'))) {
            $trimmedPreviewCard = trim(
                view('components.link-preview-card', [
                    'data' => $metadata,
                    'url' => $trimmedToken,
                ])->render()
            );

            $linkHtml .= $trimmedPreviewCard;
        }
    }

    // Re-attach whatever punctuation surrounded the link in the original token.
    $leading = $this->getCharacters($token, $allowableAttachedCharacters, 'leading');
    $trailing = $this->getCharacters($token, $allowableAttachedCharacters, 'trailing');

    return $leading.$linkHtml.$trailing;
}

/**
 * Extract the run of attached punctuation/characters found at one end
 * of a token.
 *
 * @param  string  $token  The raw token to inspect.
 * @param  string  $allowableCharacters  The characters that may be attached; each one is matched literally.
 * @param  string  $direction  Either 'leading' (start of the token) or 'trailing' (end of the token).
 * @return string The matched run, or an empty string when there is none, when the direction
 *                is not recognised, or when no allowable characters were given.
 */
private function getCharacters(string $token, string $allowableCharacters, string $direction): string
{
    // An empty set would build the invalid character class "[]+" and make
    // preg_match() raise a compilation warning; there is nothing to extract anyway.
    if ($allowableCharacters === '') {
        return '';
    }

    $characterClass = '['.preg_quote($allowableCharacters, '/').']+';

    $pattern = match ($direction) {
        'leading' => '/^('.$characterClass.')/',
        'trailing' => '/('.$characterClass.')$/',
        default => null,
    };

    // An unknown direction must not reach preg_match(): an empty pattern
    // triggers an "Empty regular expression" warning.
    if ($pattern === null) {
        return '';
    }

    return preg_match($pattern, $token, $matches) === 1 ? $matches[1] : '';
}

/**
 * Decide whether a token is a URL that may be turned into a link.
 *
 * The token has to be parsable, pass PHP's URL filter, use the http or
 * https scheme, have a host that is a valid hostname, contain no
 * whitespace, angle brackets, braces or square brackets in its path,
 * query or fragment, carry (if any) a port of one to five digits, and
 * finally match the general URL shape pattern.
 */
private function isValidUrl(string $token): bool
{
    $parts = parse_url($token);

    if ($parts === false) {
        return false;
    }

    if (! filter_var($token, FILTER_VALIDATE_URL)) {
        return false;
    }

    // Only web links are accepted.
    if (! in_array($parts['scheme'] ?? null, ['http', 'https'], true)) {
        return false;
    }

    if (! filter_var($parts['host'] ?? null, FILTER_VALIDATE_DOMAIN, FILTER_FLAG_HOSTNAME)) {
        return false;
    }

    // Reject characters that do not belong in the trailing URL parts.
    foreach (['path', 'query', 'fragment'] as $key) {
        $value = $parts[$key] ?? null;

        if ($value !== null && preg_match('/[\s<>{}[\]]/', $value)) {
            return false;
        }
    }

    // Anything other than a successful match (no match, or a regex failure) is a rejection.
    if (isset($parts['port']) && preg_match('/^\d{1,5}$/', (string) $parts['port']) !== 1) {
        return false;
    }

    $shapeMatches = preg_match(
        '/((https?:\/\/)?((localhost)|((?:\d{1,3}\.){3}\d{1,3})|[\w\-._@:%+~#=]{1,256}(\.[a-zA-Z]{2,})+)(:\d+)?(\/[\w\-._@:%+~#=\/]*)?(\?[\w\-._@:%+~#=\/&]*)?)/i',
        $token
    );

    return (bool) $shapeMatches;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
htt://example.com
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
http://[2001:0db8:85a3:0000:0000:8a2e:0370:7334]
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
http://[2001:0db8:85a3:0000:0000:8a2e:0370:7334]:8080
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
http://.example.com
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
http:///example.com
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
http://exa_mple.com
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
http://example
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
http://example..com
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
http://example=com
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
http://example.com:abcd
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
http://example.com?this<>=that
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
http://example.com?this=that#this<>=that
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
http://example.com/👍
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
http//example.com
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
http:/example.com
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
protocol://example.com
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
www.example.com
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
Sure, here is the link: <a data-navigate-ignore="true" class="text-blue-500 hover:underline hover:text-blue-700 cursor-pointer" target="_blank" href="https://laravel.com">laravel.com</a> <div
Sure, here is the link: <a data-navigate-ignore="true" class="text-blue-500 hover:underline hover:text-blue-700 cursor-pointer" target="_blank" href="https://laravel.com">laravel.com</a><div
id="link-preview-card"
data-url="https://laravel.com"
class="mx-auto mt-2 min-w-full group/preview" data-navigate-ignore="true"
>
<a href="https://laravel.com" target="_blank" rel="noopener noreferrer">
<!--[if BLOCK]><![endif]--> <a href="https://laravel.com" target="_blank" rel="noopener noreferrer">
<div
title="Click to visit: laravel.com"
class="relative w-full bg-slate-100/90 border border-slate-300
Expand All @@ -25,4 +25,5 @@ Sure, here is the link: <a data-navigate-ignore="true" class="text-blue-500 hove
<a href="https://laravel.com" target="_blank" rel="noopener noreferrer"
class="text-xs text-slate-500 group-hover/preview:text-pink-600">From: laravel.com</a>
</div>
<!--[if ENDBLOCK]><![endif]-->
</div>. Let me know if you have any questions.
23 changes: 23 additions & 0 deletions tests/Unit/Services/ContentProvidersTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,29 @@
],
]);

// Runs each malformed-looking link through the link provider and compares
// the parsed output against the stored snapshot for that dataset entry.
test('malformed links are correctly handled by content parser', function (string $content) {
    $parsed = (new App\Services\ParsableContentProviders\LinkProviderParsable())->parse($content);

    expect($parsed)->toMatchSnapshot();
})->with([
    'http://example..com',
    'htt://example.com',
    'protocol://example.com',
    'http//example.com',
    'http://exa_mple.com',
    'http://example',
    'http://.example.com',
    'http://example=com',
    'www.example.com',
    'http:/example.com',
    'http:///example.com',
    'http://example.com?this<>=that',
    'http://example.com?this=that#this<>=that',
    'http://[2001:0db8:85a3:0000:0000:8a2e:0370:7334]',
    'http://[2001:0db8:85a3:0000:0000:8a2e:0370:7334]:8080',
    'http://example.com:abcd',
    'http://example.com/👍',
]);

test('only http or https urls are converted to links', function (string $content, string $parsed) {
$provider = new App\Services\ParsableContentProviders\LinkProviderParsable();

Expand Down
21 changes: 21 additions & 0 deletions tests/Unit/Services/MetaDataTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,11 @@
declare(strict_types=1);

use App\Services\MetaData;
use GuzzleHttp\Exception\TransferException;
use GuzzleHttp\Promise\RejectedPromise;
use GuzzleHttp\Psr7\Exception\MalformedUriException;
use Illuminate\Http\Client\ConnectionException;
use Illuminate\Http\Client\HttpClientException;
use Illuminate\Http\UploadedFile;
use Illuminate\Support\Facades\Cache;
use Illuminate\Support\Facades\Http;
Expand Down Expand Up @@ -261,3 +264,21 @@
expect(round(MetaData::CARD_WIDTH / MetaData::CARD_HEIGHT, 2))
->toBe(round(16 / 9, 2));
});

// For every exception type in the dataset, a request rejected with that
// exception must leave the metadata service returning an empty collection.
it('handles all exceptions', function (Exception $exception) {
    $url = 'https://laravel.com';

    // Make the faked HTTP client reject the request with the given exception.
    Http::fake([
        $url => fn ($request) => new RejectedPromise($exception),
    ]);

    $metadata = (new MetaData($url))->fetch();

    expect($metadata->isEmpty())->toBeTrue();
})->with([
    new ConnectionException('Connection error'),
    new MalformedUriException('Malformed URI'),
    new HttpClientException('Not Found'),
    new TransferException('Transfer error'),
]);

0 comments on commit 29126de

Please sign in to comment.