-
-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Refactor scraper to make it find stuff, no matter how badly it's formatted
- Loading branch information
Showing
6 changed files
with
973 additions
and
101 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
<?php declare(strict_types=1); | ||
|
||
/** | ||
* @license http://opensource.org/licenses/mit-license.php MIT | ||
* @link https://github.com/nicoSWD | ||
* @author Nicolas Oelgart <[email protected]> | ||
*/ | ||
namespace nicoSWD\IfscCalendar\Domain\Event\Helpers; | ||
|
||
use DOMDocument; | ||
use DOMNodeList; | ||
use DOMXPath; | ||
|
||
/**
 * Helpers for turning scraped event-page HTML into a queryable DOM and for
 * extracting the event paragraphs and the poster image URL from it.
 */
final readonly class DOMHelper
{
    private const XPATH_PARAGRAPHS = "//*[@id='ifsc_event']/div/div/div[@class='text']/p";

    private const POSTER_IMAGE_PREFIX = 'https://cdn.ifsc-climbing.org/images/Events/';

    private const XPATH_SIDEBAR = "//div[@class='text2']";

    /**
     * Parses (normalized) HTML into a DOM and returns an XPath handle on it.
     *
     * libxml parse errors are collected internally instead of being emitted as
     * PHP warnings; the buffer is cleared afterwards and the previous libxml
     * error mode is restored even when loading throws.
     *
     * @param string $html Raw HTML. An empty string yields an XPath over an
     *                     empty document rather than a ValueError.
     */
    public function htmlToDom(string $html): DOMXPath
    {
        $normalizedHtml = $this->normalizeHtml($html);
        $dom = new DOMDocument();

        $previousErrorMode = libxml_use_internal_errors(true);

        try {
            // DOMDocument::loadHTML() throws a ValueError on an empty source
            if ($normalizedHtml !== '') {
                $dom->loadHTML($normalizedHtml);
            }
        } finally {
            // Discard the buffered parse errors so they don't pile up across calls
            libxml_clear_errors();
            libxml_use_internal_errors($previousErrorMode);
        }

        return new DOMXPath($dom);
    }

    /**
     * Returns all `<p>` nodes matched by the XPATH_PARAGRAPHS expression.
     */
    public function getParagraphs(DOMXPath $xpath): DOMNodeList
    {
        return $xpath->query(self::XPATH_PARAGRAPHS);
    }

    /**
     * Returns the poster URL found in the sidebar, or an empty string.
     *
     * Only the first node matching XPATH_SIDEBAR is inspected. The result is the
     * `data-src` value of the first `<img>` inside it whose value starts with
     * POSTER_IMAGE_PREFIX.
     */
    public function getPoster(DOMXPath $xpath): string
    {
        $sideBar = $xpath->query(self::XPATH_SIDEBAR)->item(0);

        if ($sideBar === null) {
            return '';
        }

        foreach ($sideBar->getElementsByTagName('img') as $image) {
            // getAttribute() returns '' when the attribute is missing, which never matches the prefix
            $source = $image->getAttribute('data-src');

            if (str_starts_with($source, self::POSTER_IMAGE_PREFIX)) {
                return $source;
            }
        }

        return '';
    }

    /**
     * Rewrites the HTML so that its text content is easier to parse line by line.
     *
     * If a PCRE call fails (preg_replace() returns null, e.g. when a backtrack
     * or JIT stack limit is hit), the text from before that step is kept instead
     * of propagating null into a `string` return type.
     */
    public function normalizeHtml(string $html): string
    {
        // This makes `textContent` display each event on a new line, which is easier to parse
        $withLineBreaks = preg_replace('~<br\s*/?>~i', "<br/>\n", $html) ?? $html;

        // This replaces named links with just their bare URL
        return preg_replace(
            '~<a[\s\r\n]+href=\s*(")?([\w:\-./\?=]+)\s*[^>]*>(.*?)</a>~s',
            '$2',
            $withLineBreaks,
        ) ?? $withLineBreaks;
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
<?php declare(strict_types=1); | ||
|
||
/** | ||
* @license http://opensource.org/licenses/mit-license.php MIT | ||
* @link https://github.com/nicoSWD | ||
* @author Nicolas Oelgart <[email protected]> | ||
*/ | ||
namespace nicoSWD\IfscCalendar\Domain\Event\Helpers; | ||
|
||
/**
 * Small string clean-up helpers applied to scraped event text.
 */
final readonly class Normalizer
{
    /** Time assumed while the real start time is still unannounced. */
    private const FALLBACK_TIME = '8:00';

    /** Matches runs of characters outside the allowed word/space/punctuation set. */
    private const DISALLOWED_CHARACTERS = '~[^\w\s\'\r\n:,-\./\?=]+~';

    /**
     * Lower-cases the league name, then upper-cases the first letter of each word.
     */
    public function leagueName(string $league): string
    {
        $lowerCased = strtolower($league);

        return ucwords($lowerCased);
    }

    /**
     * Swaps the "TBC"/"TBD" placeholders for a fixed default time.
     *
     * The exact time isn't known yet in that case; the value will update by
     * itself once IFSC publishes the real one. Anything else is returned as is.
     */
    public function normalizeTime(string $time): string
    {
        return match ($time) {
            'TBC', 'TBD' => self::FALLBACK_TIME,
            default => $time,
        };
    }

    /**
     * Splits text on CR/LF runs and returns only the non-empty pieces.
     */
    public function nonEmptyLines(string $matches): array
    {
        $lines = preg_split("~[\r\n]+~", $matches, -1, PREG_SPLIT_NO_EMPTY);

        return $lines;
    }

    /**
     * Replaces every run of disallowed characters with a single space.
     */
    public function removeNonAsciiCharacters(string $text): string
    {
        $cleaned = preg_replace(self::DISALLOWED_CHARACTERS, ' ', $text);

        return $cleaned;
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.