From 89c66c2da45de3219a6c36206ca58ebab9020ec1 Mon Sep 17 00:00:00 2001 From: Nico Date: Sat, 6 May 2023 01:00:51 +0200 Subject: [PATCH] Refactor scraper to make it find stuff, no matter how bad it's formatted --- app/config/services.yml | 8 + src/Domain/Event/Helpers/DOMHelper.php | 71 ++ src/Domain/Event/Helpers/Normalizer.php | 37 + src/Domain/Event/IFSCEventsScraper.php | 139 +--- tests/html/jakata_2023_malformed.html | 784 ++++++++++++++++++ .../Domain/Event/IFSCEventsScraperTest.php | 35 +- 6 files changed, 973 insertions(+), 101 deletions(-) create mode 100644 src/Domain/Event/Helpers/DOMHelper.php create mode 100644 src/Domain/Event/Helpers/Normalizer.php create mode 100644 tests/html/jakata_2023_malformed.html diff --git a/app/config/services.yml b/app/config/services.yml index 5b93367..ad63ed0 100644 --- a/app/config/services.yml +++ b/app/config/services.yml @@ -86,6 +86,14 @@ services: class: nicoSWD\IfscCalendar\Domain\Event\IFSCEventsScraper arguments: - '@nicoSWD\IfscCalendar\Infrastructure\HttpClient\HttpGuzzleClient' + - '@nicoSWD\IfscCalendar\Domain\Event\Helpers\DOMHelper' + - '@nicoSWD\IfscCalendar\Domain\Event\Helpers\Normalizer' + + nicoSWD\IfscCalendar\Domain\Event\Helpers\DOMHelper: + class: nicoSWD\IfscCalendar\Domain\Event\Helpers\DOMHelper + + nicoSWD\IfscCalendar\Domain\Event\Helpers\Normalizer: + class: nicoSWD\IfscCalendar\Domain\Event\Helpers\Normalizer # HTTP Client nicoSWD\IfscCalendar\Infrastructure\HttpClient\HttpGuzzleClient: diff --git a/src/Domain/Event/Helpers/DOMHelper.php b/src/Domain/Event/Helpers/DOMHelper.php new file mode 100644 index 0000000..3fe7c16 --- /dev/null +++ b/src/Domain/Event/Helpers/DOMHelper.php @@ -0,0 +1,71 @@ + + */ +namespace nicoSWD\IfscCalendar\Domain\Event\Helpers; + +use DOMDocument; +use DOMNodeList; +use DOMXPath; + +final readonly class DOMHelper +{ + private const XPATH_PARAGRAPHS = "//*[@id='ifsc_event']/div/div/div[@class='text']/p"; + + private const POSTER_IMAGE_PREFIX = 'https://cdn.ifsc-climbing.org/images/Events/'; + + private const XPATH_SIDEBAR = "//div[@class='text2']"; + + public function htmlToDom(string $html): DOMXPath + { + $lastValue = libxml_use_internal_errors(true); + + $dom = new DOMDocument(); + $dom->loadHTML($this->normalizeHtml($html)); + + libxml_use_internal_errors($lastValue); + + return new DOMXPath($dom); + } + + public function getParagraphs(DOMXPath $xpath): DOMNodeList + { + return $xpath->query(self::XPATH_PARAGRAPHS); + } + + public function getPoster(DOMXPath $xpath): string + { + $sideBar = $xpath->query(self::XPATH_SIDEBAR)->item(0); + + if (!$sideBar) { + return ''; + } + + $images = $sideBar->getElementsByTagName('img'); + + if (!is_iterable($images)) { + return ''; + } + + foreach ($images as $image) { + foreach ($image->attributes as $name => $attribute) { + if ($name === 'data-src' && str_starts_with($attribute->textContent, self::POSTER_IMAGE_PREFIX)) { + return (string) $attribute->textContent; + } + } + } + + return ''; + } + + public function normalizeHtml(string $html): string + { + // This makes `textContent` to display each event in a new line, and thereby easier to parse + $html = preg_replace('~~i', "
\n", $html); + // This replaces named links with just their blank URL + return preg_replace('~]*>(.*?)~s', '$2', $html); + } +} diff --git a/src/Domain/Event/Helpers/Normalizer.php b/src/Domain/Event/Helpers/Normalizer.php new file mode 100644 index 0000000..9ab65bc --- /dev/null +++ b/src/Domain/Event/Helpers/Normalizer.php @@ -0,0 +1,37 @@ + + */ +namespace nicoSWD\IfscCalendar\Domain\Event\Helpers; + +final readonly class Normalizer +{ + public function leagueName(string $league): string + { + return ucwords(strtolower($league)); + } + + public function normalizeTime(string $time): string + { + if (in_array($time, ['TBC', 'TBD'], strict: true)) { + // We don't know the exact time yet. We'll set it to 8:00 for now + // as it will automatically update once IFSC sets it + $time = '8:00'; + } + + return $time; + } + + public function nonEmptyLines(string $matches): array + { + return preg_split("~[\r\n]+~", $matches, flags: PREG_SPLIT_NO_EMPTY); + } + + public function removeNonAsciiCharacters(string $text): string + { + return preg_replace('~[^\w\s\'\r\n:,-\./\?=]+~', ' ', $text); + } +} diff --git a/src/Domain/Event/IFSCEventsScraper.php b/src/Domain/Event/IFSCEventsScraper.php index fa10737..2057af6 100644 --- a/src/Domain/Event/IFSCEventsScraper.php +++ b/src/Domain/Event/IFSCEventsScraper.php @@ -10,25 +10,19 @@ use DateTime; use DateTimeImmutable; use DateTimeZone; -use DOMDocument; -use DOMElement; -use DOMNode; -use DOMNodeList; use DOMXPath; +use nicoSWD\IfscCalendar\Domain\Event\Helpers\DOMHelper; +use nicoSWD\IfscCalendar\Domain\Event\Helpers\Normalizer; use nicoSWD\IfscCalendar\Domain\HttpClient\HttpClientInterface; final readonly class IFSCEventsScraper { - private const XPATH_PARAGRAPHS = "//*[@id='ifsc_event']/div/div/div[@class='text']/p"; - - private const XPATH_SIDEBAR = "//div[@class='text2']"; - private const IFSC_EVENT_PAGE_URL = 'https://www.ifsc-climbing.org/component/ifsc/?view=event&WetId=%d'; - private const POSTER_IMAGE_PREFIX = 'https://cdn.ifsc-climbing.org/images/Events/'; - public function __construct( private HttpClientInterface $client, + private DOMHelper $domHelper, + private Normalizer $normalizer, ) { } @@ -36,29 +30,31 @@ public function __construct( public function fetchEventsForLeague(int $season, int $eventId, string $timezone, string $eventName): array { $xpath = $this->getXPathForEventsWithId($eventId); - $paragraphs = $this->getParagraphs($xpath); $dateRegex = $this->buildDateRegex(); $schedules = []; - foreach ($paragraphs as $paragraph) { - if (preg_match($dateRegex, trim($paragraph->nodeValue), matches: $date)) { - foreach ($paragraph->getElementsByTagName('em') as $span) { - $currentEventName = $this->trim($span->nextSibling->nodeValue); - $time = $this->parseTimeFromSpan($span, $eventId); + foreach ($this->domHelper->getParagraphs($xpath) as $paragraph) { + if (!preg_match_all($dateRegex, $this->normalizer->removeNonAsciiCharacters($paragraph->textContent), $matches)) { + continue; + } + + foreach ($matches['day'] as $key => $match) { + foreach ($this->normalizer->nonEmptyLines($matches['times'][$key]) as $line) { + [$eventTime, $eventName2, $link] = $this->parseTimeAndName($line); $schedules[] = IFSCSchedule::create( - day: (int) $date['day'], - month: Month::fromName($date['month']), - time: $time, + day: (int) $matches['day'][$key], + month: Month::fromName($matches['month'][$key]), + time: $this->normalizer->normalizeTime($eventTime), season: $season, - league: $this->leagueName($currentEventName), - url: $this->getEventUrl($span->parentNode), + league: $this->normalizer->leagueName($eventName2), + url: $link, ); } } } - $poster = $this->getPoster($xpath); + $poster = $this->domHelper->getPoster($xpath); $events = []; foreach ($schedules as $schedule) { @@ -79,60 +75,11 @@ public function fetchEventsForLeague(int $season, int $eventId, string $timezone return $events; } - private function getEventUrl(DOMNode $span): string - { - $links = $span->getElementsByTagName('a'); - - if ($links->length > 0) { - $url = (string) $links->item(0)->getAttribute('href'); - } else { - $url = ''; - } - - return $url; - } - private function getXPathForEventsWithId(int $eventId): DOMXPath { - $htmlResponse = $this->client->get($this->buildLeagueUri($eventId)); - $lastValue = libxml_use_internal_errors(true); - - $dom = new DOMDocument(); - $dom->loadHTML($htmlResponse); - - libxml_use_internal_errors($lastValue); - - return new DOMXPath($dom); - } - - private function getParagraphs(DOMXPath $xpath): DOMNodeList - { - return $xpath->query(self::XPATH_PARAGRAPHS); - } - - private function getPoster(DOMXPath $xpath): string - { - $sideBar = $xpath->query(self::XPATH_SIDEBAR)->item(0); - - if (!$sideBar) { - return ''; - } - - $images = $sideBar->getElementsByTagName('img'); - - if (!is_iterable($images)) { - return ''; - } - - foreach ($images as $image) { - foreach ($image->attributes as $name => $attribute) { - if ($name === 'data-src' && str_starts_with($attribute->textContent, self::POSTER_IMAGE_PREFIX)) { - return (string) $attribute->textContent; - } - } - } - - return ''; + return $this->domHelper->htmlToDom( + $this->client->get($this->buildLeagueUri($eventId)) + ); } private function getStartDateTime(IFSCSchedule $schedule, string $timezone): DateTimeImmutable @@ -160,26 +107,11 @@ private function buildDateRegex(): string $months = implode('|', Month::monthNames()); return "~ - ^(?:MONDAY|TUESDAY|WEDNESDAY|THURSDAY|FRIDAY|SATURDAY|SUNDAY), - \s+(?\d{1,2}) - \s+(?$months) - ~x"; - } - - /** @throws IFSCEventsScraperException */ - public function parseTimeFromSpan(DOMElement $span, int $eventId): string - { - $time = $this->trim($span->nodeValue); - - if (in_array($time, ['TBC', 'TBD'], strict: true)) { - // set arbitrary time for now. It will eventually update automatically - // once IFSC sets the correct time. Sometimes it's set to `TBC` or `TBD` - $time = '8:00'; - } elseif (!preg_match('~^\d{1,2}:\d{2}$~', $time)) { - throw IFSCEventsScraperException::timeParseExceptionForEventWithId($time, $eventId); - } - - return $time; + (?:MONDAY|TUESDAY|WEDNESDAY|THURSDAY|FRIDAY|SATURDAY|SUNDAY),\s+ + (?\d{1,2})\s+ + (?$months):[\r\n]* + (?((\d{1,2}:\d{2}|TBD|TBC)\s+[^\r\n]+[\r\n]*)+) + ~xsi"; } private function buildLeagueUri(int $id): string @@ -187,13 +119,20 @@ private function buildLeagueUri(int $id): string return sprintf(self::IFSC_EVENT_PAGE_URL, $id); } - private function leagueName(string $league): string + private function parseTimeAndName(string $line): array { - return ucwords(strtolower(trim($league))); - } + $parts = preg_split('~(\s{2,}|\s\W+\s)~', $line, flags: PREG_SPLIT_NO_EMPTY); - private function trim(string $string): string - { - return preg_replace(['~^\W+~', '~\W+$~'], '', trim($string)); + if (count($parts) >= 3) { + [$time, $eventName, $streamUrl] = $parts; + } else { + [$time, $eventName] = $parts; + } + + return [ + $time, + $eventName, + $streamUrl ?? '', + ]; } } diff --git a/tests/html/jakata_2023_malformed.html b/tests/html/jakata_2023_malformed.html new file mode 100644 index 0000000..5b9c3cb --- /dev/null +++ b/tests/html/jakata_2023_malformed.html @@ -0,0 +1,784 @@ + + + + + + + + + + + + + + + IFSC - Climbing World Cup (S) - Jakarta (INA) 2023 + + + + + + + + + + + + + + + +
+ +
+
+
+
+
+ +
+
+ +
+
+
+
+
    +
  • +
  • +
  • +
  • +
  • +
  • +
+
+
+
+
+
+
+
+ +
+
+
+
+
+
+

IFSC - Climbing World Cup (S) - Jakarta (INA) 2023

+

06 - 07 May 2023

+ +

Last year, the Lead and Speed seasons closed in Jakarta, Indonesia, with the city's downtown serving as the spectacular background for the second-to-last event of the IFSC World Cup Series 2022In 2023, the capital city of Indonesia is back on the calendar with a standalone Speed event. +

+

SCHEDULE (UTC+7:00):

+

SATURDAY, 6 MAY:
18:15   SPEED QUALIFICATIONS +

+

SUNDAY, 7 MAY:
20:00   SPEED FINALS

+
+

Lost? How to find us

+
+ + +
+
+

230415 Poster JAK23v2 + +

+
+
+
+
+
+
+
+
+
+
+
+
+

Member of

+
+
+
+
+
+
Social Media
+
+
+
+
+
+
+
+
INSTITUTIONAL
+ +
STAFF
+ +
Commissions
+ +
+ +
+
+
+
+
+
+ +
+
+
+
+
+
+
+

© 2019 ifsc-climbing.org  +   All Rights Reserved

+
+
+
+
+
+
+ + \ No newline at end of file diff --git a/tests/unit/Domain/Event/IFSCEventsScraperTest.php b/tests/unit/Domain/Event/IFSCEventsScraperTest.php index 5f932fd..6d03779 100644 --- a/tests/unit/Domain/Event/IFSCEventsScraperTest.php +++ b/tests/unit/Domain/Event/IFSCEventsScraperTest.php @@ -5,6 +5,9 @@ * @link https://github.com/nicoSWD * @author Nicolas Oelgart */ + +use nicoSWD\IfscCalendar\Domain\Event\Helpers\DOMHelper; +use nicoSWD\IfscCalendar\Domain\Event\Helpers\Normalizer; use nicoSWD\IfscCalendar\Domain\Event\IFSCEvent; use nicoSWD\IfscCalendar\Domain\Event\IFSCEventsScraper; use nicoSWD\IfscCalendar\Domain\HttpClient\HttpClientInterface; @@ -96,7 +99,7 @@ public function malformed_seoul_events_are_found(): void $this->assertSame('Boulder Semi-finals', $event5->name); $this->assertSame('2023-04-30T18:00:00+09:00', $this->formatDate($event5->startTime)); $this->assertSame('2023-04-30T21:00:00+09:00', $this->formatDate($event5->endTime)); - $this->assertSame('https://youtu.be/emrHdLsJTk4', $event5->streamUrl); + $this->assertSame('https://youtube.com/live/4ZfaojD52K4', $event5->streamUrl); } #[Test] @@ -127,6 +130,34 @@ public function well_formatted_jakata_events_are_found(): void $this->assertSame('', $event2->streamUrl); } + #[Test] + public function malformed_jakata_events_are_found(): void + { + $events = $this->fetchEventsFromFile( + fileName: 'jakata_2023_malformed.html', + timeZone: 'Asia/Jakarta', + eventName: 'IFSC - Climbing World Cup (S) - Jakarta (INA) 2023', + ); + + $this->assertCount(2, $events); + + [$event1, $event2] = $events; + + $this->assertSame(1249, $event1->id); + $this->assertSame('https://cdn.ifsc-climbing.org/images/Events/2023/230506_Jakarta_WC/230415_Poster_JAK23v2.jpg', $event1->poster); + $this->assertSame('IFSC - Climbing World Cup (S) - Jakarta (INA) 2023', $event1->description); + + $this->assertSame('Speed Qualifications', $event1->name); + $this->assertSame('2023-05-06T18:15:00+07:00', $this->formatDate($event1->startTime)); + $this->assertSame('2023-05-06T21:15:00+07:00', $this->formatDate($event1->endTime)); + $this->assertSame('', $event1->streamUrl); + + $this->assertSame('Speed Finals', $event2->name); + $this->assertSame('2023-05-07T20:00:00+07:00', $this->formatDate($event2->startTime)); + $this->assertSame('2023-05-07T23:00:00+07:00', $this->formatDate($event2->endTime)); + $this->assertSame('', $event2->streamUrl); + } + #[Test] public function well_formatted_salt_lake_city_events_are_found(): void { @@ -215,6 +246,8 @@ private function fetchEventsFromFile(string $fileName, string $timeZone, string { $eventScraper = new IFSCEventsScraper( $this->mockClientReturningFile($fileName), + new DOMHelper(), + new Normalizer(), ); return $eventScraper->fetchEventsForLeague(