Skip to content

Commit

Permalink
Implemented maxDurationInSeconds()
Browse files Browse the repository at this point in the history
  • Loading branch information
Toflar committed Nov 9, 2023
1 parent 8417892 commit c796c02
Show file tree
Hide file tree
Showing 4 changed files with 124 additions and 13 deletions.
1 change: 1 addition & 0 deletions composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
"nyholm/psr7": "^1.1",
"psr/http-message": "^1.0 || ^2.0",
"psr/log": "^1.1 || ^2.0 || ^3.0",
"symfony/clock": "^6.2",
"symfony/dom-crawler": "^5.4 || ^6.0",
"symfony/event-dispatcher": "^5.4 || ^6.0",
"symfony/http-client": "^5.4 || ^6.0",
Expand Down
64 changes: 61 additions & 3 deletions src/Escargot.php
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,12 @@

namespace Terminal42\Escargot;

use Nyholm\Psr7\Uri;
use Psr\Http\Message\UriInterface;
use Psr\Log\LoggerAwareInterface;
use Psr\Log\LoggerInterface;
use Psr\Log\LogLevel;
use Symfony\Component\Clock\ClockInterface;
use Symfony\Component\Clock\NativeClock;
use Symfony\Component\HttpClient\HttpClient;
use Symfony\Contracts\HttpClient\ChunkInterface;
use Symfony\Contracts\HttpClient\Exception\ExceptionInterface;
Expand All @@ -41,6 +42,11 @@ final class Escargot
*/
private $queue;

/**
* @var ClockInterface
*/
private $clock;

/**
* @var string
*/
Expand Down Expand Up @@ -81,6 +87,16 @@ final class Escargot
*/
private $maxRequests = 0;

/**
* Maximum duration in seconds that
* Escargot is going to work on requests.
*
* 0 means no limit.
*
* @var int
*/
private $maxDurationInSeconds = 0;

/**
* Request delay in microseconds.
* 0 means no delay.
Expand Down Expand Up @@ -125,12 +141,15 @@ final class Escargot
*/
private $decisionMap = ['shouldRequest' => [], 'needsContent' => []];

private \DateTimeImmutable $startTime;

/**
 * Stores the queue, job ID and base URIs this crawler operates on and
 * initialises the clock and user agent with their defaults.
 */
private function __construct(QueueInterface $queue, string $jobId, BaseUriCollection $baseUris)
{
    // Defaults first: a native clock and the default user agent.
    $this->userAgent = self::DEFAULT_USER_AGENT;
    $this->clock = new NativeClock();

    // Then the collaborators handed in by the caller.
    $this->baseUris = $baseUris;
    $this->jobId = $jobId;
    $this->queue = $queue;
}

Expand Down Expand Up @@ -187,6 +206,22 @@ public function withMaxRequests(int $maxRequests): self
return $new;
}

/**
 * Returns a clone of this crawler configured with the given maximum
 * crawl duration in seconds (0 means no limit). The current instance
 * is not modified.
 */
public function withMaxDurationInSeconds(int $maxDurationInSeconds): self
{
    $copy = clone $this;
    $copy->maxDurationInSeconds = $maxDurationInSeconds;

    return $copy;
}

/**
 * Returns a clone of this crawler that uses the given clock for reading
 * the current time and for sleeping. The current instance is not modified.
 */
public function withClock(ClockInterface $clock): self
{
    $copy = clone $this;
    $copy->clock = $clock;

    return $copy;
}

public function withConcurrency(int $concurrency): self
{
$new = clone $this;
Expand Down Expand Up @@ -320,6 +355,8 @@ public static function create(BaseUriCollection $baseUris, QueueInterface $queue

public function crawl(): void
{
$this->startTime = $this->clock->now();

while (true) {
$responses = $this->prepareResponses();

Expand Down Expand Up @@ -529,8 +566,20 @@ private function prepareResponses(): array
{
$responses = [];

$hasMaxRequestsReached = $this->isMaxRequestsReached();
$hasMaxDurationReached = $this->isMaxDurationInSecondsReached();

if ($hasMaxRequestsReached) {
$this->log(LogLevel::DEBUG, 'Configured max requests reached!');
}

if ($hasMaxDurationReached) {
$this->log(LogLevel::DEBUG, 'Configured max duration reached!');
}

while (!$this->isMaxConcurrencyReached()
&& !$this->isMaxRequestsReached()
&& !$hasMaxRequestsReached
&& !$hasMaxDurationReached
&& ($crawlUri = $this->queue->getNext($this->jobId))
) {
// Already processed, ignore
Expand Down Expand Up @@ -570,7 +619,7 @@ private function prepareResponses(): array

// Request delay
if (0 !== $this->requestDelay) {
usleep($this->requestDelay);
$this->clock->sleep($this->requestDelay / 1000000);
}

try {
Expand Down Expand Up @@ -604,6 +653,15 @@ private function isMaxRequestsReached(): bool
return 0 !== $this->maxRequests && $this->requestsSent >= $this->maxRequests;
}

/**
 * Tells whether the configured maximum crawl duration has elapsed.
 *
 * A limit of 0 means "no limit". A negative limit is treated as already
 * exhausted rather than being handed to \DateInterval, whose constructor
 * rejects duration specs such as "PT-5S" with an exception. This mirrors
 * isMaxRequestsReached(), where a negative limit also counts as reached.
 *
 * @return bool true once the clock has reached start time + max duration
 */
private function isMaxDurationInSecondsReached(): bool
{
    if (0 === $this->maxDurationInSeconds) {
        return false;
    }

    // Guard against building an invalid ISO 8601 duration below.
    if ($this->maxDurationInSeconds < 0) {
        return true;
    }

    $deadline = $this->startTime->add(new \DateInterval('PT' . $this->maxDurationInSeconds . 'S'));

    return $this->clock->now() >= $deadline;
}

private function isMaxConcurrencyReached(): bool
{
return \count($this->runningRequests) >= $this->concurrency;
Expand Down
71 changes: 61 additions & 10 deletions tests/EscargotTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
use Psr\Log\LoggerInterface;
use Psr\Log\LogLevel;
use Psr\Log\Test\TestLogger;
use Symfony\Component\Clock\MockClock;
use Symfony\Component\Finder\Finder;
use Symfony\Component\HttpClient\MockHttpClient;
use Symfony\Contracts\HttpClient\ChunkInterface;
Expand All @@ -37,6 +38,7 @@
use Terminal42\Escargot\Subscriber\TagValueResolvingSubscriberInterface;
use Terminal42\Escargot\SubscriberLogger;
use Terminal42\Escargot\SubscriberLoggerTrait;
use Terminal42\Escargot\Tests\Scenario\MockResponseFactory;
use Terminal42\Escargot\Tests\Scenario\Scenario;

class EscargotTest extends TestCase
Expand Down Expand Up @@ -180,6 +182,50 @@ public function resolveTagValue(string $tag)
$this->assertSame('success', $escargot->resolveTagValue('foobar'));
}

/**
 * Verifies that crawling stops once the configured maximum duration is reached.
 *
 * The mocked HTTP client advances the injected MockClock by one second for
 * every request it serves. With a limit of five seconds, the crawler is
 * expected to log that the max duration was reached and to report five
 * sent requests.
 */
public function testMaxDuration(): void
{
// Response template; the %s placeholder is filled per request (see the
// client callback below) so that each page links to a different URI.
$mockResponse = <<<HTML
HTTP/2.0 200 OK
content-type: text/html; charset=UTF-8
<html>
<head>
</head>
<body>
<a href="https://www.terminal42.ch/%s">Link</a>
</body>
</html>
HTML;

$baseUris = new BaseUriCollection();
$baseUris->add(new Uri('https://www.terminal42.ch'));
$queue = new InMemoryQueue();
$clock = new MockClock();
$client = new MockHttpClient(function ($method, $url) use ($clock, $mockResponse) {
$clock->sleep(1); // Mock the request that takes a second to complete

// A unique link target per response; presumably this keeps the crawler
// discovering new URIs so only the duration limit can end the crawl.
return MockResponseFactory::createFromString(sprintf($mockResponse, uniqid()));
});
$logger = new TestLogger();

// The same MockClock instance is shared between the HTTP client callback
// and the crawler, so the crawler observes the simulated request time.
$escargot = Escargot::create($baseUris, $queue)
->withLogger($logger)
->withHttpClient($client)
->withMaxDurationInSeconds(5)
->withClock($clock)
;

$escargot->addSubscriber(new HtmlCrawlerSubscriber());
$escargot->addSubscriber($this->getSearchIndexSubscriber());

$escargot->crawl();

// Exactly these two log lines are expected after cleaning the records.
$this->assertSame([
'[Terminal42\Escargot\Escargot] Configured max duration reached!',
'[Terminal42\Escargot\Escargot] Finished crawling! Sent 5 request(s).',
], $this->cleanLogs($logger));
}

/**
* @dataProvider crawlProvider
*/
Expand Down Expand Up @@ -216,7 +262,20 @@ public function testCrawlAsWebCrawler(\Closure $responseFactory, array $expected

$escargot->crawl();

$filteredLogs = array_map(function (array $record) {
$filteredLogs = $this->cleanLogs($logger);

$this->assertSame($expectedLogs, $filteredLogs, $message);

$filteredRequests = array_map(function (CrawlUri $crawlUri) {
return sprintf('Successful request! %s.', (string) $crawlUri);
}, $indexerSubscriber->getUris());

$this->assertSame($expectedRequests, $filteredRequests, $message);
}

private function cleanLogs(TestLogger $testLogger): array
{
return array_map(function (array $record) {
$message = $record['message'];

if (isset($record['context']['crawlUri'])) {
Expand All @@ -228,15 +287,7 @@ public function testCrawlAsWebCrawler(\Closure $responseFactory, array $expected
}

return $message;
}, $logger->records);

$this->assertSame($expectedLogs, $filteredLogs, $message);

$filteredRequests = array_map(function (CrawlUri $crawlUri) {
return sprintf('Successful request! %s.', (string) $crawlUri);
}, $indexerSubscriber->getUris());

$this->assertSame($expectedRequests, $filteredRequests, $message);
}, $testLogger->records);
}

public function crawlProvider(): \Generator
Expand Down
1 change: 1 addition & 0 deletions tests/Fixtures/scenario6/_logs.txt
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
[Terminal42\Escargot\Escargot] Configured max requests reached!
[Terminal42\Escargot\Escargot] Finished crawling! Sent 2 request(s).

0 comments on commit c796c02

Please sign in to comment.