Browse Source

TASK: Filter unnecessary URLs.

feature/kickstart
Daniel Siepmann 4 years ago
parent
commit
c35ea7c8ef
Signed by: Daniel Siepmann
GPG Key ID: 33D6629915560EF4
  1. 20
      src/Service/Screenshot/CrawlerService.php

20
src/Service/Screenshot/CrawlerService.php

@ -57,6 +57,7 @@ class CrawlerService
public function crawl(UrlListDto $linkList)
{
while ($url = $linkList->getNextUrl()) {
$uri = new Uri($url);
$this->driver->get($url);
$screenshotHeight = $this->driver->findElement(WebDriverBy::cssSelector('body'))
->getSize()
@ -114,7 +115,24 @@ class CrawlerService
'',
(new Uri($this->baseUrl))->getHost(),
];
$validSchemas = [
'http',
'https',
];
$invalidFileExtensions = [
'.pdf',
'.jpg',
'.gif',
'.svg',
];
$pathEnding = substr($uri->getPath(), -4);
return in_array($uri->getHost(), $validHosts);
return in_array($uri->getHost(), $validHosts)
&& in_array($uri->getScheme(), $validSchemas)
&& !in_array($pathEnding, $invalidFileExtensions)
&& strpos((string) $uri, 'eID=') === false
;
}
}

Loading…
Cancel
Save