-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
first commit create dicio classes and tests
- Loading branch information
1 parent
42721da
commit c68c0b1
Showing
7 changed files
with
419 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,5 @@ | ||
composer.phar | ||
/vendor/ | ||
|
||
# Commit your application's lock file https://getcomposer.org/doc/01-basic-usage.md#commit-your-composer-lock-file-to-version-control | ||
# You may choose to ignore a library lock file http://getcomposer.org/doc/02-libraries.md#lock-file | ||
# composer.lock | ||
/vendor | ||
composer.lock | ||
.DS_Store | ||
.idea/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
{ | ||
"name": "arthurtavaresdev/dicio-php", | ||
"description": "This project is a crawler of the website dicio.com.br, a dictionary in Portuguese", | ||
"license": "MIT", | ||
"authors": [ | ||
{ | ||
"name": "Arthur Tavares", | ||
"email": "[email protected]" | ||
} | ||
], | ||
"require": { | ||
"php": ">=7.2.5", | ||
"symfony/dom-crawler": "^5.1" , | ||
"symfony/css-selector" : "^5.0", | ||
"guzzlehttp/guzzle": "^7.1@dev" | ||
}, | ||
"require-dev": { | ||
"phpunit/phpunit": "^8.4" | ||
}, | ||
"autoload": { | ||
"classmap": [ | ||
"src" | ||
], | ||
"psr-4": { | ||
"arthurtavaresdev\\Dicio\\": "src" | ||
} | ||
}, | ||
"autoload-dev": { | ||
"psr-4": { | ||
"arthurtavaresdev\\Dicio\\Tests\\": "tests" | ||
} | ||
}, | ||
"minimum-stability": "dev" | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
<?php | ||
|
||
// Load Composer's autoloader relative to this file. A bare 'vendor/autoload.php'
// is looked up through include_path and the current working directory first,
// which can pick up another project's autoloader when the script is started
// from a different directory.
require_once __DIR__ . '/vendor/autoload.php';

// Look up a sample word and dump the resulting object
// (meaning, etymology, synonyms, examples, extras).
$dicio = new ArthurTavaresDev\Dicio\Dicio();

$word = $dicio->search('doce');

print_r($word);
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
<?php | ||
|
||
|
||
namespace ArthurTavaresDev\Dicio; | ||
|
||
use GuzzleHttp\Exception\GuzzleException; | ||
use Symfony\Component\DomCrawler\Crawler as SymfonyCrawler; | ||
use GuzzleHttp\Client; | ||
|
||
|
||
class Crawler
{
    /**
     * Fetch a page with an HTTP GET request and wrap the response body in a
     * Symfony DomCrawler.
     *
     * TLS certificate verification is left at Guzzle's default (enabled).
     * Callers that really need to turn it off (e.g. a broken local CA bundle)
     * can pass ['verify' => false] in $options explicitly.
     *
     * @param string $url     URL to request.
     * @param array  $params  Query-string parameters sent with the request.
     * @param array  $options Extra Guzzle client options (e.g. 'verify', 'timeout').
     * @return SymfonyCrawler Crawler built from the response body.
     * @throws GuzzleException When the HTTP request fails.
     */
    public static function page(string $url, array $params = [], array $options = []): SymfonyCrawler
    {
        $client = new Client($options);
        $response = $client->get($url, ['query' => $params]);

        // Casting the stream reads it from the start, independent of the
        // current stream position.
        return new SymfonyCrawler((string) $response->getBody());
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,127 @@ | ||
<?php | ||
|
||
namespace ArthurTavaresDev\Dicio; | ||
|
||
use Exception; | ||
use GuzzleHttp\Exception\GuzzleException; | ||
use stdClass; | ||
use Symfony\Component\DomCrawler\Crawler as SymfonyCrawler; | ||
|
||
/** | ||
* Unofficial PHP API for Dicio.com.br | ||
* Class Dicio | ||
* @author Arthur Tavares <[email protected]> | ||
* @package Arthurtavaresdev\Dicio | ||
*/ | ||
class Dicio
{
    const BASE_URL = 'http://www.dicio.com.br/';
    const HTML_SELECTOR_MEANING = '.significado';
    const HTML_SELECTOR_ETYMOLOGY = '.etim';
    const HTML_SELECTOR_SYNONYMS = '.adicional.sinonimos .wrapper';
    const HTML_SELECTOR_EXTRA = '.adicional';
    const HTML_SELECTOR_PHRASE = '.frases .frase';

    /**
     * Search for a word.
     *
     * Downloads the word's page and returns an object with the properties
     * `meaning`, `etymology`, `synonyms`, `examples` and `extras`.
     *
     * @param string $word Word to look up; it is normalised with Utils::clear_string().
     * @return stdClass
     * @throws Exception When the word is empty after normalisation.
     * @throws GuzzleException When the HTTP request fails.
     */
    public function search(string $word): stdClass
    {
        // Validate the normalised word: whitespace-only input is rejected,
        // while a literal "0" (which empty() treats as empty) is accepted.
        $slug = Utils::clear_string($word);
        if ($slug === '') {
            throw new Exception('Word not found');
        }

        // Encode the slug so spaces, slashes or leftover non-ASCII bytes
        // cannot alter the request path.
        $url = Utils::format_url(self::BASE_URL) . rawurlencode($slug);

        $page = Crawler::page($url);

        return (object) [
            'meaning' => $this->meaning($page),
            'etymology' => $this->etymology($page),
            'synonyms' => $this->synonyms($page),
            'examples' => $this->examples($page),
            'extras' => $this->extras($page),
        ];
    }

    /**
     * Return the text of every <span> that directly follows a <br> inside
     * the meaning container.
     *
     * NOTE(review): the selector does not exclude spans by class, so a
     * class/etymology span that follows a <br> is included as well —
     * confirm whether those should be filtered out.
     *
     * @param SymfonyCrawler $page
     * @return array List of non-empty strings.
     */
    public function meaning(SymfonyCrawler $page): array
    {
        $spans = $page->filter(self::HTML_SELECTOR_MEANING)->filter('br+span');

        return $this->nonEmptyTexts($spans);
    }

    /**
     * Return the etymology text, or an empty string when the page has none.
     *
     * @param SymfonyCrawler $page
     * @return string
     */
    public function etymology(SymfonyCrawler $page): string
    {
        // A non-null default keeps text() from throwing on an empty node list.
        return trim($page->filter(self::HTML_SELECTOR_ETYMOLOGY)->text(''));
    }

    /**
     * Return the list of synonyms.
     *
     * The synonyms container's text is split on commas; an empty list is
     * returned when the container is missing or empty.
     *
     * @param SymfonyCrawler $page
     * @return array List of non-empty strings.
     */
    public function synonyms(SymfonyCrawler $page): array
    {
        $text = $page->filter(self::HTML_SELECTOR_SYNONYMS)->text('');
        $synonyms = array_map('trim', explode(',', $text));

        return array_values(array_filter($synonyms, static function (string $synonym): bool {
            return $synonym !== '';
        }));
    }

    /**
     * Return the list of example phrases.
     *
     * @param SymfonyCrawler $page
     * @return array List of non-empty strings.
     */
    public function examples(SymfonyCrawler $page): array
    {
        return $this->nonEmptyTexts($page->filter(self::HTML_SELECTOR_PHRASE));
    }

    /**
     * Return the text of every <span> found inside the extra-information
     * containers.
     *
     * @param SymfonyCrawler $page
     * @return array List of non-empty strings.
     */
    public function extras(SymfonyCrawler $page): array
    {
        $spans = $page->filter(self::HTML_SELECTOR_EXTRA)->filter('span');

        return $this->nonEmptyTexts($spans);
    }

    /**
     * Collect the trimmed text of each node, dropping empty entries.
     *
     * Crawler::each() gathers every closure return value, so empty texts have
     * to be filtered out afterwards instead of being "skipped" by returning
     * false from the closure.
     *
     * @param SymfonyCrawler $nodes
     * @return array Re-indexed list of non-empty strings.
     */
    private function nonEmptyTexts(SymfonyCrawler $nodes): array
    {
        $texts = $nodes->each(static function (SymfonyCrawler $node): string {
            return trim($node->text(''));
        });

        // Strict comparison so a literal "0" is kept.
        return array_values(array_filter($texts, static function (string $text): bool {
            return $text !== '';
        }));
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,129 @@ | ||
<?php | ||
|
||
namespace ArthurTavaresDev\Dicio; | ||
|
||
class Utils
{
    /**
     * Normalise a string: replace accented letters with ASCII equivalents,
     * lower-case it and strip surrounding whitespace.
     *
     * @param string $string
     * @return string
     */
    public static function clear_string(string $string): string
    {
        $clear_string = self::remove_accents($string);

        return trim(strtolower($clear_string));
    }

    /**
     * Normalise a URL with clear_string() and make sure it ends with exactly
     * one "/".
     *
     * The trailing slash is decided on the normalised value, so surrounding
     * whitespace in the input can no longer produce a doubled slash.
     *
     * @param string $url
     * @return string
     */
    public static function format_url(string $url): string
    {
        return rtrim(self::clear_string($url), '/') . '/';
    }

    /**
     * Replace accented Latin-1 Supplement and Latin Extended-A characters
     * (UTF-8 encoded) with plain ASCII letters.
     *
     * Strings without any byte in the 0x80-0xFF range are returned unchanged.
     *
     * @param string $string UTF-8 encoded text.
     * @return string
     */
    public static function remove_accents(string $string): string
    {
        // Fast path: pure ASCII input has nothing to replace.
        if (!preg_match('/[\x80-\xff]/', $string)) {
            return $string;
        }

        // Keys are the raw UTF-8 byte sequences of each accented character.
        $chars = [
            // Decompositions for Latin-1 Supplement
            chr(195).chr(128) => 'A', chr(195).chr(129) => 'A',
            chr(195).chr(130) => 'A', chr(195).chr(131) => 'A',
            chr(195).chr(132) => 'A', chr(195).chr(133) => 'A',
            chr(195).chr(135) => 'C', chr(195).chr(136) => 'E',
            chr(195).chr(137) => 'E', chr(195).chr(138) => 'E',
            chr(195).chr(139) => 'E', chr(195).chr(140) => 'I',
            chr(195).chr(141) => 'I', chr(195).chr(142) => 'I',
            chr(195).chr(143) => 'I', chr(195).chr(145) => 'N',
            chr(195).chr(146) => 'O', chr(195).chr(147) => 'O',
            chr(195).chr(148) => 'O', chr(195).chr(149) => 'O',
            chr(195).chr(150) => 'O', chr(195).chr(152) => 'O',
            chr(195).chr(153) => 'U',
            chr(195).chr(154) => 'U', chr(195).chr(155) => 'U',
            chr(195).chr(156) => 'U', chr(195).chr(157) => 'Y',
            chr(195).chr(159) => 's', chr(195).chr(160) => 'a',
            chr(195).chr(161) => 'a', chr(195).chr(162) => 'a',
            chr(195).chr(163) => 'a', chr(195).chr(164) => 'a',
            chr(195).chr(165) => 'a', chr(195).chr(167) => 'c',
            chr(195).chr(168) => 'e', chr(195).chr(169) => 'e',
            chr(195).chr(170) => 'e', chr(195).chr(171) => 'e',
            chr(195).chr(172) => 'i', chr(195).chr(173) => 'i',
            chr(195).chr(174) => 'i', chr(195).chr(175) => 'i',
            chr(195).chr(177) => 'n', chr(195).chr(178) => 'o',
            chr(195).chr(179) => 'o', chr(195).chr(180) => 'o',
            chr(195).chr(181) => 'o', chr(195).chr(182) => 'o',
            // U+00F8 (o with stroke); this slot previously repeated chr(182).
            chr(195).chr(184) => 'o', chr(195).chr(185) => 'u',
            chr(195).chr(186) => 'u', chr(195).chr(187) => 'u',
            chr(195).chr(188) => 'u', chr(195).chr(189) => 'y',
            chr(195).chr(191) => 'y',
            // Decompositions for Latin Extended-A
            chr(196).chr(128) => 'A', chr(196).chr(129) => 'a',
            chr(196).chr(130) => 'A', chr(196).chr(131) => 'a',
            chr(196).chr(132) => 'A', chr(196).chr(133) => 'a',
            chr(196).chr(134) => 'C', chr(196).chr(135) => 'c',
            chr(196).chr(136) => 'C', chr(196).chr(137) => 'c',
            chr(196).chr(138) => 'C', chr(196).chr(139) => 'c',
            chr(196).chr(140) => 'C', chr(196).chr(141) => 'c',
            chr(196).chr(142) => 'D', chr(196).chr(143) => 'd',
            chr(196).chr(144) => 'D', chr(196).chr(145) => 'd',
            chr(196).chr(146) => 'E', chr(196).chr(147) => 'e',
            chr(196).chr(148) => 'E', chr(196).chr(149) => 'e',
            chr(196).chr(150) => 'E', chr(196).chr(151) => 'e',
            chr(196).chr(152) => 'E', chr(196).chr(153) => 'e',
            chr(196).chr(154) => 'E', chr(196).chr(155) => 'e',
            chr(196).chr(156) => 'G', chr(196).chr(157) => 'g',
            chr(196).chr(158) => 'G', chr(196).chr(159) => 'g',
            chr(196).chr(160) => 'G', chr(196).chr(161) => 'g',
            chr(196).chr(162) => 'G', chr(196).chr(163) => 'g',
            chr(196).chr(164) => 'H', chr(196).chr(165) => 'h',
            chr(196).chr(166) => 'H', chr(196).chr(167) => 'h',
            chr(196).chr(168) => 'I', chr(196).chr(169) => 'i',
            chr(196).chr(170) => 'I', chr(196).chr(171) => 'i',
            chr(196).chr(172) => 'I', chr(196).chr(173) => 'i',
            chr(196).chr(174) => 'I', chr(196).chr(175) => 'i',
            chr(196).chr(176) => 'I', chr(196).chr(177) => 'i',
            chr(196).chr(178) => 'IJ', chr(196).chr(179) => 'ij',
            chr(196).chr(180) => 'J', chr(196).chr(181) => 'j',
            chr(196).chr(182) => 'K', chr(196).chr(183) => 'k',
            chr(196).chr(184) => 'k', chr(196).chr(185) => 'L',
            chr(196).chr(186) => 'l', chr(196).chr(187) => 'L',
            chr(196).chr(188) => 'l', chr(196).chr(189) => 'L',
            chr(196).chr(190) => 'l', chr(196).chr(191) => 'L',
            chr(197).chr(128) => 'l', chr(197).chr(129) => 'L',
            chr(197).chr(130) => 'l', chr(197).chr(131) => 'N',
            chr(197).chr(132) => 'n', chr(197).chr(133) => 'N',
            chr(197).chr(134) => 'n', chr(197).chr(135) => 'N',
            // U+0149 is a lower-case letter, U+014A/U+014B are upper/lower
            // Eng; the replacements follow the case of the source character.
            chr(197).chr(136) => 'n', chr(197).chr(137) => 'n',
            chr(197).chr(138) => 'N', chr(197).chr(139) => 'n',
            chr(197).chr(140) => 'O', chr(197).chr(141) => 'o',
            chr(197).chr(142) => 'O', chr(197).chr(143) => 'o',
            chr(197).chr(144) => 'O', chr(197).chr(145) => 'o',
            chr(197).chr(146) => 'OE', chr(197).chr(147) => 'oe',
            chr(197).chr(148) => 'R', chr(197).chr(149) => 'r',
            chr(197).chr(150) => 'R', chr(197).chr(151) => 'r',
            chr(197).chr(152) => 'R', chr(197).chr(153) => 'r',
            chr(197).chr(154) => 'S', chr(197).chr(155) => 's',
            chr(197).chr(156) => 'S', chr(197).chr(157) => 's',
            chr(197).chr(158) => 'S', chr(197).chr(159) => 's',
            chr(197).chr(160) => 'S', chr(197).chr(161) => 's',
            chr(197).chr(162) => 'T', chr(197).chr(163) => 't',
            chr(197).chr(164) => 'T', chr(197).chr(165) => 't',
            chr(197).chr(166) => 'T', chr(197).chr(167) => 't',
            chr(197).chr(168) => 'U', chr(197).chr(169) => 'u',
            chr(197).chr(170) => 'U', chr(197).chr(171) => 'u',
            chr(197).chr(172) => 'U', chr(197).chr(173) => 'u',
            chr(197).chr(174) => 'U', chr(197).chr(175) => 'u',
            chr(197).chr(176) => 'U', chr(197).chr(177) => 'u',
            chr(197).chr(178) => 'U', chr(197).chr(179) => 'u',
            chr(197).chr(180) => 'W', chr(197).chr(181) => 'w',
            chr(197).chr(182) => 'Y', chr(197).chr(183) => 'y',
            chr(197).chr(184) => 'Y', chr(197).chr(185) => 'Z',
            chr(197).chr(186) => 'z', chr(197).chr(187) => 'Z',
            chr(197).chr(188) => 'z', chr(197).chr(189) => 'Z',
            chr(197).chr(190) => 'z', chr(197).chr(191) => 's',
        ];

        return strtr($string, $chars);
    }
}
Oops, something went wrong.