-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparse-names.php
84 lines (69 loc) · 2.45 KB
/
parse-names.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
<?php
/**
* Parser to get translated names of divisions from Wikipedia.
*
* To execute run the command in shell `php parse-names.php`
* The result will be saved in the file `result/names.csv`
*
* @link https://github.com/tigrov/wikipedia-divisions
* @author Sergei Tigrov <[email protected]>
*/
require(__DIR__ . '/simple_html_dom.php');
define('RESULT_DIR', __DIR__ . DIRECTORY_SEPARATOR . 'result');
define('CSV_DELIMITER', ';');

// CSV file headers
$namesHeader = ['ISO-3166-1', 'ISO-3166-2', 'language_code', 'value', 'wikipedia'];

$namesPath = RESULT_DIR . DIRECTORY_SEPARATOR . 'names.csv';
$divisionsPath = RESULT_DIR . DIRECTORY_SEPARATOR . 'divisions.csv';

// Open the input first: if divisions.csv is missing we must not truncate
// an existing names.csv by opening it with mode 'w'.
$divisionsCsv = fopen($divisionsPath, 'r');
if ($divisionsCsv === false) {
    fwrite(STDERR, 'Cannot open ' . $divisionsPath . ' for reading' . PHP_EOL);
    exit(1);
}

$namesCsv = fopen($namesPath, 'w');
if ($namesCsv === false) {
    fclose($divisionsCsv);
    fwrite(STDERR, 'Cannot open ' . $namesPath . ' for writing' . PHP_EOL);
    exit(1);
}

fputcsv($namesCsv, $namesHeader, CSV_DELIMITER);

// Skip headers
fgetcsv($divisionsCsv, 0, CSV_DELIMITER);

// Length 0 means "no line length limit", so rows with long URLs are not
// split into several bogus records. Comparing against false (instead of
// relying on truthiness) makes the end-of-file condition explicit.
while (($division = fgetcsv($divisionsCsv, 0, CSV_DELIMITER)) !== false) {
    // Blank lines and short rows have no URL column (index 4); skip them.
    if (!isset($division[4])) {
        continue;
    }

    $countryCode = $division[0];
    $divisionCode = $division[1];
    $url = $division[4];

    // Rows without a URL, or whose URL contains 'redlink=1', are not fetched.
    if (!$url || strpos($url, 'redlink=1') !== false) {
        continue;
    }

    echo $countryCode . '-' . $divisionCode . ': ' . $url . PHP_EOL;

    /** @var simple_html_dom|false $html */
    $html = file_get_html($url, null, null, null);
    if (!$html) {
        // The page could not be loaded or parsed; report it and carry on
        // with the remaining divisions instead of crashing in GetNames().
        fwrite(STDERR, 'Failed to load ' . $url . PHP_EOL);
        continue;
    }

    if ($names = GetNames($html)) {
        foreach ($names as $langCode => $langData) {
            fputcsv(
                $namesCsv,
                [$countryCode, $divisionCode, $langCode, $langData['name'], $langData['url']],
                CSV_DELIMITER
            );
        }
    }

    // Release the parsed document before fetching the next page so memory
    // does not grow with every iteration of this long-running loop.
    $html->clear();
    unset($html);
}

fclose($namesCsv);
fclose($divisionsCsv);
/**
 * Collects the interlanguage links of a parsed page.
 *
 * Looks for the `div[id=p-lang]` block and, inside it, every
 * `a[class=interlanguage-link-target]` link. For each link the language code
 * is taken from the `lang` attribute (falling back to `hreflang`) and the name
 * is derived from the `title` attribute.
 *
 * @param simple_html_dom_node|false|null $node parsed page; any non-object
 *        value (e.g. the false of a failed download) yields null
 * @return null|array map of language code => ['name' => string, 'url' => string],
 *         or null when the node is unusable or no links are found
 */
function GetNames($node) {
    // Guard against a failed page load: calling find() on false/null is fatal.
    if (!is_object($node)) {
        return null;
    }

    /** @var simple_html_dom_node $block */
    $block = $node->find('div[id=p-lang]', 0);
    if (!$block) {
        return null;
    }

    /** @var simple_html_dom_node[] $links */
    $links = $block->find('a[class=interlanguage-link-target]');
    if (!$links) {
        return null;
    }

    $list = [];
    foreach ($links as $link) {
        $langCode = $link->lang ?: $link->hreflang;
        if (!$langCode) {
            // Without a language code the entry cannot be keyed meaningfully
            // (it would end up under the array key 0), so leave it out.
            continue;
        }

        // Keep only the part of the title before an em dash or en dash
        // separator, then before the first comma.
        list($name) = explode(' — ', (string) $link->title, 2);
        list($name) = explode(' – ', $name, 2);
        list($name) = explode(',', $name, 2);

        // Drop parenthesised text from the name.
        $name = preg_replace('~\([^)]*\)~S', '', $name);

        $list[$langCode] = [
            'name' => trim($name),
            'url' => $link->href
        ];
    }

    return $list;
}