-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraper.php
40 lines (35 loc) · 1.34 KB
/
scraper.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
<?php
require 'scraperwiki.php';
/**************************************
* Basic PHP scraper
**************************************/
// Download the chart page and remove its line breaks. The row pattern below
// has no "s" modifier, so "." would not match across newlines otherwise.
$html = oneline(scraperwiki::scrape("http://www.imdb.com/chart/top"));

// Matches one table row of the chart. Capture groups, as consumed below:
// 1 = rank, 2 = rating, 3 = link path, 4 = title, 5 = year.
$rowPattern = '|<tr bgcolor="#.*?" valign="top"><td align="right"><font face="Arial, Helvetica, sans-serif" size="-1"><b>(.*?)\.</b></font></td><td align="center"><font face="Arial, Helvetica, sans-serif" size="-1">(.*?)</font></td><td><font face="Arial, Helvetica, sans-serif" size="-1"><a href="(.*?)">(.*?)</a> \((.*?)\)</font></td><td align="right"><font face="Arial, Helvetica, sans-serif" size="-1">.*?</font></td></tr>|';

// PREG_SET_ORDER groups the captures per matched row, so each iteration
// below sees all five fields belonging to a single entry.
preg_match_all($rowPattern, $html, $rows, PREG_SET_ORDER);

foreach ($rows as $row) {
    $record = [
        'rank'   => (string) clean($row[1]),
        'rating' => clean($row[2]),
        'name'   => clean($row[4]),
        'year'   => clean($row[5]),
        'link'   => clean('http://www.imdb.com' . $row[3]),
    ];

    // NOTE(review): the first argument is presumably the list of unique-key
    // columns for the datastore -- confirm against scraperwiki.php.
    scraperwiki::save(['rank'], $record);
}
/**
 * Turn a scraped HTML fragment into a trimmed plain-text value.
 *
 * Steps, in order:
 *  1. Decode HTML entities (named and numeric) as UTF-8.
 *  2. Strip any HTML tags.
 *  3. Replace non-breaking spaces (U+00A0) with ordinary spaces.
 *  4. Trim surrounding whitespace.
 *  5. Convert the UTF-8 text to ISO-8859-1 (characters outside that range
 *     become "?").
 *
 * The previous version began with two str_replace() calls whose search and
 * replacement strings were identical, so they had no effect; in particular
 * "&nbsp;" was decoded to U+00A0, which trim() does not remove, and values
 * came back padded with \xA0 bytes.
 *
 * @param string $val Raw HTML fragment (expected to be UTF-8).
 *
 * @return string Plain text encoded as ISO-8859-1.
 */
function clean($val)
{
    // Flags and charset are spelled out because the defaults differ between
    // PHP versions (single-quote entities were not decoded before 8.1), and
    // the conversion at the end requires the intermediate text to be UTF-8.
    $val = html_entity_decode($val, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML401, 'UTF-8');
    $val = strip_tags($val);

    // "\xC2\xA0" is U+00A0 in UTF-8. Doing this after decoding catches
    // "&nbsp;", "&#160;" and literal non-breaking spaces alike, and lets
    // trim() remove them at the edges.
    $val = str_replace("\xC2\xA0", ' ', $val);
    $val = trim($val);

    // utf8_decode() is deprecated as of PHP 8.2; prefer the mbstring
    // equivalent when that extension is loaded, and fall back otherwise.
    if (function_exists('mb_convert_encoding')) {
        return mb_convert_encoding($val, 'ISO-8859-1', 'UTF-8');
    }

    return utf8_decode($val);
}
/**
 * Collapse text onto a single line by deleting every line-feed and
 * carriage-return character. Nothing is inserted in their place.
 *
 * @param string $code Text that may contain "\n" and/or "\r".
 *
 * @return string The same text with all "\n" and "\r" removed.
 */
function oneline($code)
{
    return str_replace(["\n", "\r"], '', $code);
}