-
-
Notifications
You must be signed in to change notification settings - Fork 442
API QueryList
Jaeger edited this page Dec 15, 2017
·
9 revisions
-
QL\QueryList
-
Methods
- __construct
- getInstance * Examples
- config * Examples
- destruct * Examples
- bind * Arguments * Examples
- getHtml * Examples
- setHtml * Arguments * Examples
- html * Arguments * Examples
- find * Arguments * Examples
- rules * Arguments * Examples
- range * Arguments * Examples
- removeHead * Examples
- query * Arguments * Examples
- getData * Arguments * Examples
- setData * Arguments * Examples
- encoding * Examples
- get * Arguments * Examples
- post * Arguments * Examples
- use * Arguments * Examples
-
Methods
Class QueryList
- Class name: QueryList
- Namespace: QL
Most QueryList methods support both static and dynamic (instance) calls, and can be chained in any order:
QueryList::html($html)->rules($rules)->range($range)->query();
QueryList::rules($rules)->html($html)->range($range)->query();
QueryList::range($range)->rules($rules)->html($html)->query();
mixed QL\QueryList::__construct()
QueryList constructor.
- Visibility: public
\QL\QueryList QL\QueryList::getInstance()
Get the QueryList singleton instance.
- Visibility: public
- This method is static.
$ql = QueryList::getInstance();
$data = $ql->get('http://www.baidu.com/s?wd=QueryList')->find('h3 a')->texts();
print_r($data->all());
null|\QL\Config QL\QueryList::config()
Get the Config instance
- Visibility: public
- This method is static.
QueryList::config()->use(My\MyPlugin::class,$arg1,$arg2,$arg3);
mixed QL\QueryList::destruct()
Destroys resources and frees the memory used by the phpQuery Document.
- Visibility: public
$html = file_get_contents('https://querylist.cc/');
$ql = QueryList::html($html);
$ql->destruct();
// Exception
$ql->find('a');
// Need to re-set html
$ql->html($html);
// ok
$ql->find('a');
// Collection of links to be crawled
$urls = [
'https://querylist.cc/1.html',
'https://querylist.cc/2.html',
'https://querylist.cc/3.html',
//...
];
// Set crawl rules
$ql = QueryList::rules([
'title' => ['h1','text'],
'link' => ['a','href']
]);
foreach($urls as $url){
// The crawl rules set above are applied to each link
$data = $ql->get($url)->query()->getData();
// Release the Document memory footprint
$ql->destruct();
// ...
}
\QL\QueryList QL\QueryList::bind(string $name, \Closure $provide)
Bind a custom method to the QueryList object
- Visibility: public
- $name string - The name used to invoke the bound method
- $provide Closure - The closure that is called when the method is invoked
- Example-1
$ql = QueryList::getInstance();
//Register a myHttp method into the QueryList object
$ql->bind('myHttp',function ($url){
$html = file_get_contents($url);
$this->setHtml($html);
return $this;
});
//The method can then be called by its registered name
$data = $ql->myHttp('https://toutiao.io')->find('h3 a')->texts();
print_r($data->all());
//Or use it like this
$data = $ql->rules([
'title' => ['h3 a','text'],
'link' => ['h3 a','href']
])->myHttp('https://toutiao.io')->query()->getData();
print_r($data->all());
- Example-2
//Extend a picture download function
//param: $path Local directory in which the images are saved
$ql = QueryList::bind('downloadImage',function ($path){
$data = $this->getData()->map(function ($item) use($path){
//Get the picture
$img = file_get_contents($item['image']);
$localPath = $path.'/'.md5($img).'.jpg';
//Save the image to the local path
file_put_contents($localPath,$img);
//Add a custom local path field to the data array
$item['local_path'] = $localPath;
return $item;
});
//Update the data property
$this->setData($data);
return $this;
});
$data = $ql->get('http://desk.zol.com.cn')->rules([
'image' => ['#newPicList img','src']
])->query()->downloadImage('img')->getData();
print_r($data->all());
- Example-3
QueryList::bind('myHttp',function(){
return new MyHttp($this);
});
- Example-4
$ql = QueryList::bind('myHttp',function(){
return new MyHttp($this);
});
$ql->bind('other',function(){
//Use the previous bind
$this->myHttp();
return $this;
});
QL\QueryList::getHtml()
- Visibility: public
$html = <<<STR
<div class="two">
<a href="http://querylist.cc">QueryList官网</a>
<img src="http://querylist.com/1.jpg" alt="这是图片">
<img src="http://querylist.com/2.jpg" alt="这是图片2">
</div>
STR;
$ql = QueryList::html($html);
$html = $ql->getHtml();
echo $html;
\QL\QueryList QL\Dom\Query::setHtml($html, null $charset)
- Visibility: public
- $html mixed
- $charset null
$ql = QueryList::setHtml($html);
$ql = QueryList::setHtml($html,'UTF-8');
QL\QueryList::html($html, null $charset)
Set html, see setHtml().
- Visibility: public
- $html mixed
- $charset null
$ql = QueryList::html($html);
$ql = QueryList::html($html,'UTF-8');
\QL\Dom\Elements QL\Dom\Query::find($selector)
Searches for all elements that match the specified expression.
- Visibility: public
- $selector mixed - A string containing a selector expression to match elements against.
$ql = QueryList::get('http://www.baidu.com/s?wd=QueryList');
//Get the text of the a tag under all h3 tags
$data = $ql->find('h3>a')->texts();
print_r($data->all());
//Get all the image addresses in the page
$data = $ql->find('img')->attrs('src');
print_r($data->all());
\QL\QueryList QL\Dom\Query::rules(array $rules)
Set crawl rule
$rules = [
'rule_name1' => ['selector','HTML attribute | text | html','Tag filter list','callback'],
'rule_name2' => ['selector','HTML attribute | text | html','Tag filter list','callback'],
// ...
]
- Visibility: public
- $rules array
$html=<<<STR
<div class="content">
<div>
<a href="https://querylist.cc/1.html">这是链接一</a>
<span>这是文字一</span>
</div>
<div>
<a href="https://querylist.cc/2.html">这是链接二</a>
<span>这是文字二</span>
</div>
<div>
<a href="https://querylist.cc/1.html">这是链接三</a>
<span>这是<b>文字</b>三</span>
</div>
</div>
STR;
//Crawl rules
$rules = [
//Gets the href attribute of a tag
'link' => ['a','href'],
//Gets the text of the a tag
'link_text' => ['a','text'],
//Gets the html text of the span tag
'txt' => ['span','html']
];
$ql = QueryList::html($html)->rules($rules)->query();
$data = $ql->getData();
print_r($data->all());
\QL\QueryList QL\Dom\Query::range($selector)
Set the slice area for crawl list
- Visibility: public
- $selector mixed
$html =<<<STR
<div id="main">
<ul>
<li>
<h1>这是标题1</h1>
</li>
<li>
<h1>这是标题2</h1>
<span>这是文字2</span>
</li>
</ul>
</div>
STR;
//Not recommended
$data = QueryList::html($html)->rules([
'title' => array('#main>ul>li>h1','text'),
'content' => array('#main>ul>li>span','text')
])->query()->getData();
print_r($data->all());
//Set the slice area
$data = QueryList::html($html)->rules([
'list' => array('h1','text'),
'content' => array('span','text')
])->range('#main>ul>li')->query()->getData();
print_r($data->all());
\QL\QueryList QL\Dom\Query::removeHead()
Remove the HTML head to try to fix garbled (mis-encoded) characters.
- Visibility: public
$html = file_get_contents('http://www.baidu.com/s?wd=QueryList');
$ql = QueryList::rules([
'title'=>array('h3','text'),
'link'=>array('h3>a','href')
]);
$data = $ql->setHtml($html)->removeHead()->query()->getData();
print_r($data);
\QL\QueryList QL\Dom\Query::query(\Closure|null $callback)
Execute the query rule
- Visibility: public
- $callback Closure|null
$ql = QueryList::get('http://www.baidu.com/s?wd=QueryList')->rules([
'title'=>array('h3','text'),
'link'=>array('h3>a','href')
]);
$data = $ql->query(function($item){
$item['title'] = $item['title'].' - other string...';
return $item;
})->getData();
print_r($data->all());
\Illuminate\Support\Collection|static QL\Dom\Query::getData(\Closure|null $callback)
Get crawl results
- Visibility: public
- $callback Closure|null
$html =<<<STR
<div class="xx">
<img data-src="/path/to/1.jpg" alt="">
</div>
<div class="xx">
<img data-src="/path/to/2.jpg" alt="">
</div>
<div class="xx">
<img data-src="/path/to/3.jpg" alt="">
</div>
STR;
$baseUrl = 'http://xxxx.com';
$data = QueryList::html($html)->rules(array(
'image' => array('.xx>img','data-src')
))->query()->getData(function($item) use($baseUrl){
return $baseUrl.$item['image'];
});
print_r($data->all());
mixed QL\Dom\Query::setData(\Illuminate\Support\Collection $data)
- Visibility: public
- $data Illuminate\Support\Collection
$ql = QueryList::getInstance();
$ql->setData(collect([
'txt' => 'Custom content'
]));
$data = $ql->getData();
print_r($data->all());
QL\QueryList::encoding(string $outputEncoding,string $inputEncoding = null)
Convert the encoding of the html to fix garbled (mis-encoded) characters.
- Visibility: public
$data = QueryList::get('https://top.etao.com')->encoding('UTF-8')->find('a')->texts();
print_r($data);
$data = QueryList::rules([
'txt' => ['a','text']
])->get('https://top.etao.com')->encoding('UTF-8','GB2312')->query()->getData();
print_r($data);
QL\QueryList::get($url,$args = null,$otherArgs = [])
HTTP GET request. See GuzzleHttp.
- Visibility: public
- $url string
- $args array|string url params
- $otherArgs array GuzzleHttp options
$ql = QueryList::get('http://httpbin.org/get?param1=testvalue');
echo $ql->getHtml();
$ql->get('http://httpbin.org/get',[
'param1' => 'testvalue',
'params2' => 'somevalue'
]);
$ql->get('http://httpbin.org/get','param1=testvalue&params2=somevalue');
echo $ql->getHtml();
$ql = QueryList::get('http://weibo.com',[],[
'headers' => [
//Fill in the cookie from the browser
'Cookie' => 'SINAGLOBAL=546064; wb_cmtLike_2112031=1; wvr=6;....'
]
]);
//echo $ql->getHtml();
echo $ql->find('title')->text();
$cookieJar = new \GuzzleHttp\Cookie\CookieJar();
$ql = QueryList::get('https://www.baidu.com/',[],[
'cookies' => $cookieJar
]);
$ql->get('http://httpbin.org/get',[
'param1' => 'testvalue',
'params2' => 'somevalue'
],[
'proxy' => 'http://222.141.11.17:8118',
//Set the timeout time in seconds
'timeout' => 30,
'headers' => [
'Referer' => 'https://querylist.cc/',
'User-Agent' => 'testing/1.0',
'Accept' => 'application/json',
'X-Foo' => ['Bar', 'Baz'],
'Cookie' => 'abc=111;xxx=222'
]
]);
echo $ql->getHtml();
QL\QueryList::post($url,$args = null,$otherArgs = [])
HTTP POST request. See GuzzleHttp.
- Visibility: public
- $url string
- $args array post data
- $otherArgs array GuzzleHttp options
$ql->post('http://httpbin.org/post',[
'param1' => 'testvalue',
'params2' => 'somevalue'
],[
'proxy' => 'http://222.141.11.17:8118',
'timeout' => 30,
'headers' => [
'Referer' => 'https://querylist.cc/',
'User-Agent' => 'testing/1.0',
'Accept' => 'application/json',
'X-Foo' => ['Bar', 'Baz'],
'Cookie' => 'abc=111;xxx=222'
]
]);
echo $ql->getHtml();
$ql = QueryList::post('http://xxxx.com/login',[
'username' => 'admin',
'password' => '123456'
])->get('http://xxx.com/admin');
$ql->get('http://xxx.com/admin/page');
//echo $ql->getHtml();
Login to GitHub:
$ql = QueryList::getInstance();
//Set the cookie manually
$jar = new \GuzzleHttp\Cookie\CookieJar();
//Get the login form
$form = $ql->get('https://github.com/login',[],[
'cookies' => $jar
])->find('form');
//Fill in the GitHub username and password
$form->find('input[name=login]')->val('your github username or email');
$form->find('input[name=password]')->val('your github password');
//Serialize the form data
$formData = $form->serializeArray();
$postData = [];
foreach ($formData as $item) {
$postData[$item['name']] = $item['value'];
}
//Submit the login form
$actionUrl = 'https://github.com'.$form->attr('action');
$ql->post($actionUrl,$postData,[
'cookies' => $jar
]);
//Check whether the login succeeded
// echo $ql->getHtml();
$userName = $ql->find('.header-nav-current-user>.css-truncate-target')->text();
if($userName)
{
echo 'Login Success! Welcome:'.$userName;
}else{
echo 'Login failed!';
}
QL\QueryList::use($plugins,...$opt)
Install the plugin.
- Visibility: public
- $plugins string|array Plug-in class name, or it can be an array of plug-in class names
- ...$opt mixed Additional parameters passed to the plug-in on installation.
$ql = QueryList::getInstance();
$ql->use(My\MyPlugin::class);
//Or, with installation parameters
$ql->use(My\MyPlugin::class,$arg1,$arg2,$arg3);
$ql = QueryList::getInstance();
$ql->use([
My\MyPlugin::class,
My\MyPlugin2::class,
Other\OtherPlugin::class
]);