Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add demo for selenium crawler with cookie #107

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ Annotation versions are named with `DemoAnnotatedxxxxxx.java`.
+ [DemoPostCrawler.java](src/main/java/cn/edu/hfut/dmic/webcollector/example/DemoPostCrawler.java)
+ [DemoRandomProxyCrawler.java](src/main/java/cn/edu/hfut/dmic/webcollector/example/DemoRandomProxyCrawler.java)
+ [AbuyunDynamicProxyRequester.java](src/main/java/cn/edu/hfut/dmic/webcollector/example/AbuyunDynamicProxyRequester.java)
+ [DemoSeleniumCrawler.java](src/main/java/cn/edu/hfut/dmic/webcollector/example/DemoSeleniumCrawler.java)
+ [DemoSeleniumCrawler.java](src/main/java/cn/edu/hfut/dmic/webcollector/example/DemoSeleniumCrawler.java) | [DemoSeleniumWithCookieCrawler.java](src/main/java/cn/edu/hfut/dmic/webcollector/example/DemoSeleniumWithCookieCrawler.java)

### NextFilter

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
package cn.edu.hfut.dmic.webcollector.example;

import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.plugin.net.OkHttpRequester;
import cn.edu.hfut.dmic.webcollector.plugin.rocks.BreadthCrawler;
import okhttp3.Request;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.htmlunit.HtmlUnitDriver;

import java.util.List;

/**
 * Demo that combines a customized HTTP requester (User-Agent and Cookie headers)
 * with a Selenium {@link HtmlUnitDriver} to read JavaScript-generated content.
 *
 * <p>Built by combining the ideas of DemoCookieCrawler and DemoSeleniumCrawler.
 *
 * @author smallyu
 */
public class DemoSeleniumWithCookieCrawler extends BreadthCrawler {

    /**
     * Custom requester plugin that sets the User-Agent and Cookie headers on
     * every request it builds.
     */
    public static class MyRequester extends OkHttpRequester {

        String userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36";
        String cookie = "name=abcdef";

        /**
         * Builds the request for the given datum; invoked before each request is sent.
         *
         * @param crawlDatum the crawl task the request is built for
         * @return the parent's request builder with the User-Agent and Cookie headers set
         */
        @Override
        public Request.Builder createRequestBuilder(CrawlDatum crawlDatum) {
            // This is OkHttp's Request.Builder; see the OkHttp documentation
            // for other ways to modify the request headers.
            System.out.println("request with cookie: " + cookie);
            return super.createRequestBuilder(crawlDatum)
                    .header("User-Agent", userAgent)
                    .header("Cookie", cookie);
        }
    }

    /**
     * Creates the crawler, installs the custom requester and registers the seed URL.
     *
     * @param crawlPath path passed to the parent crawler for its crawl data
     */
    public DemoSeleniumWithCookieCrawler(String crawlPath) {
        super(crawlPath, true);

        // Install the custom requester plugin.
        setRequester(new MyRequester());

        addSeed("https://www.sogou.com/web?query=%E6%B7%98%E5%AE%9D");
    }

    /**
     * Runs the parent's execution for the datum, then loads the same URL in a
     * JavaScript-enabled HtmlUnit driver and prints the text of every element
     * matching {@code h3.vrTitle a}.
     *
     * <p>BreadthCrawler extends AutoParseCrawler, and AutoParseCrawler is itself
     * an Executor, which is why this method can be overridden here.
     *
     * @param datum the crawl task being processed
     * @param next  collector for follow-up crawl tasks
     * @throws Exception if the parent execution or the driver fails
     */
    @Override
    public void execute(CrawlDatum datum, CrawlDatums next) throws Exception {
        super.execute(datum, next);

        // Sample code taken from DemoSeleniumCrawler.
        // NOTE(review): the driver performs its own fetch and is created without any
        // cookie/User-Agent configuration; the headers set in MyRequester are only
        // visibly applied to requests built by the requester. Confirm whether the
        // cookie should also be added to the driver.
        HtmlUnitDriver driver = new HtmlUnitDriver();
        try {
            driver.setJavascriptEnabled(true);
            driver.get(datum.url());

            List<WebElement> elementList = driver.findElementsByCssSelector("h3.vrTitle a");
            for (WebElement element : elementList) {
                System.out.println("title:" + element.getText());
            }
        } finally {
            // A new driver is created for every URL; release it even when loading
            // or querying the page throws, otherwise each call leaks a driver.
            driver.quit();
        }
    }

    /**
     * Intentionally empty: content is handled in {@link #execute}, so no
     * per-page visit logic is needed.
     *
     * @param page        the fetched page (unused)
     * @param crawlDatums collector for follow-up crawl tasks (unused)
     */
    @Override
    public void visit(Page page, CrawlDatums crawlDatums) {}

    /**
     * Starts the demo crawler with crawl path {@code "crawl"} and a depth argument of 2.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if the crawl fails
     */
    public static void main(String[] args) throws Exception {
        DemoSeleniumWithCookieCrawler crawler = new DemoSeleniumWithCookieCrawler("crawl");
        crawler.start(2);
    }
}