-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsearch-engine.rb
72 lines (63 loc) · 1.83 KB
/
search-engine.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
#coding:utf-8
require 'json'
require_relative 'scrypton'
# Search-engine queries: keyword search helpers.
module Scrypton
  module_function

  # Collects result links from the search-results page currently open in
  # @browser, moving to the next page after each scan.
  #
  # On page p (0-based) the result containers with ids p*10+1 .. p*10+10 are
  # probed; ids that cannot be found are skipped. Scanning stops after +all+
  # pages, or earlier when the link to the following page cannot be found.
  # NOTE(review): @browser is assigned outside this file section; presumably a
  # Watir browser created by the rest of the Scrypton module.
  #
  # @param all [Integer] maximum number of result pages to scan
  # @return [Array] the href of every result link found, in scan order
  def scjg_articles(all = 100)
    hrefs = []
    all.times do |page|
      1.upto(10) do |slot|
        begin
          result_id = (page * 10 + slot).to_s
          container = @browser.div(class: 'result c-container xpath-log new-pmd', id: result_id)
          hrefs << container.a(tabindex: '0').href
        rescue Watir::Exception::UnknownObjectException
          # No result with this id on the current page; try the next id.
        end
      end

      begin
        # Pager links are labelled with 1-based page numbers, so the page after
        # 0-based +page+ is labelled page + 2.
        following = (page + 2).to_s
        @browser.div(class: 'page-inner_2jZi2').a(text: following).click
      rescue Watir::Exception::UnknownObjectException
        # No link to the following page: stop paging.
        break
      end
    end
    hrefs
  end

  # Opens +article+ in @browser and builds a Markdown digest entry for it.
  #
  # A read timeout while navigating is tolerated (followed by a 3 second
  # pause); a further 1 second pause always precedes reading the page. When
  # the 'r-rest' div cannot be found the digest body is left empty.
  #
  # @param article [String] URL to visit
  # @return [String] "[title](url)" followed by a blank line and the div text
  def scjg_refine(article)
    begin
      @browser.goto(article)
    rescue Net::ReadTimeout
      sleep(3)
    end
    sleep(1)

    body =
      begin
        @browser.div(class: 'r-rest').text
      rescue Watir::Exception::UnknownObjectException
        ''
      end

    heading = "[#{@browser.title}](#{@browser.url})"
    "#{heading}\n\n#{body}"
  end
end
# Script entry point: run the keyword search embedded after __END__, save the
# result links as JSON, then visit every link and write a Markdown digest.
# NOTE(review): quick_timeout, browser, query_baidu and close are defined
# outside this file section; presumably they configure, start, drive and shut
# down the shared browser — confirm against scrypton.rb.
Scrypton.quick_timeout
Scrypton.browser
begin
  Dir.mkdir('doc') unless Dir.exist?('doc')

  # Query: the keyword is the DATA section with all line breaks removed.
  keyword = DATA.read.delete("\r\n")
  Scrypton.query_baidu(keyword)
  articles = Scrypton.scjg_articles
  File.write('doc/scjg-articles.json', JSON.pretty_generate(articles))

  # Digest: re-read the saved link list and build one digest entry per link.
  articles = JSON.parse(File.read('doc/scjg-articles.json'))
  digests = articles.map { |article| Scrypton.scjg_refine(article) }
  File.write('doc/digest.md', digests.join("\n\n---\n\n"))
ensure
  # Always release the browser, even when a step above raises.
  Scrypton.close
end
__END__
KEYWORD site:DOMAIN