#!/usr/bin/env ruby
# encoding: utf-8
##################################################################################
# Google Scholar search scraper: lists main URLs given a phrase query and year #
# #
# Example: ./google-scholar-scraper.rb \ #
# "U.S. Government Work Not Protected by U.S. Copyright" 1975 #
# #
# CC-0, ArchiveTeam/WikiTeam, 2018 #
# #
##################################################################################
require 'mechanize'
require 'uri'
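# Guard added for illustration (not in the original script): without both
# arguments the phrase query would be an empty quoted string.
abort("Usage: #{$0} \"phrase\" year") if ARGV.length < 2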
urls = []
a = Mechanize.new { |agent|
  # Some user agent aliases (e.g. Konqueror) get HTML where the //h3/a XPath matches nothing
  agent.user_agent_alias = 'Linux Firefox'
}
prng = Random.new
# Open the Scholar homepage and fill in the advanced search form:
# exact phrase (as_q) restricted to a single year (as_ylo = as_yhi)
search_result = a.get('https://scholar.google.it')
search_form = search_result.forms.first
search_form.as_q = '"%s"' % ARGV[0]
search_form.as_ylo = ARGV[1]
search_form.as_yhi = ARGV[1]
search_result = a.submit(search_form, search_form.buttons.first)
# Keep clicking "Next" until no further page is offered or the server blocks us.
loop do
  search_result.search("//h3/a").each do |link|
    # Result URLs live in h3 headers and may be wrapped in google.com/url?q=
    # redirects (see the sketch at the end of this file)
    target = link['href']
    unless target.nil?
      # Validate each href; skip this link (not the whole page) if it is malformed
      begin
        uri = URI.parse(target)
        # puts "Found URI: %s" % target
      rescue URI::InvalidURIError
        puts "Skipped invalid URI: %s" % target
        next
      end
      unless urls.include?(target)
        urls << target
        print '.'
      end
    end
  end
  # Random pause between result pages to reduce the chance of being rate-limited
  sleep(prng.rand(10..180.0))
  begin
    # Click the "Next" page link; the text is localized, so adapt it to your
    # Scholar interface language ("Avanti" is Italian)
    search_result = search_result.link_with(:text => 'Avanti').click
  rescue NoMethodError
    begin
      # Fall back to the link that repeats the search including previously
      # omitted results; in Italian, the leading space in the text is required
      search_result = search_result.link_with(:text => ' ripetere la ricerca includendo i risultati omessi').click
    rescue NoMethodError
      break
    end
  rescue Mechanize::ResponseCodeError => e
    # Scholar answers 503 once it decides we are a bot; Mechanize raises
    # Mechanize::ResponseCodeError (rescuing Net::HTTPServiceUnavailable never matches)
    puts "We got a #{e.response_code}, party is over"
    break
  end
  break if search_result.nil?
end
# Append all URLs found, one per line, to the output file (created if missing)
File.open("GS-PD-Gov.txt", "a") do |output|
  urls.each do |url|
    output.puts url
  end
end
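
# The loop above stores hrefs as-is, even when Scholar wraps them in a
# google.com/url?q= redirect (see the comment inside the loop). Below is a
# minimal, hypothetical sketch of how such redirects could be unwrapped; the
# helper name `unwrap_redirect` is an illustration, not part of the original
# script, and nothing above calls it. It reuses the `uri` require at the top.
def unwrap_redirect(href)
  uri = URI.parse(href)
  # Only unwrap hrefs that point at the Google redirect endpoint
  return href unless uri.host.to_s.end_with?('google.com') && uri.path == '/url'
  # The real target sits in the "q" query parameter; fall back to the raw href
  params = URI.decode_www_form(uri.query.to_s).to_h
  params.fetch('q', href)
rescue URI::InvalidURIError
  href
end

# Example: unwrap_redirect('https://www.google.com/url?q=https://example.org/paper.pdf')
# would return 'https://example.org/paper.pdf'; any other href passes through unchanged.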