-
Notifications
You must be signed in to change notification settings - Fork 14
/
update_urls.rb
140 lines (108 loc) · 3.48 KB
/
update_urls.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
# encoding: UTF-8
# Script that goes through every PDF in the BibDesk PDF folder, checks whether it has a Finder download URL,
# checks whether that URL is open access (OA), and if so, adds the OA-URL and URL fields to the BibTeX entry.
$:.push(File.dirname($0))
require 'utility-functions'
require 'appscript'
require 'cgi'
require 'net/http'
require 'open-uri'
require 'open3'
# this script goes through all the BibDesk PDFs, extracts download URLs, and adds these fields to BibDesk
BibDesk = Appscript.app('BibDesk')
# Reports whether +url+ looks like a web address, i.e. is a String that
# contains "http" somewhere.
#
# url - the candidate value; may be nil or a non-String (for example when a
#       field lookup produced nothing), in which case the answer is false
#       instead of a NoMethodError.
#
# Returns true or false.
def is_url(url)
  url.is_a?(String) && url.include?("http")
end
# Stores +url+ in both the "Url" and "OA-URL" fields of +pub+ and prints a
# starred banner announcing the open-access hit.
#
# pub - object exposing fields[name].value.set(value)
# url - the value written to both fields
def update_url(pub, url)
  ["Url", "OA-URL"].each do |field_name|
    pub.fields[field_name].value.set(url)
  end

  banner = "*" * 78
  puts banner, "OA! #{url}", banner
end
# Logging wrapper around checkOArun: announces the URL being checked, prints
# "True" or "False" for the outcome, and hands the outcome back.
#
# Returns false without printing anything when is_url rejects +url+;
# otherwise returns whatever checkOArun returned.
def checkOA(url)
  if is_url(url)
    puts "Checking OA: #{url}"
    verdict = checkOArun(url)
    label = verdict ? "True" : "False"
    puts label
    verdict
  else
    false
  end
end
# Decides whether +origurl+ serves a PDF directly.
#
# Checks, in order:
#   1. a whitelist of host/path patterns that are accepted without any
#      network access (currently arxiv.org paths ending in .pdf);
#   2. the Content-Type header returned by `curl -I`;
#   3. the first 100 bytes of the body, classified by the `file` utility.
#
# origurl - URL string; it is not modified. A scheme written with a single
#           slash ("http:/host/...") is repaired on a private copy.
#
# Returns true when any check identifies a PDF, false otherwise (including
# when the URL has no host part).
def checkOArun(origurl)
  # Split "host/path..." after dropping the scheme.
  bare = origurl.gsub(/https?\:\/\/?/, '')
  host, *segments = bare.split("/")
  return false if host.nil? || host.empty?
  path = "/" + segments.join("/")

  # list of URLs that don't need to be downloaded to check, first is URI, second is path
  whitelist = [
    [/arxiv\.org/, /\.pdf$/]
  ]
  return true if whitelist.any? { |host_re, path_re| host =~ host_re && path =~ path_re }

  # Work on a copy so the caller's string (possibly frozen) is left alone.
  fetch_url = origurl.index("//") ? origurl : origurl.sub(':/', '://')

  # faking agent, to avoid no-robots
  chrome_agent = 'Mozilla/5.0 (X11; CrOS i686 1660.57.0) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.46 Safari/535.19'
  # Arguments are passed as an array so no shell ever parses the URL.
  curl = ['curl', '--connect-timeout', '5', '-A', chrome_agent]

  # grab header using curl
  headers, = Open3.capture2(*curl, '-I', '--url', fetch_url, :binmode => true)
  possible_ctypes = [
    "application/pdf",
    "application/x-pdf",
    "application/vnd.pdf",
    "application/text.pdf"]
  # Header names are case-insensitive (curl prints them lowercase for HTTP/2).
  return true if possible_ctypes.any? do |ctype|
    headers =~ /^content-type:[ \t]*#{Regexp.escape(ctype)}/i
  end

  # Header was inconclusive: fetch the first bytes and let `file` sniff them.
  sample, = Open3.capture2(*curl, '-r', '0-99', '-s', '--url', fetch_url, :binmode => true)
  # Cap the input in case the server ignored the range request.
  description, = Open3.capture2('file', '-', :stdin_data => sample[0, 100], :binmode => true)
  description.include?("PDF document")
end
# Entry point when run as a script: walks every PDF in PDF_path, looks up the
# BibDesk publication whose search matches the file's base name, and records
# an open-access URL on it when one can be found, first from the entry's
# existing URL field and then from the Finder "where from" metadata.
#
# Outcomes are echoed to stdout and appended to four log files in the current
# directory: dontmatch.txt, noturl.txt, notoa.txt and oa.txt.
if __FILE__ == $0
  started = Time.now
  puts "Updating URLs on all files in #{PDF_path}"

  # logfiles
  dontmatch = File.open('dontmatch.txt', 'w')
  nourl = File.open('noturl.txt', 'w')
  notoa = File.open('notoa.txt', 'w')
  oa = File.open('oa.txt', 'w')

  begin
    # iterate through Bibdesk PDF directory
    Dir.foreach(PDF_path) do |f|
      # Only *.pdf files (any case); this also skips '.', '..' and short names.
      extension = File.extname(f)
      next unless extension.downcase == '.pdf'

      docu = File.basename(f, extension)
      p docu

      pub = BibDesk.document.search({:for => docu})

      # PDF name doesn't match any citekeys
      unless pub.is_a?(Array) && !pub.empty?
        puts "#{docu}: Doesn't match citekey"
        dontmatch << docu << "\n"
        next
      end
      entry = pub[0]

      # already has OA pub
      if entry.fields["OA-Url"].value.get.size > 0
        puts "#{docu}: Already OA"
        next
      end

      # if already has URL field, check if OA
      # NOTE(review): `try` comes from utility-functions; presumably it yields
      # nil when the block raises, so only String results are trusted here.
      url = try { entry.fields["URL"].value.get }
      if url.is_a?(String) && checkOA(url)
        update_url(entry, url)
        puts "#{docu}: URL OA"
        oa << docu << "\n"
        next
      end

      # try to get d/l URL from Finder metadata; the real filename is passed
      # as a separate argument so quotes or '$' in it cannot reach a shell.
      mdls_out, = Open3.capture2('mdls', '-name', 'kMDItemWhereFroms', File.join(PDF_path, f))
      # The first double-quoted token in the mdls output is taken as the URL.
      finder_url = try { mdls_out.split('"')[1] }
      # Assign first, test second: the two steps are kept apart so the value
      # checked and stored is the Finder URL itself, not a boolean.
      if finder_url.is_a?(String) && checkOA(finder_url)
        update_url(entry, finder_url)
        puts "#{docu}: Finder OA"
        oa << docu << "\n"
        next
      end

      if finder_url.is_a?(String)
        puts "#{docu}: No OA"
        notoa << docu << "\n"
      else
        puts "#{docu}: No file"
        nourl << docu << "\n"
      end
    end
  ensure
    # Flush and release the logs even if an iteration raised.
    [dontmatch, nourl, notoa, oa].each { |log| log.close }
  end

  puts Time.now - started
end