-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfetchheise-fetcharticles
executable file
·65 lines (52 loc) · 2.13 KB
/
fetchheise-fetcharticles
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#!/usr/bin/env ruby
# in: URLs of article pages on stdin, one per line
# out: those articles, including subject, time, author, msgid, and body,
# as plain text, encoded in ISO latin1
require 'nokogiri'
class String
  # Re-indents a (possibly multi-line) string for embedding in plain-text
  # output.
  #
  # The first line is returned unchanged, since the caller has already
  # emitted whatever prefix belongs in front of it. Every following line is
  # prefixed with +indent+. A trailing newline in the receiver is preserved
  # as a final, indented, empty line.
  #
  # Single-line and empty strings are returned without any added newline;
  # appending one there would inject blank, unindented lines whenever a
  # short or whitespace-only text node is rendered.
  #
  # indent   - String placed in front of every line after the first.
  # wordwrap - accepted for interface compatibility; currently ignored
  #            (no wrapping is performed).
  #
  # Returns a new String.
  def to_text(indent, wordwrap)
    # limit -1 keeps trailing empty fields, so "a\n" still yields two lines;
    # "".split returns [] and joins back to "" without special-casing.
    split("\n", -1).join("\n#{indent}")
  end
end
class Nokogiri::XML::Node
  # Finds the first node selected by the CSS selector +elt_type+ whose
  # `class` attribute contains +css_class+ as one of its whitespace-separated
  # tokens.
  #
  # Returns the matching node, or nil when there is none.
  def first_node(elt_type, css_class)
    css(elt_type).find do |candidate|
      class_attr = candidate['class']
      class_attr && class_attr.split.include?(css_class)
    end
  end

  # Renders this node and its subtree as indented plain text.
  #
  # indent   - String prefix emitted after every line break produced here.
  # wordwrap - passed through unchanged to String#to_text and to children.
  #
  # Rendering rules:
  # * text nodes: stripped text, re-indented via String#to_text
  # * <br>:       a line break followed by the indent
  # * <img>:      the `alt` attribute, or the literal '<image>' if absent
  # * <blockquote>/<code>: children rendered with the indent extended by
  #   "> " / "| " respectively, wrapped in line breaks
  # * anything else (including <p>): children rendered with the same
  #   indent, wrapped in line breaks
  #
  # Returns a String.
  def to_bbtext(indent='', wordwrap=true)
    return text.strip.to_text(indent, wordwrap) if is_a?(Nokogiri::XML::Text)

    line_break = "\n#{indent}"
    tag = name
    return line_break if tag == 'br'
    return attr('alt') || '<image>' if tag == 'img'

    # Quoting elements deepen the indent for their children; all other
    # containers pass the current indent straight through.
    child_indent =
      case tag
      when 'blockquote' then "#{indent}> "
      when 'code'       then "#{indent}| "
      else indent
      end

    rendered = children.map { |child| child.to_bbtext(child_indent, wordwrap) }.join
    "#{line_break}#{rendered}#{line_break}"
  end
end
# For each line read from stdin, parse an article page and print its
# subject, timestamp, author, a msgid placeholder, the url and the body
# rendered as plain text.
#
# NOTE(review): the page is currently read from a fixed local HTML file
# rather than fetched from the url; this looks like a debugging stub.
STDIN.each_line do |line|
  # Drop the line terminator so the url can be reused verbatim; `puts` below
  # supplies the newline again.
  url = line.chomp

  # NOTE(review): if the fetch below is re-enabled, pass the url as a separate
  # argv element (IO.popen with an array) instead of interpolating it into a
  # shell command line, since the urls come from untrusted stdin.
  #page = Nokogiri::HTML(IO.popen("wget-withbrowsersettings -q -O- #{url}"))

  # Block form closes the file handle as soon as parsing is done instead of
  # leaking one descriptor per input line.
  page = File.open("fetchheise-fetcharticles.html") { |html| Nokogiri::HTML(html) }

  time_node = page.first_node('time', 'posting_timestamp')
  auth_node = page.first_node('span', 'full_user_string')
  subj_node = page.first_node('h1', 'thread_title')
  # The body is the 'bbcode_v1' div inside the first div with class 'post'.
  post_node = page.first_node('div', 'post').first_node('div', 'bbcode_v1')

  puts "subject: #{subj_node.text.strip}"
  puts "time: #{time_node.attr('title')}"
  puts "author: #{auth_node.text.strip}"
  puts "msgid: TODO"
  puts "url: #{url}"
  # NOTE(review): the 'i' indent looks like a debugging marker that makes
  # indentation visible -- confirm before changing.
  puts "\nbody:\n#{post_node.to_bbtext('i')}"
end
# TODO: maybe just let lynx/links render the body instead?
# http://www.heise.de/forum/heise-Developer/News-Kommentare/Vor-10-Jahren-Linus-Torvalds-baut-Git/Re-Schon-nach-wenigen-Tagen/posting-2265346/show/