-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlarge_xml_to_conllu3.rb
69 lines (64 loc) · 1.93 KB
/
large_xml_to_conllu3.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# Command-line arguments:
#   ARGV[0] - base name of the input; the script reads "<ARGV[0]>.xml".
#   ARGV[1] - the type ("post" or "sentence", per the message below); it is
#             handed on unchanged to the converter script.
# A missing argument is a usage error, so terminate with a non-zero status
# (Kernel#abort prints the message to STDERR and exits with status 1).
forumname = ARGV[0]
abort "You didn't specify the forum name (the input file name without .xml)!" if forumname.nil?
abort "You didn't specify the type: post or sentence!" if ARGV[1].nil?
# Split "<ARGV[0]>.xml" into numbered chunk files "<ARGV[0]><n>.xml" and run
# xml_to_conllu3.rb on each chunk, deleting the chunk afterwards.
#
# A chunk is ended once more than MAX_SENTENCES_PER_CHUNK "</sentence>" lines
# have been seen AND no paragraph/text/thread element is currently open, so a
# split never lands inside one of those elements. Elements that are still open
# at the split point are closed in the finished chunk, and the opening lines of
# the outer elements (corpus, forum) are repeated at the top of the next chunk.
#
# NOTE: tags are recognised line by line. An opening tag counts only when the
# first whitespace-separated token of the line is exactly "<unit", i.e. the tag
# carries attributes; a bare "<unit>" line is not tracked. A closing tag must
# be alone on its line.

# Structural elements that are tracked, innermost first (this is also the
# order in which closing tags are written at a split point).
UNITS = %w[paragraph text thread forum corpus].freeze
# Elements that must all be closed before a chunk may end.
INNER_UNITS = %w[paragraph text thread].freeze
# Elements whose opening line is repeated at the start of every new chunk,
# outermost first.
OUTER_UNITS = %w[corpus forum].freeze
# A chunk is ended at the first safe point after this many sentences.
MAX_SENTENCES_PER_CHUNK = 100_000

# Runs the converter on one chunk file and then removes the chunk.
# The arguments are passed as separate words (no shell involved), so file
# names containing spaces or shell metacharacters are handled correctly.
def convert_chunk(chunk_path, type)
  succeeded = system("ruby", "xml_to_conllu3.rb", chunk_path, type)
  warn "xml_to_conllu3.rb failed for #{chunk_path}" unless succeeded
  File.delete(chunk_path)
end

# Number of the chunk currently being written; after the loop it holds the
# total number of chunks.
filecounter = 1

# The block form guarantees that the input file is closed when we are done.
File.open("#{ARGV[0]}.xml", "r:utf-8") do |input|
  # unit name => the raw opening line of the currently open element
  open_tags = {}
  sentence_count = 0
  chunk_path = "#{ARGV[0]}#{filecounter}.xml"
  chunk = File.open(chunk_path, "w:utf-8")

  input.each_line do |line|
    stripped = line.strip
    first_token = stripped.split(" ")[0]

    UNITS.each do |unit|
      if first_token == "<#{unit}"
        open_tags[unit] = line
      elsif stripped == "</#{unit}>"
        open_tags.delete(unit)
      end
    end

    sentence_count += 1 if stripped == "</sentence>"
    chunk.puts stripped

    next unless sentence_count > MAX_SENTENCES_PER_CHUNK &&
                INNER_UNITS.none? { |unit| open_tags.key?(unit) }

    # Write closing tags for whatever is still open so the chunk ends with
    # matching end tags.
    UNITS.each { |unit| chunk.puts "</#{unit}>" if open_tags.key?(unit) }
    chunk.close
    convert_chunk(chunk_path, ARGV[1])

    # Start the next chunk. open_tags is deliberately NOT reset: the outer
    # elements are still open in the input and must be re-opened here.
    sentence_count = 0
    filecounter += 1
    chunk_path = "#{ARGV[0]}#{filecounter}.xml"
    chunk = File.open(chunk_path, "w:utf-8")
    OUTER_UNITS.each { |unit| chunk.puts open_tags[unit] if open_tags.key?(unit) }
  end

  # Convert whatever was written after the last split point.
  chunk.close
  convert_chunk(chunk_path, ARGV[1])
end
#for i in 1..filecounter ###
# STDERR.puts i
# system("ruby xml_to_conllu_sm.rb #{ARGV[0]}#{i}.xml #{ARGV[1]}")
# File.delete("#{ARGV[0]}#{i}.xml")
#end
# Concatenate the per-chunk files "<ARGV[0]><i>.conllu" (i = 1..filecounter;
# presumably written by the converter runs earlier in this script) into one
# file "<ARGV[0]>_<ARGV[1]>.conllu", deleting each part once it has been copied.
# The block form of File.open closes every handle even if copying fails midway.
STDERR.puts "Creating large conllu..."
File.open("#{ARGV[0]}_#{ARGV[1]}.conllu", "w:utf-8") do |outfile|
  (1..filecounter).each do |i|
    # Progress indicator: the number of the part being appended.
    STDERR.puts i
    part_path = "#{ARGV[0]}#{i}.conllu"
    File.open(part_path, "r:utf-8") do |infile|
      infile.each_line { |line| outfile.puts line }
    end
    File.delete(part_path)
  end
end