-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdd_compare.rb
126 lines (115 loc) · 6.13 KB
/
dd_compare.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
require 'pathname'
require 'fileutils'
require 'ostruct'
require 'optparse'
require 'thwait'
require 'logger'
#following checks the postgres driver is installed and installs it if not. It assumes that there's only one Ruby installation on the machine
#If there are more, it might try to install the gem to the wrong ruby version which will cause the code to fail
#Note also that this script is only compatible with pg versions upto and including 0.20. Later version
#seems to have syntax changes which cause things to fail.
begin
require 'pg'
rescue LoadError
system('gem install pg')
require 'pg'
end
#this function loads each set of dd files to its respective temp table on the database
#includes the source directory name & source file name and line no.
def load_dds(dir_path, pg_connection, tablename)
Dir.glob(dir_path + '/*.dd') do |dd_file|
#i.e. for each dd file... Open it
fin = File.new(dd_file, "r")
i=0
#Get just the directory name, not the path to it.
dirname = File.dirname(fin).gsub(/^(.+)\//,"")
#Get rid of at least _some_ characters which are not allowed in postgres field headings
dirname=dirname.gsub(/[-\.]/,"_")
#Get just the file name, not the path to id
#NB File assumes unix path separators (='/', hence need for simple gsub, or use require 'pathname'...)
fn=dd_file.gsub(/^(.+)\//,"")
while (line = fin.gets)
i+=1
line=line.chomp
#This splits out the non-numerical part of the entry
entry=line.gsub(/(.+) [0-9\.(E\-)]+$/,'\1')
# This splits out the numerical entry if any (field prepopulated with zero)
entry_val=line.gsub(/.+ ([0-9\.(E\-)]+)$/,'\1')
#chomp removes newline characters which gets adds.
res = pg_connection.exec('insert into '+tablename+' select $1,$2,$3,$4,$5',[dirname,fn,i.to_s,entry,entry_val.to_f])
end
end
end
#The following needs to match your postgres configuration... database name can be any
conn = PGconn.connect( "host=localhost port=5432 dbname=postgres user=postgres")
#First argument to the script is the "old" dd files, 2nd is the "new" dd files. These must not have spaces in them
oldpath = ARGV[0].gsub("\\","/")
newpath = ARGV[1].gsub("\\","/")
#Create the temp tables on postgres
conn.exec('drop table IF exists regold')
conn.exec('create temp table regold(sourcef varchar, filename varchar,lineno varchar,entry varchar,entry_val numeric default 0.00)')
conn.exec('drop table IF exists regnew')
conn.exec('create temp table regnew(sourcef varchar, filename varchar,lineno varchar,entry varchar, entry_val numeric default 0.00)')
#Load the dds using the above function
load_dds(oldpath,conn, "regold")
load_dds(newpath,conn,"regnew")
#The 3rd argument to the script is optional. It's the directory path where you want the output CSVs to go. If it's
#not included, CSVs will be sent to directory with the "new" dds in it.
if ARGV.length==3
outdir=ARGV[2].gsub("\\","/")
else
outdir=newpath
end
#Generate the summary statistics about no. of entries the same, no. different.
s= "select * into temp basic_counts from
(
select a.filename filename, 'same' as entry, count(*) as counts from regold a inner join regnew b on a.entry=b.entry and round(a.entry_val,5)=round(b.entry_val,5) group by a.filename union
select a.filename, 'only_$1', count(*) from regold a left join regnew b on a.filename=b.filename and a.entry=b.entry and round(a.entry_val,5)=round(b.entry_val,5)
where b.filename is null group by a.filename union
select b.filename, 'only_$2', count(*) from regold a right join regnew b on a.filename=b.filename and a.entry=b.entry and round(a.entry_val,5)=round(b.entry_val,5)
where a.filename is null group by b.filename
) c;
select * into temp summ_ind from
(
select filename
,sum(case when entry='same' then counts else 0 end) same
,sum(case when entry='only_$1' then counts else 0 end) only_$1
,sum(case when entry='only_$2' then counts else 0 end) only_$2
,round(sum(case when entry='same' then counts else 0 end)::numeric/sum(counts)::numeric*100.0,1) same_percent
,round(sum(case when entry='only_$1' then counts else 0 end)::numeric/sum(counts)::numeric*100.0,1) only_$1_percent
,round(sum(case when entry='only_$2' then counts else 0 end)::numeric/sum(counts)::numeric*100.0,1) only_$2_percent
from basic_counts
group by filename
order by same_percent
) a;
insert into summ_ind
select 'all dds'
,sum(case when entry='same' then counts else 0 end) same
,sum(case when entry='only_$1' then counts else 0 end) only_$1
,sum(case when entry='only_$2' then counts else 0 end) only_$2
,round(sum(case when entry='same' then counts else 0 end)::numeric/sum(counts)::numeric*100.0,1) same_percent
,round(sum(case when entry='only_$1' then counts else 0 end)::numeric/sum(counts)::numeric*100.0,1) only_$1_percent
,round(sum(case when entry='only_$2' then counts else 0 end)::numeric/sum(counts)::numeric*100.0,1) only_$2_percent
from basic_counts;
"
#Bit of a hack to substitute the actual directory names into the SQL. Use of $1, $2 above just gives something unique to search and replace
oldpath=oldpath.gsub(/[-\.]/,"_")
s=s.gsub('$1',oldpath.gsub(/^(.+)\//,""))
newpath=newpath.gsub(/[-\.]/,"_")
s=s.gsub('$2',newpath.gsub(/^(.+)\//,""))
conn.exec(s)
#Actually run the comparison and generate the summary counts of same and different
s="
Copy (
select * from summ_ind
) TO '"+outdir+"/summaryCnts.csv' delimiter ',' CSV HEADER;"
conn.exec(s)
#Generate the list of differences between the files
s="
copy (
select a.sourcef,a.filename,a.lineno,a.entry,a.entry_val from regold a left join regnew b on a.filename=b.filename and a.entry=b.entry and round(a.entry_val,5)=round(b.entry_val,5) where b.entry is null
union
select b.sourcef,b.filename,b.lineno,b.entry,b.entry_val from regold a right join regnew b on a.filename=b.filename and a.entry=b.entry and round(a.entry_val,5)=round(b.entry_val,5) where a.entry is null
order by filename,lineno,sourcef
) TO '"+outdir+"/diffList.csv' delimiter ',' CSV HEADER;"
conn.exec(s)