-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathget_all_transform_stats.sh
executable file
·112 lines (96 loc) · 2.92 KB
/
get_all_transform_stats.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#!/bin/bash
# Get all transformation stats for BioPortal ontologies
#
# Usage:
#   ./get_all_transform_stats.sh
#   TX_PATH=/path/to/ontologies/ ./get_all_transform_stats.sh
#
# TX_PATH is the directory that is scanned for transform output. It defaults
# to ./transformed/ontologies/ and may be overridden from the environment.
TX_PATH="${TX_PATH:-./transformed/ontologies/}"
# Guarantee a trailing slash so the value can be used directly as a path prefix.
TX_PATH="${TX_PATH%/}/"
# KGX validation error types.
# Each entry is a string that is searched for in the validation logs; the
# report lists them in exactly this order.
all_errortypes=()

# Node / edge property problems.
all_errortypes+=(
  'MISSING_NODE_PROPERTY'
  'MISSING_EDGE_PROPERTY'
  'INVALID_NODE_PROPERTY'
  'INVALID_EDGE_PROPERTY'
  'INVALID_NODE_PROPERTY_VALUE_TYPE'
  'INVALID_NODE_PROPERTY_VALUE'
  'INVALID_EDGE_PROPERTY_VALUE_TYPE'
  'INVALID_EDGE_PROPERTY_VALUE'
)

# Category problems. The third entry is a message fragment rather than an
# upper-case code; it needs double quotes because it contains apostrophes.
all_errortypes+=(
  'MISSING_CATEGORY'
  'INVALID_CATEGORY'
  "Category 'OntologyClass' is a mixin in the Biolink Model"
)

# Predicate problems.
all_errortypes+=(
  'MISSING_EDGE_PREDICATE'
  'INVALID_EDGE_PREDICATE'
)

# Node identity problems.
all_errortypes+=(
  'MISSING_NODE_CURIE_PREFIX'
  'DUPLICATE_NODE'
  'MISSING_NODE'
)

# Remaining entries.
all_errortypes+=(
  'INVALID_EDGE_TRIPLE'
  'VALIDATION_SYSTEM_ERROR'
)
# Biolink node categories looked up in the *_nodes.tsv files.
# Every category appears exactly once: the list is iterated to print one report
# row per entry, so a repeated entry would produce a repeated row.
all_nodetypes=("biolink:NamedThing"
"biolink:OntologyClass"
"biolink:BiologicalProcess"
"biolink:Cell"
"biolink:CellularComponent"
"biolink:ChemicalSubstance"
"biolink:Disease"
"biolink:Event"
"biolink:ExposureEvent"
"biolink:Gene"
"biolink:MolecularActivity"
"biolink:OrganismalEntity"
"biolink:Pathway"
"biolink:PhenotypicFeature"
"biolink:Protein"
"biolink:SequenceFeature"
"biolink:SexQualifier"
"biolink:Source"
"biolink:TaxonomicRank"
"biolink:Unit"
"biolink:AnatomicalEntity"
)
# Biolink predicates looked up in the *_edges.tsv files, in report order.
declare -a all_edgetypes=(
  'biolink:related_to'
  'biolink:subclass_of'
  'biolink:part_of'
  'biolink:inverseOf'
  'biolink:subPropertyOf'
  'biolink:has_part'
  'biolink:has_participant'
  'biolink:has_unit'
  'biolink:preceded_by'
  'biolink:has_attribute'
  'biolink:positively_regulates'
  'biolink:negatively_regulates'
)
# Run

#######################################
# Print every immediate subdirectory that holds exactly one non-hidden entry,
# one path per line.
# Directory names are handled purely as data (NUL-delimited from find), so
# names containing spaces or quotes cannot break the command. The starting
# directory itself is never reported.
# Arguments: $1 - directory whose subdirectories are inspected
# Outputs:   matching directory paths on stdout
#######################################
list_single_entry_dirs() {
  local parent=$1
  local dir
  local -a entries
  while IFS= read -r -d '' dir; do
    entries=("$dir"/*)
    # An unmatched glob stays literal, so confirm the lone entry really exists
    # before counting the directory as a single-entry one.
    if (( ${#entries[@]} == 1 )) && [[ -e "${entries[0]}" || -L "${entries[0]}" ]]; then
      printf '%s\n' "$dir"
    fi
  done < <(find "$parent" -mindepth 1 -maxdepth 1 -type d -print0)
}

# Section 1: overall counts of what exists under TX_PATH.
# Each row is printed as "<label><TAB><value>".
echo "*** General ontology counts:"

# Non-hidden entries directly under TX_PATH (counted with find, not parsed ls).
printf "%10s\t" "All processed ontologies:"
find "$TX_PATH" -mindepth 1 -maxdepth 1 ! -name '.*' | wc -l

# Output files of each kind, searched recursively.
printf "%10s\t" "All successful JSON transforms:"
find "$TX_PATH" -name '*.json' | wc -l
printf "%10s\t" "All successful KGX TSV transforms:"
find "$TX_PATH" -name '*_edges.tsv' | wc -l
printf "%10s\t" "All transforms with KGX validation logs:"
find "$TX_PATH" -name 'kgx_validate_*.log' | wc -l
printf "%10s\t" "All transforms with ROBOT measure reports:"
find "$TX_PATH" -name 'robot.measure' | wc -l
printf "%10s\t" "All transforms with ROBOT validation reports:"
find "$TX_PATH" -name 'robot.report' | wc -l

# A directory holding exactly one entry is reported as a failed transform.
# NOTE(review): presumably the single entry is the only file the run left
# behind - confirm against the transform pipeline.
# The command substitution strips trailing newlines and printf adds exactly one
# back, so the row is newline-terminated even when no directory matches.
printf "%10s\t" "Ontologies with failed transforms:"
printf '%s\n' "$(list_single_entry_dirs "$TX_PATH")"
#######################################
# Count the files that contain a term as a literal, whole-word string.
# -F stops the term from being interpreted as a regular expression and -w
# stops a short term from also matching a longer one that merely starts with
# it (e.g. MISSING_NODE vs MISSING_NODE_PROPERTY, biolink:Cell vs
# biolink:CellularComponent). -l lists each matching file once, so the number
# of output lines is the number of files.
# Arguments: $1 - file-name glob limiting which files are searched
#            $2 - term to look for
#            $3 - directory searched recursively
# Outputs:   the number of matching files on stdout
#######################################
count_files_containing() {
  local name_glob=$1
  local term=$2
  local root=$3
  grep -r -l -F -w --include="$name_glob" -e "$term" -- "$root" | wc -l
}

# Every value printed below is a number of FILES containing the term at least
# once, not a number of occurrences.
echo "*** Transforms with at least one of the following errors:"
for errortype in "${all_errortypes[@]}"; do
  printf "%10s\t" "$errortype"
  count_files_containing '*.log' "$errortype" "$TX_PATH"
done

echo "*** Node type counts:"
for nodetype in "${all_nodetypes[@]}"; do
  printf "%10s\t" "$nodetype"
  count_files_containing '*_nodes.tsv' "$nodetype" "$TX_PATH"
done

echo "*** Edge type counts (i.e., predicate types):"
for edgetype in "${all_edgetypes[@]}"; do
  printf "%10s\t" "$edgetype"
  count_files_containing '*_edges.tsv' "$edgetype" "$TX_PATH"
done

echo "Complete."