-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathexecuteJupyterNotebookReport.sh
executable file
·147 lines (122 loc) · 6.38 KB
/
executeJupyterNotebookReport.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
#!/usr/bin/env bash
# Executes the given Jupyter Notebook and puts all resulting files (ipynb, md, pdf) into an accordingly named directory within the "reports" directory.
#
# Command line options:
# --jupyterNotebook: Name of the Jupyter Notebook file including its file extension relative to the "jupyter" directory (required)
# --reportName: nameOfTheReportsDirectory (optional, default = kebab cased name of the Jupyter Notebook file)
# Requires executeQueryFunctions.sh, executeJupyterNotebook.sh, cleanupAfterReportGeneration.sh
# Overridable constants (their defaults are also defined in the sub scripts).
# A non-empty REPORTS_DIRECTORY that is already set takes precedence over the default "reports".
: "${REPORTS_DIRECTORY:=reports}"

# Fail on any error: "-e" (errexit) exits on the first failing command,
# "-o pipefail" lets a pipeline fail as soon as one of its commands fails.
set -eo pipefail
# Prints how to call this script (including an example) to stdout
# and then terminates the script with exit code 1.
usage() {
  printf 'Usage: %s --jupyterNotebook nameOfTheJupyterNotebook [--reportName nameOfTheReportsDirectory]\n' "${0}"
  printf 'Example: %s --jupyterNotebook ArtifactDependencies.ipynb\n' "${0}"
  exit 1
}
# Converts the given camel case file name to kebab case (with dashes in between).
# Leading directories and the last file extension are removed before the conversion,
# e.g. "jupyter/ArtifactDependencies.ipynb" -> "artifact-dependencies".
# The result is written to stdout.
#
# Parameters:
# - File name in camel case (optionally including directories and a file extension)
camel_to_kebab_case_file_name() {
  local fileName
  # Strip the directories first. Otherwise a dot within a directory name
  # (e.g. "notebooks.v2/MyNotebook") would be mistaken for the start of the file extension.
  # "--" protects against file names that start with a dash.
  fileName=$(basename -- "${1}")
  fileName="${fileName%.*}"

  # "-E" selects extended regular expressions in both GNU and BSD sed ("-r" is a GNU extension).
  # A dash is inserted wherever a lower case letter or digit is directly followed by an upper case letter.
  printf '%s\n' "${fileName}" \
    | sed -E 's/([a-z0-9])([A-Z])/\1-\2/g' \
    | tr '[:upper:]' '[:lower:]'
}
# Prints the value of the custom Jupyter Notebook metadata property
# "code_graph_analysis_pipeline_data_validation" to stdout.
# Prints nothing if the property can't be found. The function always succeeds (status 0).
#
# Parameters
# - Jupyter Notebook file name, e.g. ${JUPYTER_NOTEBOOK_DIRECTORY}/${jupyterNotebook}
get_data_validation_from_jupyter_metadata() {
  local notebookFile="${1}"
  # Matches the property including its quoted value: "name": "value"
  local propertyPattern='"code_graph_analysis_pipeline_data_validation":\s*"[^"]*"'

  # Splitting the match at the double quotes leaves the property value in the fourth field.
  # A failing pipeline (e.g. no match while pipefail is active) is deliberately not treated as an error.
  if ! grep -m1 -o "${propertyPattern}" "${notebookFile}" | cut -d '"' -f 4; then
    return 0
  fi
}
# Checks if the data required by the given Jupyter Notebook is available.
#
# Uses "get_data_validation_from_jupyter_metadata" to extract the name of the
# data validation Cypher query out of the Jupyter Notebook file given as first parameter.
# The equally named Cypher query file (extension ".cypher") is then looked up in the "Validation"
# directory within the Cypher directory given as second parameter.
# This Cypher query is then executed using "execute_cypher_http_number_of_lines_in_result"
# (not defined in this file; presumably provided by executeQueryFunctions.sh).
# If it reports at least one result, then the validation is considered successful.
#
# Parameters
# - Jupyter Notebook file name, e.g. ${JUPYTER_NOTEBOOK_DIRECTORY}/${jupyterNotebook}
# - Cypher query directory, e.g. ${CYPHER_DIR}
#
# Returns
# - 0 if the validation succeeded or if the Jupyter Notebook doesn't declare a data validation
# - 1 if the validation query didn't return any data
# Exits the whole script with code 1 if the declared validation Cypher query file doesn't exist.
validate_data_available() {
  local jupyterNotebookFile="${1}"
  local cypherDirectory="${2}"

  # Declared local so that the value doesn't leak into the global scope of the script.
  local dataValidation
  dataValidation=$(get_data_validation_from_jupyter_metadata "${jupyterNotebookFile}")
  if [[ -z "${dataValidation}" ]]; then
    echo "executeJupyterNotebookReport: Skipping data validation. Jupyter Notebook ${jupyterNotebookFile} has no 'code_graph_analysis_pipeline_data_validation' metadata property."
    return 0
  fi
  echo "executeJupyterNotebookReport: dataValidation=${dataValidation}"

  local dataValidationCypherQuery="${cypherDirectory}/Validation/${dataValidation}.cypher"
  if [[ ! -f "${dataValidationCypherQuery}" ]]; then
    # A declared but missing validation query is a configuration error and therefore ends the script.
    echo "executeJupyterNotebookReport: Error: Validation Cypher Query file ${dataValidationCypherQuery} doesn't exist."
    exit 1
  fi

  echo "executeJupyterNotebookReport: Validating data using Cypher query ${dataValidationCypherQuery} ..."
  local dataValidationResult
  dataValidationResult=$(execute_cypher_http_number_of_lines_in_result "${dataValidationCypherQuery}")
  if [[ "${dataValidationResult}" -ge 1 ]]; then
    echo "executeJupyterNotebookReport: Validation succeeded."
    return 0
  fi

  echo "executeJupyterNotebookReport: Validation failed. No data from query ${dataValidationCypherQuery}."
  return 1
}
# Default values
reportName=""
jupyterNotebook=""

# Parses the given command line arguments into the global variables
# "jupyterNotebook" (--jupyterNotebook) and "reportName" (--reportName).
# For an unknown option or an option without a value, an error message is printed
# and "usage" is called, which ends the script.
#
# Parameters
# - All command line arguments of this script ("$@")
parse_command_line_arguments() {
  local commandLineOption
  while [[ $# -gt 0 ]]; do
    commandLineOption="${1}"
    case "${commandLineOption}" in
      --jupyterNotebook)
        # Without this check, a missing value would let the second "shift" fail,
        # which (with errexit) ends the script without any hint about the cause.
        if [[ $# -lt 2 ]]; then
          echo "executeJupyterNotebookReport: Error: Missing value for option: ${commandLineOption}"
          usage
        fi
        jupyterNotebook="${2}"
        shift
        ;;
      --reportName)
        if [[ $# -lt 2 ]]; then
          echo "executeJupyterNotebookReport: Error: Missing value for option: ${commandLineOption}"
          usage
        fi
        reportName="${2}"
        shift
        ;;
      *)
        echo "executeJupyterNotebookReport: Error: Unknown option: ${commandLineOption}"
        usage
        ;;
    esac
    shift
  done
}

# Parse command line arguments
parse_command_line_arguments "$@"
# Drop the parsed arguments so that scripts sourced later on without arguments
# don't inherit them as their positional parameters.
set --
# The Jupyter Notebook is required. Without it, report the problem and show how to call this script.
# "usage" prints the usage and exits the script with code 1.
# (The former 'echo "${USAGE}"' referred to a variable that is never defined and only printed an empty line.)
if [[ -z "${jupyterNotebook}" ]]; then
  echo "executeJupyterNotebookReport: Error: Missing required option --jupyterNotebook"
  usage
fi
# Without an explicitly given report name, derive it from the
# file name of the Jupyter Notebook by converting it to kebab case.
if [[ "${reportName}" == "" ]]; then
  reportName="$(camel_to_kebab_case_file_name "${jupyterNotebook}")"
  printf '%s\n' "executeJupyterNotebookReport: reportName defaults to ${reportName}"
fi
## Resolve the directories this script works with.
## Every one of them can be overridden by setting the equally named variable beforehand.

# SCRIPTS_DIR: Repository directory containing the shell scripts. Defaults to the directory of this script.
# Even if $BASH_SOURCE is made for Bourne-like shells it is also supported by others and is therefore preferred here.
# CDPATH reduces the scope of the cd command to potentially prevent unintended directory changes.
# This way non-standard tools like readlink aren't needed.
if [[ -z "${SCRIPTS_DIR}" ]]; then
  SCRIPTS_DIR=$( CDPATH=. cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P )
fi
echo "executeJupyterNotebookReport: SCRIPTS_DIR=${SCRIPTS_DIR}"

# REPORTS_SCRIPT_DIR: Repository directory containing the report scripts.
# Defaults to the "reports" directory within the scripts directory.
: "${REPORTS_SCRIPT_DIR:=${SCRIPTS_DIR}/reports}"
echo "executeJupyterNotebookReport: REPORTS_SCRIPT_DIR=${REPORTS_SCRIPT_DIR}"

# JUPYTER_NOTEBOOK_DIRECTORY: Repository directory containing the Jupyter Notebooks.
# Defaults to the "jupyter" directory next to the scripts directory (one directory up).
: "${JUPYTER_NOTEBOOK_DIRECTORY:=${SCRIPTS_DIR}/../jupyter}"
echo "executeJupyterNotebookReport: JUPYTER_NOTEBOOK_DIRECTORY=${JUPYTER_NOTEBOOK_DIRECTORY}"

# CYPHER_DIR: Repository directory containing the Cypher queries.
# Defaults to the "cypher" directory next to the scripts directory (one directory up).
: "${CYPHER_DIR:=${SCRIPTS_DIR}/../cypher}"
echo "executeJupyterNotebookReport CYPHER_DIR=${CYPHER_DIR}"
# Load the functions that execute Cypher queries from within a given file
# (presumably including "execute_cypher_http_number_of_lines_in_result", which is used for the data validation).
source "${SCRIPTS_DIR}/executeQueryFunctions.sh"

# Create the report directory. "-p" also creates missing parent directories and tolerates an existing directory.
FULL_REPORT_DIRECTORY="${REPORTS_DIRECTORY}/${reportName}"
mkdir -p "${FULL_REPORT_DIRECTORY}"

if ! validate_data_available "${JUPYTER_NOTEBOOK_DIRECTORY}/${jupyterNotebook}" "${CYPHER_DIR}"; then
  echo "executeJupyterNotebookReport: Skipping Jupyter Notebook ${jupyterNotebook} because of missing data."
else
  # Execute and convert the given Jupyter Notebook within the report directory.
  # The subshell keeps the change of the working directory local to the notebook execution.
  (
    cd "${FULL_REPORT_DIRECTORY}" \
      && exec "${SCRIPTS_DIR}/executeJupyterNotebook.sh" "${JUPYTER_NOTEBOOK_DIRECTORY}/${jupyterNotebook}"
  )
fi

# Clean-up after report generation. Empty reports will be deleted.
source "${SCRIPTS_DIR}/cleanupAfterReportGeneration.sh" "${FULL_REPORT_DIRECTORY}"