
Commit

Merge pull request #200 from chhoumann/kb-288-Initial-xp-design
[KB-288] Experiment design for initial experiment
chhoumann authored Jun 11, 2024
2 parents a431425 + bd51a55 commit a6cf789
Showing 9 changed files with 267 additions and 83 deletions.
1 change: 1 addition & 0 deletions report_thesis/src/_preamble.tex
@@ -17,6 +17,7 @@
\usepackage{algorithm}
\usepackage{algpseudocode}
\usepackage{enumitem}
\usepackage{placeins}
\setmainfont{texgyrepagella}[
Extension = .otf,
UprightFont = *-regular,
1 change: 1 addition & 0 deletions report_thesis/src/index.tex
@@ -17,6 +17,7 @@ \subsubsection*{Acknowledgements:}
\input{sections/proposed_approach/proposed_approach.tex}
\input{sections/baseline_replica.tex}
\input{sections/methodology.tex}
\input{sections/experiment_design/index.tex}
\input{sections/results/results.tex}

% \subsection{Data Analysis}
162 changes: 161 additions & 1 deletion report_thesis/src/sections/appendix.tex
@@ -43,4 +43,164 @@ \subsection{Cross-Validation Plots for Major Oxides}\label{subsec:cv_plots}
\caption{Distribution of \ce{\oxide} concentrations across cross-validation folds, training set, test set, and the entire dataset. The mean and standard deviation statistics for each partition are indicated in the figure.}
\label{fig:distribution_plot_\oxide}
\end{figure*}
}
}

\subsection{Initial Experiment: Model Hyperparameters}\label{subsec:initial_experiment_hyperparameters}
\begin{table*}[h]
\centering
\begin{tabular}{@{}llp{0.5\textwidth}@{}}
\toprule
\textbf{Model} & \textbf{Hyperparameter} & \textbf{Value} \\
\midrule
\multirow{3}{*}{\gls{pls}}
& \texttt{n\_components} & 34 \\
& \texttt{scale} & True \\
& \texttt{max\_iter} & 500 \\
\midrule
\multirow{6}{*}{\gls{svr}}
& \texttt{kernel} & poly \\
& \texttt{C} & 100 \\
& \texttt{epsilon} & 0.1 \\
& \texttt{gamma} & scale \\
& \texttt{degree} & 2 \\
& \texttt{coef0} & 1.0 \\
\midrule
\multirow{3}{*}{Ridge Regression}
& \texttt{alphas} & \{$10^{-4}$, $10^{-3}$, $10^{-2}$, $10^{-1}$, 1, 10, $10^2$, $10^3$\} \\
& \texttt{max\_iter} & 1000 \\
& \texttt{tol} & $10^{-4}$ \\
\midrule
\multirow{3}{*}{\gls{lasso}}
& \texttt{alphas} & \{$10^{-4}$, $10^{-3}$, $10^{-2}$, $10^{-1}$, 1, 10, $10^2$, $10^3$\} \\
& \texttt{max\_iter} & 1000 \\
& \texttt{tol} & $10^{-4}$ \\
\midrule
\multirow{4}{*}{\gls{enet}}
& \texttt{alphas} & \{$10^{-4}$, $10^{-3}$, $10^{-2}$, $10^{-1}$, 1, 10, $10^2$, $10^3$\} \\
& \texttt{l1\_ratio} & \{0.1, 0.5, 0.7, 0.9, 1.0\} \\
& \texttt{max\_iter} & 1000 \\
& \texttt{tol} & $10^{-4}$ \\
\midrule
\multirow{6}{*}{\gls{rf}}
& \texttt{n\_estimators} & 100 \\
& \texttt{max\_depth} & 10 \\
& \texttt{min\_samples\_split} & 2 \\
& \texttt{min\_samples\_leaf} & 1 \\
& \texttt{max\_features} & sqrt \\
& \texttt{random\_state} & 42 \\
\midrule
\multirow{5}{*}{\gls{etr}}
& \texttt{n\_estimators} & 100 \\
& \texttt{max\_depth} & 10 \\
& \texttt{min\_samples\_split} & 2 \\
& \texttt{min\_samples\_leaf} & 1 \\
& \texttt{random\_state} & 42 \\
\bottomrule
\end{tabular}
\caption{Explicitly set hyperparameters for the \gls{pls}, \gls{svr}, Ridge, \gls{lasso}, \gls{enet}, \gls{rf}, and \gls{etr} models. When not explicitly set, the default hyperparameters provided by the libraries listed in Section~\ref{sec:experimental_setup} are used.}
\label{tab:combined_hyperparameters}
\end{table*}
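
For illustration, two of the configurations in Table~\ref{tab:combined_hyperparameters} could be instantiated via scikit-learn as in the following minimal sketch; this is illustrative rather than the exact experiment code, and variable names are placeholders.
\begin{verbatim}
# Hedged sketch: PLS and SVR with the hyperparameters listed in the table.
from sklearn.cross_decomposition import PLSRegression
from sklearn.svm import SVR

pls = PLSRegression(n_components=34, scale=True, max_iter=500)
svr = SVR(kernel="poly", C=100, epsilon=0.1, gamma="scale",
          degree=2, coef0=1.0)
\end{verbatim}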


\begin{table*}[h]
\centering
\begin{tabular}{@{}llp{0.5\textwidth}@{}}
\toprule
\textbf{Model} & \textbf{Hyperparameter} & \textbf{Value} \\
\midrule
\multirow{14}{*}{\gls{gbr}}
& \texttt{n\_estimators} & 100 \\
& \texttt{max\_depth} & 3 \\
& \texttt{min\_samples\_split} & 2 \\
& \texttt{min\_samples\_leaf} & 1 \\
& \texttt{max\_features} & None \\
& \texttt{loss} & squared\_error \\
& \texttt{learning\_rate} & 0.1 \\
& \texttt{subsample} & 1.0 \\
& \texttt{criterion} & friedman\_mse \\
& \texttt{random\_state} & 42 \\
& \texttt{verbose} & 0 \\
& \texttt{validation\_fraction} & 0.1 \\
& \texttt{n\_iter\_no\_change} & None \\
& \texttt{tol} & $10^{-4}$ \\
& \texttt{ccp\_alpha} & 0.0 \\
\midrule
\gls{ngboost} & - & - \\
\midrule
\multirow{14}{*}{\gls{xgboost}}
& \texttt{max\_depth} & 4 \\
& \texttt{min\_child\_weight} & 5 \\
& \texttt{gamma} & 0.1 \\
& \texttt{subsample} & 0.7 \\
& \texttt{colsample\_bytree} & 0.5 \\
& \texttt{colsample\_bylevel} & 0.5 \\
& \texttt{colsample\_bynode} & 0.5 \\
& \texttt{lambda} & 1 \\
& \texttt{alpha} & 0.5 \\
& \texttt{learning\_rate} & 0.05 \\
& \texttt{n\_estimators} & 100 \\
& \texttt{objective} & reg:squarederror \\
& \texttt{eval\_metric} & rmse \\
\bottomrule
\end{tabular}
\caption{Explicitly set hyperparameters for the \gls{gbr} and \gls{xgboost} models. When not explicitly set, the default hyperparameters provided by the libraries listed in Section~\ref{sec:experimental_setup} are used. The \gls{ngboost} model does not have any explicitly set hyperparameters.}
\label{tab:combined_hyperparameters_boosting}
\end{table*}
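
Similarly, the \gls{xgboost} configuration above maps onto the scikit-learn style \texttt{XGBRegressor} interface as sketched below; note that the table's \texttt{lambda} and \texttt{alpha} correspond to \texttt{reg\_lambda} and \texttt{reg\_alpha} in that interface. The snippet is illustrative, not the exact experiment code.
\begin{verbatim}
# Hedged sketch of the XGBoost configuration from the table.
from xgboost import XGBRegressor

xgb = XGBRegressor(
    max_depth=4, min_child_weight=5, gamma=0.1,
    subsample=0.7, colsample_bytree=0.5, colsample_bylevel=0.5,
    colsample_bynode=0.5, reg_lambda=1, reg_alpha=0.5,
    learning_rate=0.05, n_estimators=100,
    objective="reg:squarederror", eval_metric="rmse",
)
\end{verbatim}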

\begin{table*}[h!]
\centering
\begin{tabular}{lll}
\toprule
\textbf{Layer} & \textbf{Output Shape} & \textbf{Hyperparameter} \\ \midrule
Input & (\textit{input\_dim},) & - \\
Dense & (1024,) & activation = ReLU \\
Dropout & (1024,) & rate = 0.3 \\
Dense & (512,) & activation = ReLU \\
Dropout & (512,) & rate = 0.3 \\
Dense & (256,) & activation = ReLU \\
Dense & (128,) & activation = ReLU \\
Output & (\textit{output\_dim},) & - \\
\midrule
\multicolumn{3}{l}{\textbf{Optimizer:} Adam} \\
\multicolumn{3}{l}{\textbf{Learning Rate:} 0.001} \\
\bottomrule
\end{tabular}
\caption{Summary of the Neural Network Architecture}
\label{tab:nn_architecture}
\end{table*}
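
A minimal sketch of the architecture in Table~\ref{tab:nn_architecture}, assuming the Keras Sequential API and a mean squared error loss (the loss function is not specified in the table), is shown below; \texttt{input\_dim} and \texttt{output\_dim} are placeholders for the spectral and target dimensions.
\begin{verbatim}
# Hedged sketch of the fully connected network summarized above.
import keras
from keras import layers

def build_mlp(input_dim, output_dim):
    model = keras.Sequential([
        keras.Input(shape=(input_dim,)),
        layers.Dense(1024, activation="relu"),
        layers.Dropout(0.3),
        layers.Dense(512, activation="relu"),
        layers.Dropout(0.3),
        layers.Dense(256, activation="relu"),
        layers.Dense(128, activation="relu"),
        layers.Dense(output_dim),  # linear output layer
    ])
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001),
                  loss="mean_squared_error")  # assumed loss
    return model
\end{verbatim}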

\begin{table*}[h!]
\centering
\begin{tabular}{lll}
\toprule
\textbf{Layer} & \textbf{Output Shape} & \textbf{Hyperparameter} \\ \midrule
Input & (\textit{input\_dim},) & - \\
Reshape & (48, 128, 1) & - \\
Conv2D & (48, 128, 32) & filters = 32, kernel\_size = (3, 3), activation = ReLU, padding = 'same' \\
BatchNormalization & (48, 128, 32) & - \\
MaxPooling2D & (24, 64, 32) & pool\_size = (2, 2) \\ \midrule

Conv2D & (24, 64, 32) & filters = 32, kernel\_size = (3, 3), activation = ReLU, padding = 'same' \\
BatchNormalization & (24, 64, 32) & - \\
MaxPooling2D & (12, 32, 32) & pool\_size = (2, 2) \\ \midrule

Conv2D & (12, 32, 64) & filters = 64, kernel\_size = (3, 3), activation = ReLU, padding = 'same' \\
BatchNormalization & (12, 32, 64) & - \\
MaxPooling2D & (6, 16, 64) & pool\_size = (2, 2) \\ \midrule

Conv2D & (6, 16, 128) & filters = 128, kernel\_size = (3, 3), activation = ReLU, padding = 'same' \\
BatchNormalization & (6, 16, 128) & - \\
MaxPooling2D & (3, 8, 128) & pool\_size = (2, 2) \\ \midrule

Flatten & (3072,) & - \\
Dense & (256,) & activation = ReLU \\
Dropout & (256,) & rate = 0.5 \\
Dense & (\textit{output\_dim},) & - \\
Dense & (\textit{output\_dim},) & kernel\_regularizer = $L_2(0.01)$ \\ \midrule
\multicolumn{3}{l}{\textbf{Optimizer:} Adam} \\
\multicolumn{3}{l}{\textbf{Learning Rate:} 0.001} \\
\bottomrule
\end{tabular}
\caption{Summary of the Convolutional Neural Network Architecture}
\label{tab:cnn_architecture}
\end{table*}
71 changes: 71 additions & 0 deletions report_thesis/src/sections/experiment_design/data_preparation.tex
@@ -0,0 +1,71 @@
\subsection{Data Preparation}\label{sec:data-preparation}
The first step in our methodology is to prepare the datasets for model training and evaluation.
As mentioned in Section~\ref{sec:data-overview}, the data used in this study was obtained from NASA's \gls{pds} and consists of \gls{ccs} data and major oxide compositions for various samples.

The initial five shots at each location are excluded because they are usually contaminated by dust covering the sample, which is cleared away by the shock waves produced by the laser \cite{cleggRecalibrationMarsScience2017}.
The remaining 45 shots from each location are then averaged, yielding a single spectrum $s$ per location $l$ in the Averaged Intensity Tensor (\ref{matrix:averaged_intensity}), resulting in a total of five spectra per sample.
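
A minimal sketch of this shot-exclusion and averaging step is shown below; the array layout and function name are assumptions made for illustration only.
\begin{verbatim}
import numpy as np

def average_location(shots: np.ndarray, n_skip: int = 5) -> np.ndarray:
    """Average the shots for one location.

    `shots` is a (n_shots, n_wavelengths) array of CCS spectra; the first
    `n_skip` dust-contaminated shots are dropped before averaging.
    """
    return shots[n_skip:].mean(axis=0)
\end{verbatim}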

At this stage, the data still contains noise at the edges of the spectrometers.
These edges correspond to the boundaries of the three spectrometers, which collectively cover the \gls{uv}, \gls{vio}, and \gls{vnir} light spectra.
The noisy edge ranges are as follows: 240.811-246.635 nm, 338.457-340.797 nm, 382.138-387.859 nm, 473.184-492.427 nm, and 849-905.574 nm.
In addition to being noisy, these regions do not contain any useful information related to the major oxides.
Consequently, these regions are masked by zeroing out the values rather than removing them, which preserves the overall wavelength structure of the spectra~\cite{cleggRecalibrationMarsScience2017}.

Additionally, as a result of the aforementioned preprocessing applied to the raw \gls{libs} data, negative values are present in the \gls{ccs} data.
These negative values are not physically meaningful, since light intensity cannot be negative \cite{p9_paper}.
Similar to the noisy edges, these negative values are also masked by zeroing out the values.

We transpose the data so that each row represents a location and each column represents a wavelength feature.
Each location is thus represented as a vector of average intensity values, one per wavelength.
These vectors are then stacked to form the full Averaged Intensity Tensor.
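
The masking and reshaping steps described above could be implemented as in the following sketch, assuming the averaged spectra are held in a pandas DataFrame whose rows are locations and whose columns are wavelengths; the exact pipeline code may differ.
\begin{verbatim}
import pandas as pd

# Noisy spectrometer edge ranges (nm) listed in the text.
NOISY_RANGES_NM = [(240.811, 246.635), (338.457, 340.797),
                   (382.138, 387.859), (473.184, 492.427),
                   (849.0, 905.574)]

def mask_spectra(spectra: pd.DataFrame) -> pd.DataFrame:
    """Zero out noisy spectrometer edges and unphysical negative values."""
    wavelengths = spectra.columns.astype(float)
    for lo, hi in NOISY_RANGES_NM:
        spectra.loc[:, (wavelengths >= lo) & (wavelengths <= hi)] = 0.0
    return spectra.clip(lower=0.0)  # negative intensities become zero
\end{verbatim}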

For each sample, we have a corresponding set of major oxide compositions in weight percentage (wt\%).
These compositions are used as the target labels for the machine learning models.
An excerpt of this data is shown in Table \ref{tab:composition_data_example}.
While the \textit{Target}, \textit{Spectrum Name}, and \textit{Sample Name} are part of the dataset, our analysis focuses primarily on the \textit{Sample Name}.
The concentrations of the eight oxides \ce{SiO2}, \ce{TiO2}, \ce{Al2O3}, \ce{FeO_T}, \ce{MgO}, \ce{CaO}, \ce{Na2O}, and \ce{K2O} represent the expected values for these oxides in the sample, serving as our ground truth. The \textit{MOC total} is not utilized in this study.

\begin{table*}[h]
\centering
\begin{tabular}{lllllllllllll}
\toprule
Target & Spectrum Name & Sample Name & \ce{SiO2} & \ce{TiO2} & \ce{Al2O3} & \ce{FeO_T} & \ce{MnO} & \ce{MgO} & \ce{CaO} & \ce{Na2O} & \ce{K2O} & \ce{MOC total} \\
\midrule
AGV2 & AGV2 & AGV2 & 59.3 & 1.05 & 16.91 & 6.02 & 0.099 & 1.79 & 5.2 & 4.19 & 2.88 & 97.44 \\
BCR-2 & BCR2 & BCR2 & 54.1 & 2.26 & 13.5 & 12.42 & 0.2 & 3.59 & 7.12 & 3.16 & 1.79 & 98.14 \\
$\vdots$ & $\vdots$ & $\vdots$ & $\vdots$ & $\vdots$ & $\vdots$ & $\vdots$ & $\vdots$ & $\vdots$ & $\vdots$ & $\vdots$ & $\vdots$ & $\vdots$ \\
TB & --- & --- & 60.23 & 0.93 & 20.64 & 11.6387 & 0.052 & 1.93 & 0.000031 & 1.32 & 3.87 & 100.610731 \\
TB2 & --- & --- & 60.4 & 0.93 & 20.5 & 11.6536 & 0.047 & 1.86 & 0.2 & 1.29 & 3.86 & 100.7406 \\
\bottomrule
\end{tabular}
\caption{Excerpt from the composition dataset (from \citet{p9_paper})}
\label{tab:composition_data_example}
\end{table*}

The major oxide weight percentages are appended to the matrix of spectral data, forming the final dataset.
This dataset is shown in Table~\ref{tab:final_dataset_example}.
The \textit{Target} column corresponds to the sample name, while the \textit{ID} column contains the unique identifier for each location.
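
A sketch of this label-joining step is given below, assuming one DataFrame of per-location spectra with a \texttt{Target} column and one DataFrame of per-sample compositions; the column and function names are illustrative.
\begin{verbatim}
import pandas as pd

def attach_labels(spectra: pd.DataFrame,
                  compositions: pd.DataFrame) -> pd.DataFrame:
    """Join per-location spectra with per-sample oxide wt% ground truth."""
    return spectra.merge(compositions, on="Target", how="inner")
\end{verbatim}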

\begin{table*}[h]
\centering
\footnotesize
\begin{tabular}{llllllllllllllllllllll}
\toprule
240.81 & $\cdots$ & 425.82 & 425.87 & $\cdots$ & 905.57 & \ce{SiO2} & \ce{TiO2} & \ce{Al2O3} & \ce{FeO_T} & \ce{MgO} & \ce{CaO} & \ce{Na2O} & \ce{K2O} & Target & ID \\
\midrule
0 & $\cdots$ & 1.53e+10 & 1.62e+10 & $\cdots$ & 0 & 56.13 & 0.69 & 17.69 & 5.86 & 3.85 & 7.07 & 3.32 & 1.44 & jsc1421 & jsc1421\_2013\_09\_12\_211002\_ccs \\
0 & $\cdots$ & 1.28e+10 & 1.30e+10 & $\cdots$ & 0 & 56.13 & 0.69 & 17.69 & 5.86 & 3.85 & 7.07 & 3.32 & 1.44 & jsc1421 & jsc1421\_2013\_09\_12\_211143\_ccs \\
0 & $\cdots$ & 1.87e+10 & 1.83e+10 & $\cdots$ & 0 & 56.13 & 0.69 & 17.69 & 5.86 & 3.85 & 7.07 & 3.32 & 1.44 & jsc1421 & jsc1421\_2013\_09\_12\_210628\_ccs \\
0 & $\cdots$ & 1.77e+10 & 1.78e+10 & $\cdots$ & 0 & 56.13 & 0.69 & 17.69 & 5.86 & 3.85 & 7.07 & 3.32 & 1.44 & jsc1421 & jsc1421\_2013\_09\_12\_210415\_ccs \\
0 & $\cdots$ & 1.75e+10 & 1.79e+10 & $\cdots$ & 0 & 56.13 & 0.69 & 17.69 & 5.86 & 3.85 & 7.07 & 3.32 & 1.44 & jsc1421 & jsc1421\_2013\_09\_12\_210811\_ccs \\
0 & $\cdots$ & 5.52e+10 & 3.74e+10 & $\cdots$ & 0 & 57.60 & 0.78 & 26.60 & 2.73 & 0.70 & 0.01 & 0.38 & 7.10 & pg7 & pg7\_2013\_11\_07\_161903\_ccs \\
0 & $\cdots$ & 5.09e+10 & 3.41e+10 & $\cdots$ & 0 & 57.60 & 0.78 & 26.60 & 2.73 & 0.70 & 0.01 & 0.38 & 7.10 & pg7 & pg7\_2013\_11\_07\_162038\_ccs \\
0 & $\cdots$ & 5.99e+10 & 3.97e+10 & $\cdots$ & 0 & 57.60 & 0.78 & 26.60 & 2.73 & 0.70 & 0.01 & 0.38 & 7.10 & pg7 & pg7\_2013\_11\_07\_161422\_ccs \\
0 & $\cdots$ & 5.22e+10 & 3.47e+10 & $\cdots$ & 0 & 57.60 & 0.78 & 26.60 & 2.73 & 0.70 & 0.01 & 0.38 & 7.10 & pg7 & pg7\_2013\_11\_07\_161735\_ccs \\
0 & $\cdots$ & 5.29e+10 & 3.62e+10 & $\cdots$ & 0 & 57.60 & 0.78 & 26.60 & 2.73 & 0.70 & 0.01 & 0.38 & 7.10 & pg7 & pg7\_2013\_11\_07\_161552\_ccs \\
$\vdots$ & $\cdots$ & $\vdots$ & $\vdots$ & $\cdots$ & $\vdots$ & $\vdots$ & $\vdots$ & $\vdots$ & $\vdots$ & $\vdots$ & $\vdots$ & $\vdots$ & $\vdots$ & $\vdots$ & $\vdots$ \\
\bottomrule
\end{tabular}
\caption{Excerpt from the final dataset (values have been rounded to two decimal places for brevity).}
\label{tab:final_dataset_example}
\end{table*}
7 changes: 7 additions & 0 deletions report_thesis/src/sections/experiment_design/experimental_setup.tex
@@ -0,0 +1,7 @@
\subsection{Experimental Setup}\label{sec:experimental_setup}
Experiments were conducted on a machine equipped with an Intel Xeon Gold 6242 CPU, featuring 16 cores and 32 threads.
The CPU has a base clock speed of 2.80 GHz and a maximum turbo frequency of 3.90 GHz.
The system has 64 GB of RAM and runs on Ubuntu 22.04.2 LTS.
Models were implemented using Python 3.10.11.
The primary libraries used were Scikit-learn 1.4.2, XGBoost 2.0.3, Torch 2.2.2, NumPy 1.26.4, Pandas 2.2.1, Keras 3.2.1, and Optuna 3.6.1.
Additionally, all experiments were run using the hyperparameter optimization tool described in Section~\ref{sec:optimization_framework}.
11 changes: 11 additions & 0 deletions report_thesis/src/sections/experiment_design/index.tex
@@ -0,0 +1,11 @@
\section{Experimental Design}\label{sec:methodology}
This section outlines the experimental design used to identify the top-$n$ models to be used in our stacking ensemble.
We first describe the prerequisite data preparation for all our experiments, followed by a description of the hardware and software used.
Next, we outline the design of our initial experiment, aimed at providing a preliminary assessment of the models selected in Section~\ref{subsec:model-selection}.
Following this, we present and discuss the results of this experiment.
We then describe the design of our main experiment, where we use our hyperparameter tuning framework to identify the top-$n$ models.
Finally, we present and discuss the results of this experiment.

\input{sections/experiment_design/data_preparation}
\input{sections/experiment_design/experimental_setup}
\input{sections/experiment_design/initial_experiment}
14 changes: 14 additions & 0 deletions report_thesis/src/sections/experiment_design/initial_experiment.tex
@@ -0,0 +1,14 @@
\subsection{Design for Initial Experiment}\label{sec:initial-experiment}
As described in Section~\ref{sec:proposed_approach}, we conducted a series of initial experiments to evaluate the performance of various machine learning models on the prediction of major oxide compositions from our \gls{libs} dataset.
These experiments aimed to provide a preliminary assessment of the models' performance, allowing us to identify the most promising models for further evaluation and inclusion in our stacking ensemble.
All models were trained on the same preprocessed data using the Norm 3 preprocessing method described in Section~\ref{sec:norm3}.
This ensured that the models' performance could be evaluated under consistent and comparable conditions.

Furthermore, all experiments used our data partitioning and were evaluated using our testing and validation strategy, as described in Section~\ref{subsec:validation_testing_procedures}.
To keep the comparison between models as fair as possible, all models were trained with default hyperparameters where available, and hyperparameters without defaults were set to values reported in the literature.
However, the neural network models required some additional architectural tuning to ensure a fair comparison.
This included using batch normalization for the \gls{cnn} model, as early assessments showed that this was necessary to produce reasonable results.
Finally, we evaluated each model once per oxide using the selected hyperparameter configuration.
As stated, the goal of this experiment was merely to obtain an initial indication of the models' performance.
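
Conceptually, the evaluation loop for this initial experiment can be sketched as follows, assuming scikit-learn style estimators and a single train/test split for brevity; the actual procedure follows the partitioning, testing, and validation strategy described in Section~\ref{subsec:validation_testing_procedures}, and the model and variable names are illustrative.
\begin{verbatim}
import numpy as np
from sklearn.base import clone
from sklearn.metrics import mean_squared_error

def evaluate_models(models, X_train, y_train, X_test, y_test, oxides):
    """Return RMSE per (model, oxide) for one fixed configuration each."""
    results = {}
    for name, model in models.items():
        for oxide in oxides:
            estimator = clone(model)  # fresh, unfitted copy per oxide
            estimator.fit(X_train, y_train[oxide])
            preds = estimator.predict(X_test)
            results[(name, oxide)] = np.sqrt(
                mean_squared_error(y_test[oxide], preds))
    return results
\end{verbatim}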

The hyperparameters used for the models in the initial experiment can be found in Appendix~\ref{subsec:initial_experiment_hyperparameters}.