Merge branch 'main' into acknowledgements

chhoumann · Jun 12, 2024 · 61d178f · 61d178f
2 parents a8e9530 + 56d5244
commit 61d178f
Show file tree

Hide file tree

Showing 40 changed files with 734 additions and 225 deletions.
diff --git a/baseline/eda/compare_old_vs_new_test_sets.ipynb b/baseline/eda/compare_old_vs_new_test_sets.ipynb
diff --git a/baseline/experiment_analysis/experiment_analysis.ipynb b/baseline/experiment_analysis/experiment_analysis.ipynb
@@ -2378,12 +2378,12 @@
     "    latex_table = unique_model_types_df.to_latex(index=False, escape=False)\n",
     "\n",
     "    with open(path, \"w\") as file:\n",
-    "        file.write(\"\\\\begin{table*}[htbp]\\n\")\n",
+    "        file.write(\"\\\\begin{table}[!htb]\\n\")\n",
     "        file.write(\"\\\\centering\\n\")\n",
+    "        file.write(\"\\\\caption{Overview of model types for \\\\ce{\" + oxide_name + \"} oxide.}\\n\")\n",
     "        file.write(latex_table)\n",
-    "        file.write(\"\\\\caption{Overview of model types for \\\\ce{\" + oxide_name + \"} oxide}\\n\")\n",
     "        file.write(\"\\\\label{tab:\" + oxide + \"_overview}\\n\")\n",
-    "        file.write(\"\\\\end{table*}\\n\")\n"
+    "        file.write(\"\\\\end{table}\\n\")\n"
    ]
   },
   {
@@ -4963,9 +4963,7 @@
     "sns.boxplot(x='params.model_type', y='metrics.std_dev_cv', data=filtered_runs_target)\n",
     "plt.title(f'{analysis_target}: Standard Deviation of Errors (CV) by Model Type')\n",
     "plt.ylabel('Standard Deviation of Errors (CV)')\n",
-    "plt.show()\n",
-    "\n",
-    "\n"
+    "plt.show()\n"
    ]
   },
   {
@@ -5233,9 +5231,7 @@
     "    for p in top_configurations.patches:\n",
     "        width = p.get_width()\n",
     "        plt.text(width + 0.01, p.get_y()+0.2 + p.get_height() / 2, f'{width:.2f}', ha='left', va='center')\n",
-    "    plt.show()\n",
-    "\n",
-    "\n"
+    "    plt.show()\n"
    ]
   },
   {

diff --git a/report_thesis/src/_preamble.tex b/report_thesis/src/_preamble.tex
@@ -60,5 +60,5 @@
   \city{Aalborg}
   \country{Denmark}
 }
-\received{date}
+\received{13/06/2024}
 \begin{document}
diff --git a/report_thesis/src/glossary.tex b/report_thesis/src/glossary.tex
@@ -1,3 +1,4 @@
+\newacronym{nasa}{NASA}{National Aeronautics and Space Administration}
 \newacronym{msl}{MSL}{Mars Science Laboratory}
 \newacronym{libs}{LIBS}{Laser-Induced Breakdown Spectroscopy}
 \newacronym{chemcam}{ChemCam}{Chemistry and Camera}
@@ -57,4 +58,6 @@
 \newacronym{rss}{RSS}{Residual Sum of Squares}
 \newacronym{tpe}{TPE}{Tree-structured Parzen Estimator}
 \newacronym{usgs}{USGS}{United States Geological Survey}
-\newacronym{pyhat}{pyhat}{Python Hyperspectral Analysis Tool}
+\newacronym{pyhat}{PyHAT}{Python Hyperspectral Analysis Tool}
+\newacronym{jade}{JADE}{Joint Approximation Diagonalization of Eigen-matrices}
+\newacronym{gui}{GUI}{Graphical User Interface}
diff --git a/report_thesis/src/index.tex b/report_thesis/src/index.tex
@@ -2,8 +2,16 @@
 
 \newpage
 
+\input{sections/summary.tex}
+
+\newpage
+
 \begin{abstract}
-    Abstract
+This thesis advances the analysis of \gls{libs} data for predicting major oxide compositions in geological samples.
+By integrating machine learning techniques and ensemble regression models, the study addresses challenges like high dimensionality, multicollinearity, and limited data availability.
+Key innovations include the use of stacked generalization for improved model performance and an automated hyperparameter optimization framework.
+The research contributes a comprehensive catalog of models and preprocessing techniques, and integrates findings into the \gls{pyhat} by the \gls{usgs}, enhancing its scientific capabilities.
+This work lays a robust foundation for future advancements in geochemical analysis and planetary exploration using \gls{libs} data.
 \end{abstract}
 
 \maketitle
@@ -19,5 +27,6 @@ \subsubsection*{Acknowledgements:} We would like to thank our supervisors Daniel
 \input{sections/proposed_approach/proposed_approach.tex}
 \input{sections/methodology.tex}
 \input{sections/experiments/index.tex}
-\input{sections/future_work.tex}
+\input{sections/pyhat_contribution.tex}
 \input{sections/conclusion.tex}
+\input{sections/future_work.tex}
diff --git a/report_thesis/src/sections/appendix/index.tex b/report_thesis/src/sections/appendix/index.tex
@@ -1,15 +1,20 @@
-\section*{Appendix}
-\addcontentsline{toc}{section}{Appendix}
+\onecolumn
+\section{Appendix}
+\renewcommand{\thesection}{\Alph{section}}
+\renewcommand{\thefigure}{\Alph{section}.\arabic{figure}}
+\renewcommand{\thetable}{\Alph{section}.\arabic{table}}
+\setcounter{figure}{0}
+\setcounter{table}{0}
 
 \subsection{Web-Based Platform for Data Partitioning Evaluation}\label{subsec:web_platform}
-
-\begin{figure*}
+\begin{figure}[!htb]
     \centering
-    \includegraphics[width=0.7\textwidth]{images/web_platform.png}
+    \includegraphics[width=0.585\textwidth]{images/web_platform.png}
     \caption{Web-based platform used to determine the optimal value of $p$ for the data partitioning algorithm.}
     \label{fig:web_platform}
-\end{figure*}
+\end{figure}
 
+\newpage
 \FloatBarrier
 
 \subsection{Cross-Validation Fold Plots for Major Oxides}\label{subsec:cv_plots}
@@ -18,40 +23,41 @@ \subsection{Cross-Validation Fold Plots for Major Oxides}\label{subsec:cv_plots}
 \foreach \oxide in {SiO2, TiO2, Al2O3, FeOT, MgO, CaO, Na2O, K2O} {
     \subsubsection{\oxide}
 
-    \begin{figure*}
+    \begin{figure}[!htb]
         \centering
         \includegraphics[width=\textwidth]{images/\oxide/histogram_grid_plot.png}
         \caption{Histogram and \gls{kde} of \ce{\oxide} distribution in each fold. The y-axis represents the count of samples per bin, and the x-axis represents \ce{\oxide} concentration. The notation in the legend indicates the number of instances in the training/validation sets.}
         \label{fig:histogram_grid_plot_\oxide}
-    \end{figure*}
+    \end{figure}
 
-    \begin{figure*}
+    \begin{figure}[!htb]
         \centering
         \includegraphics[width=\textwidth]{images/\oxide/histogram_kde_plot.png}
         \caption{Combined Histogram and \gls{kde} of \ce{\oxide} distribution in each fold. The y-axis represents the count of samples per bin, and the x-axis represents \ce{\oxide} concentration. The notation in the legend indicates the number of instances in the training/validation sets.}
         \label{fig:histogram_kde_plot_\oxide}
-    \end{figure*}
+    \end{figure}
 
-    \begin{figure*}
+    \begin{figure}[!htb]
         \centering
         \includegraphics[width=\textwidth]{images/\oxide/original_and_post_fold.png}
         \caption{Distribution of \ce{\oxide} concentrations before and after fold assignment. The left plot shows the original distribution of \ce{\oxide}, while the right plot shows the distribution with folds assigned, color-coded to indicate the different folds.}
         \label{fig:original_and_post_fold_plot_\oxide}
-    \end{figure*}
+    \end{figure}
 
-    \begin{figure*}
+    \begin{figure}[!htb]
         \centering
         \includegraphics[width=\textwidth]{images/\oxide/distribution_plot.png}
         \caption{Distribution of \ce{\oxide} concentrations across cross-validation folds, training set, test set, and the entire dataset. The mean and standard deviation statistics for each partition are indicated in the figure.}
         \label{fig:distribution_plot_\oxide}
-    \end{figure*}
+    \end{figure}
 }
 
 \FloatBarrier
 
 \subsection{Initial Experiment: Model Hyperparameters}\label{subsec:initial_experiment_hyperparameters}
-\begin{table*}
+\begin{table}[!htb]
 \centering
+\caption{Explicitly set hyperparameters for the \gls{pls}, \gls{svr}, ridge, \gls{lasso}, \gls{enet}, \gls{rf}, and \gls{etr} models. When not explicitly set, the default hyperparameters provided by the libraries listed in Section~\ref{sec:experimental_setup} are used.}
 \begin{tabular}{@{}llp{0.5\textwidth}@{}}
 \toprule
 \textbf{Model} & \textbf{Hyperparameter} & \textbf{Value} \\
@@ -101,14 +107,14 @@ \subsection{Initial Experiment: Model Hyperparameters}\label{subsec:initial_expe
 & \texttt{random\_state} & 42 \\
 \midrule
 \end{tabular}
-\caption{Explictly set hyperparameters for the \gls{pls}, \gls{svr}, ridge, \gls{lasso}, \gls{enet}, \gls{rf}, and \gls{etr} models. When not explicitly set, the default hyperparameters provided by the libraries listed in Section~\ref{sec:experimental_setup} are used.}
 \label{tab:combined_hyperparameters}
-\end{table*}
+\end{table}
 
 \FloatBarrier
 
-\begin{table*}
+\begin{table}[!htb]
 \centering
+\caption{Explicitly set hyperparameters for the \gls{gbr} and \gls{xgboost} models. When not explicitly set, the default hyperparameters provided by the libraries listed in Section~\ref{sec:experimental_setup} are used. The \gls{ngboost} model does not have any explicitly set hyperparameters.}
 \begin{tabular}{@{}llp{0.5\textwidth}@{}}
 \toprule
 \textbf{Model} & \textbf{Hyperparameter} & \textbf{Value} \\
@@ -148,13 +154,12 @@ \subsection{Initial Experiment: Model Hyperparameters}\label{subsec:initial_expe
 & \texttt{eval\_metric} & rmse \\
 \bottomrule
 \end{tabular}
-\caption{Explictly set hyperparameters for the \gls{gbr} and \gls{xgboost} models. When not explicitly set, the default hyperparameters provided by the libraries listed in Section~\ref{sec:experimental_setup} are used. The \gls{ngboost} model does not have any explicitly set hyperparameters.}
 \label{tab:combined_hyperparameters}
-\end{table*}
+\end{table}
 
 \FloatBarrier
 
-\begin{table*}
+\begin{table}[!htb]
   \centering
   \begin{tabular}{lll}
     \toprule
@@ -172,12 +177,15 @@ \subsection{Initial Experiment: Model Hyperparameters}\label{subsec:initial_expe
     \multicolumn{3}{l}{\textbf{Learning Rate:} 0.001} \\
     \bottomrule
   \end{tabular}
-  \caption{Summary of the Neural Network Architecture}
+  \caption{Summary of the \gls{ann} architecture.}
   \label{tab:nn_architecture}
-\end{table*}
+\end{table}
+
+\FloatBarrier
 
-\begin{table*}
+\begin{table}[!htb]
   \centering
+  \caption{Summary of the \gls{cnn} architecture.}
   \begin{tabular}{lll}
     \toprule
     \textbf{Layer} & \textbf{Output Shape} & \textbf{Hyperparameter} \\ \midrule
@@ -208,12 +216,13 @@ \subsection{Initial Experiment: Model Hyperparameters}\label{subsec:initial_expe
     \multicolumn{3}{l}{\textbf{Learning Rate:} 0.001} \\
     \bottomrule
   \end{tabular}
-  \caption{Summary of the Convolutional Neural Network Architecture}
   \label{tab:cnn_architecture}
-\end{table*}
+\end{table}
 
 \FloatBarrier
 
+\clearpage
+
 \subsection{Overview of best performing model configurations}\label{subsec:best_model_configurations}
 \input{sections/appendix/tables/SiO2_overview.tex}
 \input{sections/appendix/tables/TiO2_overview.tex}
@@ -223,108 +232,3 @@ \subsection{Overview of best performing model configurations}\label{subsec:best_
 \input{sections/appendix/tables/CaO_overview.tex}
 \input{sections/appendix/tables/Na2O_overview.tex}
 \input{sections/appendix/tables/K2O_overview.tex}
-
-\FloatBarrier
-
-\subsection{Stacking Ensemble 1:1 Plots}\label{subsec:1_1_plots}
-\begin{figure*}
-    \centering
-    \resizebox{0.75\textwidth}{!}{
-        \begin{tabular}{cc}
-            \begin{subfigure}{0.5\textwidth}
-                \includegraphics[width=\textwidth]{images/one_to_one/elasticnet/SiO2.png}
-            \end{subfigure} & \hspace{3cm}
-            \begin{subfigure}{0.5\textwidth}
-                \includegraphics[width=\textwidth]{images/one_to_one/elasticnet/TiO2.png}
-            \end{subfigure} \\
-            \begin{subfigure}{0.5\textwidth}
-                \includegraphics[width=\textwidth]{images/one_to_one/elasticnet/Al2O3.png}
-            \end{subfigure} & \hspace{3cm}
-            \begin{subfigure}{0.5\textwidth}
-                \includegraphics[width=\textwidth]{images/one_to_one/elasticnet/FeOT.png}
-            \end{subfigure} \\
-            \begin{subfigure}{0.5\textwidth}
-                \includegraphics[width=\textwidth]{images/one_to_one/elasticnet/MgO.png}
-            \end{subfigure} & \hspace{3cm}
-            \begin{subfigure}{0.5\textwidth}
-                \includegraphics[width=\textwidth]{images/one_to_one/elasticnet/CaO.png}
-            \end{subfigure} \\
-            \begin{subfigure}{0.5\textwidth}
-                \includegraphics[width=\textwidth]{images/one_to_one/elasticnet/Na2O.png}
-            \end{subfigure} & \hspace{3cm}
-            \begin{subfigure}{0.5\textwidth}
-                \includegraphics[width=\textwidth]{images/one_to_one/elasticnet/K2O.png}
-            \end{subfigure}
-        \end{tabular}
-    }
-    \caption{One-to-one plots for the stacking ensemble model with the \gls{enet} as the meta-learner with $\alpha = 1$}
-    \label{fig:elasticnet_one_to_one}
-\end{figure*}
-
-\begin{figure*}
-    \centering
-    \resizebox{0.75\textwidth}{!}{
-        \begin{tabular}{cc}
-            \begin{subfigure}{0.5\textwidth}
-                \includegraphics[width=\textwidth]{images/one_to_one/enetalpha01/SiO2.png}
-            \end{subfigure} & \hspace{3cm}
-            \begin{subfigure}{0.5\textwidth}
-                \includegraphics[width=\textwidth]{images/one_to_one/enetalpha01/TiO2.png}
-            \end{subfigure} \\
-            \begin{subfigure}{0.5\textwidth}
-                \includegraphics[width=\textwidth]{images/one_to_one/enetalpha01/Al2O3.png}
-            \end{subfigure} & \hspace{3cm}
-            \begin{subfigure}{0.5\textwidth}
-                \includegraphics[width=\textwidth]{images/one_to_one/enetalpha01/FeOT.png}
-            \end{subfigure} \\
-            \begin{subfigure}{0.5\textwidth}
-                \includegraphics[width=\textwidth]{images/one_to_one/enetalpha01/MgO.png}
-            \end{subfigure} & \hspace{3cm}
-            \begin{subfigure}{0.5\textwidth}
-                \includegraphics[width=\textwidth]{images/one_to_one/enetalpha01/CaO.png}
-            \end{subfigure} \\
-            \begin{subfigure}{0.5\textwidth}
-                \includegraphics[width=\textwidth]{images/one_to_one/enetalpha01/Na2O.png}
-            \end{subfigure} & \hspace{3cm}
-            \begin{subfigure}{0.5\textwidth}
-                \includegraphics[width=\textwidth]{images/one_to_one/enetalpha01/K2O.png}
-            \end{subfigure}
-        \end{tabular}
-    }
-    \caption{One-to-one plots for the stacking ensemble model with the \gls{enet} as the meta-learner with $\alpha = 0.1$.}
-    \label{fig:enetalpha01_one_to_one}
-\end{figure*}
-
-\begin{figure*}
-    \centering
-    \resizebox{0.75\textwidth}{!}{
-        \begin{tabular}{cc}
-            \begin{subfigure}{0.5\textwidth}
-                \includegraphics[width=\textwidth]{images/one_to_one/svr/SiO2.png}
-            \end{subfigure} & \hspace{3cm}
-            \begin{subfigure}{0.5\textwidth}
-                \includegraphics[width=\textwidth]{images/one_to_one/svr/TiO2.png}
-            \end{subfigure} \\
-            \begin{subfigure}{0.5\textwidth}
-                \includegraphics[width=\textwidth]{images/one_to_one/svr/Al2O3.png}
-            \end{subfigure} & \hspace{3cm}
-            \begin{subfigure}{0.5\textwidth}
-                \includegraphics[width=\textwidth]{images/one_to_one/svr/FeOT.png}
-            \end{subfigure} \\
-            \begin{subfigure}{0.5\textwidth}
-                \includegraphics[width=\textwidth]{images/one_to_one/svr/MgO.png}
-            \end{subfigure} & \hspace{3cm}
-            \begin{subfigure}{0.5\textwidth}
-                \includegraphics[width=\textwidth]{images/one_to_one/svr/CaO.png}
-            \end{subfigure} \\
-            \begin{subfigure}{0.5\textwidth}
-                \includegraphics[width=\textwidth]{images/one_to_one/svr/Na2O.png}
-            \end{subfigure} & \hspace{3cm}
-            \begin{subfigure}{0.5\textwidth}
-                \includegraphics[width=\textwidth]{images/one_to_one/svr/K2O.png}
-            \end{subfigure}
-        \end{tabular}
-    }
-    \caption{One-to-one plots for the stacking ensemble model with the \gls{svr} as the meta-learner}
-    \label{fig:svr_one_to_one}
-\end{figure*}
diff --git a/report_thesis/src/sections/appendix/tables/Al2O3_overview.tex b/report_thesis/src/sections/appendix/tables/Al2O3_overview.tex
@@ -1,5 +1,6 @@
-\begin{table*}
+\begin{table}[!htb]
 \centering
+\caption{Overview of model types for \ce{Al2O3} oxide.}
 \begin{tabular}{llllllll}
 \toprule
 \ce{Al2O3} & Model Type & Transformer Type & PCA Type & Scaler Type & \gls{rmsecv} & Std. dev. CV & \gls{rmsep} \\
@@ -16,6 +17,5 @@
  & \texttt{random\_forest} & \texttt{power\_transformer} & \texttt{none} & \texttt{max\_abs\_scaler} & 2.302 & 2.295 & 2.111 \\
 \bottomrule
 \end{tabular}
-\caption{Overview of model types for \ce{Al2O3} oxide}
 \label{tab:Al2O3_overview}
-\end{table*}
+\end{table}
diff --git a/report_thesis/src/sections/appendix/tables/CaO_overview.tex b/report_thesis/src/sections/appendix/tables/CaO_overview.tex
@@ -1,5 +1,6 @@
-\begin{table*}
+\begin{table}[!htb]
 \centering
+\caption{Overview of model types for \ce{CaO} oxide.}
 \begin{tabular}{llllllll}
 \toprule
 \ce{CaO} & Model Type & Transformer Type & PCA Type & Scaler Type & \gls{rmsecv} & Std. dev. CV & \gls{rmsep} \\
@@ -16,6 +17,5 @@
  & \texttt{lasso} & \texttt{power\_transformer} & \texttt{none} & \texttt{min\_max\_scaler} & 1.529 & 1.514 & 1.684 \\
 \bottomrule
 \end{tabular}
-\caption{Overview of model types for \ce{CaO} oxide}
 \label{tab:CaO_overview}
-\end{table*}
+\end{table}
diff --git a/report_thesis/src/sections/appendix/tables/FeOT_overview.tex b/report_thesis/src/sections/appendix/tables/FeOT_overview.tex
@@ -1,5 +1,6 @@
-\begin{table*}
+\begin{table}[!htb]
 \centering
+\caption{Overview of model types for \ce{FeO_T} oxide.}
 \begin{tabular}{llllllll}
 \toprule
 \ce{FeO_T} & Model Type & Transformer Type & PCA Type & Scaler Type & \gls{rmsecv} & Std. dev. CV & \gls{rmsep} \\
@@ -16,6 +17,5 @@
  & \texttt{random\_forest} & \texttt{quantile\_transformer} & \texttt{none} & \texttt{norm3\_scaler} & 3.079 & 3.044 & 2.018 \\
 \bottomrule
 \end{tabular}
-\caption{Overview of model types for \ce{FeO_T} oxide}
 \label{tab:FeOT_overview}
-\end{table*}
+\end{table}