Skip to content

Commit

Permalink
Merge branch 'main' into acknowledgements
Browse files Browse the repository at this point in the history
  • Loading branch information
chhoumann committed Jun 12, 2024
2 parents a8e9530 + 56d5244 commit 61d178f
Show file tree
Hide file tree
Showing 40 changed files with 734 additions and 225 deletions.
186 changes: 186 additions & 0 deletions baseline/eda/compare_old_vs_new_test_sets.ipynb

Large diffs are not rendered by default.

14 changes: 5 additions & 9 deletions baseline/experiment_analysis/experiment_analysis.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2378,12 +2378,12 @@
" latex_table = unique_model_types_df.to_latex(index=False, escape=False)\n",
"\n",
" with open(path, \"w\") as file:\n",
" file.write(\"\\\\begin{table*}[htbp]\\n\")\n",
" file.write(\"\\\\begin{table}[!htb]\\n\")\n",
" file.write(\"\\\\centering\\n\")\n",
" file.write(\"\\\\caption{Overview of model types for \\\\ce{\" + oxide_name + \"} oxide}.\\n\")\n",
" file.write(latex_table)\n",
" file.write(\"\\\\caption{Overview of model types for \\\\ce{\" + oxide_name + \"} oxide}\\n\")\n",
" file.write(\"\\\\label{tab:\" + oxide + \"_overview}\\n\")\n",
" file.write(\"\\\\end{table*}\\n\")\n"
" file.write(\"\\\\end{table}\\n\")\n"
]
},
{
Expand Down Expand Up @@ -4963,9 +4963,7 @@
"sns.boxplot(x='params.model_type', y='metrics.std_dev_cv', data=filtered_runs_target)\n",
"plt.title(f'{analysis_target}: Standard Deviation of Errors (CV) by Model Type')\n",
"plt.ylabel('Standard Deviation of Errors (CV)')\n",
"plt.show()\n",
"\n",
"\n"
"plt.show()\n"
]
},
{
Expand Down Expand Up @@ -5233,9 +5231,7 @@
" for p in top_configurations.patches:\n",
" width = p.get_width()\n",
" plt.text(width + 0.01, p.get_y()+0.2 + p.get_height() / 2, f'{width:.2f}', ha='left', va='center')\n",
" plt.show()\n",
"\n",
"\n"
" plt.show()\n"
]
},
{
Expand Down
2 changes: 1 addition & 1 deletion report_thesis/src/_preamble.tex
Original file line number Diff line number Diff line change
Expand Up @@ -60,5 +60,5 @@
\city{Aalborg}
\country{Denmark}
}
\received{date}
\received{13/06/2024}
\begin{document}
5 changes: 4 additions & 1 deletion report_thesis/src/glossary.tex
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
\newacronym{nasa}{NASA}{National Aeronautics and Space Administration}
\newacronym{msl}{MSL}{Mars Science Laboratory}
\newacronym{libs}{LIBS}{Laser-Induced Breakdown Spectroscopy}
\newacronym{chemcam}{ChemCam}{Chemistry and Camera}
Expand Down Expand Up @@ -57,4 +58,6 @@
\newacronym{rss}{RSS}{Residual Sum of Squares}
\newacronym{tpe}{TPE}{Tree-structured Parzen Estimator}
\newacronym{usgs}{USGS}{United States Geological Survey}
\newacronym{pyhat}{pyhat}{Python Hyperspectral Analysis Tool}
\newacronym{pyhat}{PyHAT}{Python Hyperspectral Analysis Tool}
\newacronym{jade}{JADE}{Joint Approximation Diagonalization of Eigen-matrices}
\newacronym{gui}{GUI}{Graphical User Interface}
13 changes: 11 additions & 2 deletions report_thesis/src/index.tex
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,16 @@

\newpage

\input{sections/summary.tex}

\newpage

\begin{abstract}
Abstract
This thesis advances the analysis of \gls{libs} data for predicting major oxide compositions in geological samples.
By integrating machine learning techniques and ensemble regression models, the study addresses challenges like high dimensionality, multicollinearity, and limited data availability.
Key innovations include the use of stacked generalization for improved model performance and an automated hyperparameter optimization framework.
The research contributes a comprehensive catalog of models and preprocessing techniques, and integrates findings into the \gls{pyhat} by the \gls{usgs}, enhancing its scientific capabilities.
This work lays a robust foundation for future advancements in geochemical analysis and planetary exploration using \gls{libs} data.
\end{abstract}

\maketitle
Expand All @@ -19,5 +27,6 @@ \subsubsection*{Acknowledgements:} We would like to thank our supervisors Daniel
\input{sections/proposed_approach/proposed_approach.tex}
\input{sections/methodology.tex}
\input{sections/experiments/index.tex}
\input{sections/future_work.tex}
\input{sections/pyhat_contribution.tex}
\input{sections/conclusion.tex}
\input{sections/future_work.tex}
166 changes: 35 additions & 131 deletions report_thesis/src/sections/appendix/index.tex
Original file line number Diff line number Diff line change
@@ -1,15 +1,20 @@
\section*{Appendix}
\addcontentsline{toc}{section}{Appendix}
\onecolumn
\section{Appendix}
\renewcommand{\thesection}{\Alph{section}}
\renewcommand{\thefigure}{\Alph{section}.\arabic{figure}}
\renewcommand{\thetable}{\Alph{section}.\arabic{table}}
\setcounter{figure}{0}
\setcounter{table}{0}

\subsection{Web-Based Platform for Data Partitioning Evaluation}\label{subsec:web_platform}

\begin{figure*}
\begin{figure}[!htb]
\centering
\includegraphics[width=0.7\textwidth]{images/web_platform.png}
\includegraphics[width=0.585\textwidth]{images/web_platform.png}
\caption{Web-based platform used to determine the optimal value of $p$ for the data partitioning algorithm.}
\label{fig:web_platform}
\end{figure*}
\end{figure}

\newpage
\FloatBarrier

\subsection{Cross-Validation Fold Plots for Major Oxides}\label{subsec:cv_plots}
Expand All @@ -18,40 +23,41 @@ \subsection{Cross-Validation Fold Plots for Major Oxides}\label{subsec:cv_plots}
\foreach \oxide in {SiO2, TiO2, Al2O3, FeOT, MgO, CaO, Na2O, K2O} {
\subsubsection{\oxide}

\begin{figure*}
\begin{figure}[!htb]
\centering
\includegraphics[width=\textwidth]{images/\oxide/histogram_grid_plot.png}
\caption{Histogram and \gls{kde} of \ce{\oxide} distribution in each fold. The y-axis represents the count of samples per bin, and the x-axis represents \ce{\oxide} concentration. The notation in the legend indicates the number of instances in the training/validation sets.}
\label{fig:histogram_grid_plot_\oxide}
\end{figure*}
\end{figure}

\begin{figure*}
\begin{figure}[!htb]
\centering
\includegraphics[width=\textwidth]{images/\oxide/histogram_kde_plot.png}
\caption{Combined Histogram and \gls{kde} of \ce{\oxide} distribution in each fold. The y-axis represents the count of samples per bin, and the x-axis represents \ce{\oxide} concentration. The notation in the legend indicates the number of instances in the training/validation sets.}
\label{fig:histogram_kde_plot_\oxide}
\end{figure*}
\end{figure}

\begin{figure*}
\begin{figure}[!htb]
\centering
\includegraphics[width=\textwidth]{images/\oxide/original_and_post_fold.png}
\caption{Distribution of \ce{\oxide} concentrations before and after fold assignment. The left plot shows the original distribution of \ce{\oxide}, while the right plot shows the distribution with folds assigned, color-coded to indicate the different folds.}
\label{fig:original_and_post_fold_plot_\oxide}
\end{figure*}
\end{figure}

\begin{figure*}
\begin{figure}[!htb]
\centering
\includegraphics[width=\textwidth]{images/\oxide/distribution_plot.png}
\caption{Distribution of \ce{\oxide} concentrations across cross-validation folds, training set, test set, and the entire dataset. The mean and standard deviation statistics for each partition are indicated in the figure.}
\label{fig:distribution_plot_\oxide}
\end{figure*}
\end{figure}
}

\FloatBarrier

\subsection{Initial Experiment: Model Hyperparameters}\label{subsec:initial_experiment_hyperparameters}
\begin{table*}
\begin{table}[!htb]
\centering
\caption{Explicitly set hyperparameters for the \gls{pls}, \gls{svr}, ridge, \gls{lasso}, \gls{enet}, \gls{rf}, and \gls{etr} models. When not explicitly set, the default hyperparameters provided by the libraries listed in Section~\ref{sec:experimental_setup} are used.}
\begin{tabular}{@{}llp{0.5\textwidth}@{}}
\toprule
\textbf{Model} & \textbf{Hyperparameter} & \textbf{Value} \\
Expand Down Expand Up @@ -101,14 +107,14 @@ \subsection{Initial Experiment: Model Hyperparameters}\label{subsec:initial_expe
& \texttt{random\_state} & 42 \\
\midrule
\end{tabular}
\caption{Explicitly set hyperparameters for the \gls{pls}, \gls{svr}, ridge, \gls{lasso}, \gls{enet}, \gls{rf}, and \gls{etr} models. When not explicitly set, the default hyperparameters provided by the libraries listed in Section~\ref{sec:experimental_setup} are used.}
\label{tab:combined_hyperparameters}
\end{table*}
\end{table}

\FloatBarrier

\begin{table*}
\begin{table}[!htb]
\centering
\caption{Explicitly set hyperparameters for the \gls{gbr} and \gls{xgboost} models. When not explicitly set, the default hyperparameters provided by the libraries listed in Section~\ref{sec:experimental_setup} are used. The \gls{ngboost} model does not have any explicitly set hyperparameters.}
\begin{tabular}{@{}llp{0.5\textwidth}@{}}
\toprule
\textbf{Model} & \textbf{Hyperparameter} & \textbf{Value} \\
Expand Down Expand Up @@ -148,13 +154,12 @@ \subsection{Initial Experiment: Model Hyperparameters}\label{subsec:initial_expe
& \texttt{eval\_metric} & rmse \\
\bottomrule
\end{tabular}
\caption{Explicitly set hyperparameters for the \gls{gbr} and \gls{xgboost} models. When not explicitly set, the default hyperparameters provided by the libraries listed in Section~\ref{sec:experimental_setup} are used. The \gls{ngboost} model does not have any explicitly set hyperparameters.}
\label{tab:combined_hyperparameters}
\end{table*}
\end{table}

\FloatBarrier

\begin{table*}
\begin{table}[!htb]
\centering
\begin{tabular}{lll}
\toprule
Expand All @@ -172,12 +177,15 @@ \subsection{Initial Experiment: Model Hyperparameters}\label{subsec:initial_expe
\multicolumn{3}{l}{\textbf{Learning Rate:} 0.001} \\
\bottomrule
\end{tabular}
\caption{Summary of the Neural Network Architecture}
\caption{Summary of the \gls{ann} architecture.}
\label{tab:nn_architecture}
\end{table*}
\end{table}

\FloatBarrier

\begin{table*}
\begin{table}[!htb]
\centering
\caption{Summary of the \gls{cnn} architecture.}
\begin{tabular}{lll}
\toprule
\textbf{Layer} & \textbf{Output Shape} & \textbf{Hyperparameter} \\ \midrule
Expand Down Expand Up @@ -208,12 +216,13 @@ \subsection{Initial Experiment: Model Hyperparameters}\label{subsec:initial_expe
\multicolumn{3}{l}{\textbf{Learning Rate:} 0.001} \\
\bottomrule
\end{tabular}
\caption{Summary of the Convolutional Neural Network Architecture}
\label{tab:cnn_architecture}
\end{table*}
\end{table}

\FloatBarrier

\clearpage

\subsection{Overview of best-performing model configurations}\label{subsec:best_model_configurations}
\input{sections/appendix/tables/SiO2_overview.tex}
\input{sections/appendix/tables/TiO2_overview.tex}
Expand All @@ -223,108 +232,3 @@ \subsection{Overview of best performing model configurations}\label{subsec:best_
\input{sections/appendix/tables/CaO_overview.tex}
\input{sections/appendix/tables/Na2O_overview.tex}
\input{sections/appendix/tables/K2O_overview.tex}

\FloatBarrier

\subsection{Stacking Ensemble 1:1 Plots}\label{subsec:1_1_plots}
\begin{figure*}
\centering
\resizebox{0.75\textwidth}{!}{
\begin{tabular}{cc}
\begin{subfigure}{0.5\textwidth}
\includegraphics[width=\textwidth]{images/one_to_one/elasticnet/SiO2.png}
\end{subfigure} & \hspace{3cm}
\begin{subfigure}{0.5\textwidth}
\includegraphics[width=\textwidth]{images/one_to_one/elasticnet/TiO2.png}
\end{subfigure} \\
\begin{subfigure}{0.5\textwidth}
\includegraphics[width=\textwidth]{images/one_to_one/elasticnet/Al2O3.png}
\end{subfigure} & \hspace{3cm}
\begin{subfigure}{0.5\textwidth}
\includegraphics[width=\textwidth]{images/one_to_one/elasticnet/FeOT.png}
\end{subfigure} \\
\begin{subfigure}{0.5\textwidth}
\includegraphics[width=\textwidth]{images/one_to_one/elasticnet/MgO.png}
\end{subfigure} & \hspace{3cm}
\begin{subfigure}{0.5\textwidth}
\includegraphics[width=\textwidth]{images/one_to_one/elasticnet/CaO.png}
\end{subfigure} \\
\begin{subfigure}{0.5\textwidth}
\includegraphics[width=\textwidth]{images/one_to_one/elasticnet/Na2O.png}
\end{subfigure} & \hspace{3cm}
\begin{subfigure}{0.5\textwidth}
\includegraphics[width=\textwidth]{images/one_to_one/elasticnet/K2O.png}
\end{subfigure}
\end{tabular}
}
\caption{One-to-one plots for the stacking ensemble model with the \gls{enet} as the meta-learner with $\alpha = 1$}
\label{fig:elasticnet_one_to_one}
\end{figure*}

\begin{figure*}
\centering
\resizebox{0.75\textwidth}{!}{
\begin{tabular}{cc}
\begin{subfigure}{0.5\textwidth}
\includegraphics[width=\textwidth]{images/one_to_one/enetalpha01/SiO2.png}
\end{subfigure} & \hspace{3cm}
\begin{subfigure}{0.5\textwidth}
\includegraphics[width=\textwidth]{images/one_to_one/enetalpha01/TiO2.png}
\end{subfigure} \\
\begin{subfigure}{0.5\textwidth}
\includegraphics[width=\textwidth]{images/one_to_one/enetalpha01/Al2O3.png}
\end{subfigure} & \hspace{3cm}
\begin{subfigure}{0.5\textwidth}
\includegraphics[width=\textwidth]{images/one_to_one/enetalpha01/FeOT.png}
\end{subfigure} \\
\begin{subfigure}{0.5\textwidth}
\includegraphics[width=\textwidth]{images/one_to_one/enetalpha01/MgO.png}
\end{subfigure} & \hspace{3cm}
\begin{subfigure}{0.5\textwidth}
\includegraphics[width=\textwidth]{images/one_to_one/enetalpha01/CaO.png}
\end{subfigure} \\
\begin{subfigure}{0.5\textwidth}
\includegraphics[width=\textwidth]{images/one_to_one/enetalpha01/Na2O.png}
\end{subfigure} & \hspace{3cm}
\begin{subfigure}{0.5\textwidth}
\includegraphics[width=\textwidth]{images/one_to_one/enetalpha01/K2O.png}
\end{subfigure}
\end{tabular}
}
\caption{One-to-one plots for the stacking ensemble model with the \gls{enet} as the meta-learner with $\alpha = 0.1$.}
\label{fig:enetalpha01_one_to_one}
\end{figure*}

\begin{figure*}
\centering
\resizebox{0.75\textwidth}{!}{
\begin{tabular}{cc}
\begin{subfigure}{0.5\textwidth}
\includegraphics[width=\textwidth]{images/one_to_one/svr/SiO2.png}
\end{subfigure} & \hspace{3cm}
\begin{subfigure}{0.5\textwidth}
\includegraphics[width=\textwidth]{images/one_to_one/svr/TiO2.png}
\end{subfigure} \\
\begin{subfigure}{0.5\textwidth}
\includegraphics[width=\textwidth]{images/one_to_one/svr/Al2O3.png}
\end{subfigure} & \hspace{3cm}
\begin{subfigure}{0.5\textwidth}
\includegraphics[width=\textwidth]{images/one_to_one/svr/FeOT.png}
\end{subfigure} \\
\begin{subfigure}{0.5\textwidth}
\includegraphics[width=\textwidth]{images/one_to_one/svr/MgO.png}
\end{subfigure} & \hspace{3cm}
\begin{subfigure}{0.5\textwidth}
\includegraphics[width=\textwidth]{images/one_to_one/svr/CaO.png}
\end{subfigure} \\
\begin{subfigure}{0.5\textwidth}
\includegraphics[width=\textwidth]{images/one_to_one/svr/Na2O.png}
\end{subfigure} & \hspace{3cm}
\begin{subfigure}{0.5\textwidth}
\includegraphics[width=\textwidth]{images/one_to_one/svr/K2O.png}
\end{subfigure}
\end{tabular}
}
\caption{One-to-one plots for the stacking ensemble model with the \gls{svr} as the meta-learner}
\label{fig:svr_one_to_one}
\end{figure*}
6 changes: 3 additions & 3 deletions report_thesis/src/sections/appendix/tables/Al2O3_overview.tex
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
\begin{table*}
\begin{table}[!htb]
\centering
\caption{Overview of model types for \ce{Al2O3} oxide.}
\begin{tabular}{llllllll}
\toprule
\ce{Al2O3} & Model Type & Transformer Type & PCA Type & Scaler Type & \gls{rmsecv} & Std. dev. CV & \gls{rmsep} \\
Expand All @@ -16,6 +17,5 @@
& \texttt{random\_forest} & \texttt{power\_transformer} & \texttt{none} & \texttt{max\_abs\_scaler} & 2.302 & 2.295 & 2.111 \\
\bottomrule
\end{tabular}
\caption{Overview of model types for \ce{Al2O3} oxide}
\label{tab:Al2O3_overview}
\end{table*}
\end{table}
6 changes: 3 additions & 3 deletions report_thesis/src/sections/appendix/tables/CaO_overview.tex
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
\begin{table*}
\begin{table}[!htb]
\centering
\caption{Overview of model types for \ce{CaO} oxide.}
\begin{tabular}{llllllll}
\toprule
\ce{CaO} & Model Type & Transformer Type & PCA Type & Scaler Type & \gls{rmsecv} & Std. dev. CV & \gls{rmsep} \\
Expand All @@ -16,6 +17,5 @@
& \texttt{lasso} & \texttt{power\_transformer} & \texttt{none} & \texttt{min\_max\_scaler} & 1.529 & 1.514 & 1.684 \\
\bottomrule
\end{tabular}
\caption{Overview of model types for \ce{CaO} oxide}
\label{tab:CaO_overview}
\end{table*}
\end{table}
6 changes: 3 additions & 3 deletions report_thesis/src/sections/appendix/tables/FeOT_overview.tex
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
\begin{table*}
\begin{table}[!htb]
\centering
\caption{Overview of model types for \ce{FeO_T} oxide.}
\begin{tabular}{llllllll}
\toprule
\ce{FeO_T} & Model Type & Transformer Type & PCA Type & Scaler Type & \gls{rmsecv} & Std. dev. CV & \gls{rmsep} \\
Expand All @@ -16,6 +17,5 @@
& \texttt{random\_forest} & \texttt{quantile\_transformer} & \texttt{none} & \texttt{norm3\_scaler} & 3.079 & 3.044 & 2.018 \\
\bottomrule
\end{tabular}
\caption{Overview of model types for \ce{FeO_T} oxide}
\label{tab:FeOT_overview}
\end{table*}
\end{table}
Loading

0 comments on commit 61d178f

Please sign in to comment.