From 397b91b29683700bac101f504773ea967c11f56f Mon Sep 17 00:00:00 2001 From: Casey Kneale Date: Sat, 17 Oct 2020 14:46:25 -0400 Subject: [PATCH] updating compat --- .vscode/settings.json | 3 ++ Project.toml | 14 +++++----- README.md | 12 +++++--- docs/src/Demos/OtherPackages.md | 49 +++++++++++++++++++++++++++++++++ docs/src/index.md | 2 +- 5 files changed, 68 insertions(+), 12 deletions(-) create mode 100644 .vscode/settings.json create mode 100644 docs/src/Demos/OtherPackages.md diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..ef10df5 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "julia.environmentPath": "/home/caseykneale/Desktop/ChemometricsTools.jl" +} \ No newline at end of file diff --git a/Project.toml b/Project.toml index 65005d8..f66c356 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "ChemometricsTools" uuid = "a9718f02-dbee-5ae5-ad0e-dfbd07fa387b" authors = ["caseykneale "] -version = "0.5.11" +version = "0.5.12" [deps] CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" @@ -17,14 +17,14 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" [compat] -CSV = "0.5.14" -Combinatorics = "0.7.0, 1" -DataFrames = "0.19.4, 0.20" +CSV = "^0.5.14" +Combinatorics = "^1" +DataFrames = "0.19.4, 0.20, 0.21" Distributions = "0.21.3, 0.22" -FFTW = "1.0" -RecipesBase = "0.7.0, 0.8" +FFTW = "~1.0" +RecipesBase = "^0.7.0, 0.8" StatsBase = "0.32.0" -julia = "1" +julia = "^1" [extras] Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/README.md b/README.md index c6d1724..baa45ef 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ [![](https://img.shields.io/badge/docs-stable-blue.svg)](https://caseykneale.github.io/ChemometricsTools.jl/dev/) [![Build Status](https://travis-ci.org/caseykneale/ChemometricsTools.jl.svg?branch=master)](https://travis-ci.org/caseykneale/ChemometricsTools.jl) # ChemometricsTools.jl -This package contains a 
collection of tools to perform fundamental and advanced Chemometric analysis' in Julia. It is currently richer and more fundamental than any single free chemometrics package available in any other language. If you are uninformed as to what Chemometrics is; it could nonelegantly be described as the marriage between data science and chemistry. Traditionally it is the symbiosis of applied linear algebra/statistics which is disciplined by the physics and meaning of chemical measurements. This is somewhat orthogonal to most specializations of machine learning where "add more layers" is the modus operandi. Sometimes chemometricians also weigh the pros and cons of black box modelling and break out pure machine learning methods - so some of those techniques are in this package. +This package contains a collection of tools to perform fundamental and advanced Chemometric analysis' in Julia. It is currently richer than any other free chemometrics package available in any other language. If you are uninformed as to what Chemometrics is; it could nonelegantly be described as the marriage between data science and chemistry. Traditionally it is the symbiosis of applied linear algebra/statistics which is disciplined by the physics and meaning of chemical measurements. This is somewhat orthogonal to most specializations of machine learning where "add more layers" is the modus operandi. Sometimes chemometricians also weigh the pros and cons of black box modelling and break out pure machine learning methods - so some of those techniques are in this package. ## Tutorials/Demonstrations: - [Transforms](https://caseykneale.github.io/ChemometricsTools.jl/dev/Demos/Transforms/) @@ -54,12 +54,16 @@ methods for Tucker decomposition (HOSVD, and HOOI) have been included. Some prep This package has tools for specialized fields of analysis'. 
For instance, fractional derivatives for the electrochemists (and the adventurous), a handful of smoothing methods for spectroscopists, curve resolution (unimodal and nonnegativity constraints available) for forensics, process fault detection methods, etc. There are certainly plans for other tools for analyzing chemical data that packages in other languages have seemingly left out. Stay tuned. ## Where's the Data? -Right now I don't have rights to provide much data; but the 2002 International Diffuse Reflectance Conference Pharmaceutical NIR, iris, Tecator aka 'meat', and ball gear fault detection (NASA) dataset are [included](https://github.com/caseykneale/ChemometricsTools.jl/tree/master/data). +Please check out [ChemometricsData.jl](https://github.com/caseykneale/ChemometricsData.jl) for access to more publicly available datasets. -I'd love for a collaborator to contribute some: spectra, chromatograms, etc. Please reach out to me if you wish to collaborate/contribute. In the mean time you can load in your own datasets using the Julia ecosystem (XLSX.jl, CSV.jl, JSON.jl, MATLAB.jl, etc). +Right now the 2002 International Diffuse Reflectance Conference Pharmaceutical NIR, iris, Tecator aka 'meat', and ball gear fault detection (NASA) dataset are [included](https://github.com/caseykneale/ChemometricsTools.jl/tree/master/data) in this package. But, this will be factored out eventually into ChemometricsData.jl. + +I'd love for a collaborator to contribute some: spectra, chromatograms, etc. Please reach out to me if you wish to collaborate/contribute. In the mean time you can load in your own datasets using the full extent of Julia ecosystem (XLSX.jl, CSV.jl, JSON.jl, MATLAB.jl, LibPQ.jl, Feather.jl, Arrow.jl, etc). ## What about Time Series? Cluster modeling? -Well, I'd love to hammer in some time series methods. That was originally part of the plan. 
Then I realized [OnlineStats.jl](https://github.com/joshday/OnlineStats.jl) already has the essentials for online learning covered. Surely many are contemplating packages with time series as a focus. Similarly, if you want clustering methods, just install [Clustering.jl](https://github.com/JuliaStats/Clustering.jl). I may add a few supportive odds and ends in here (or contribute to the packages directly) but really, most of the Julia 1.0+ ecosystem is really reliable, well made, and community supported. +Well, I'd love to hammer in some time series methods. That was originally part of the plan. Then I realized [OnlineStats.jl](https://github.com/joshday/OnlineStats.jl) already has the essentials for online learning covered, and there are many efforts for actual time series ([TimeSeries.jl](https://github.com/JuliaStats/TimeSeries.jl)) modelling in the works. + +Similarly, if clustering methods are important to you, check out [Clustering.jl](https://github.com/JuliaStats/Clustering.jl). I may add a few supportive odds and ends in here (or contribute to the packages directly) but really, most of the Julia 1.0+ ecosystem is really reliable, well made, and community supported. ## ToDo: - Clean up. diff --git a/docs/src/Demos/OtherPackages.md b/docs/src/Demos/OtherPackages.md new file mode 100644 index 0000000..2c72f0b --- /dev/null +++ b/docs/src/Demos/OtherPackages.md @@ -0,0 +1,49 @@ +# Other Packages: +So you know what you're doing, you're not one of those Friday night chemometricians Brereton talked about, and you want to compare some methods available in ChemometricsTools.jl. Great! The nice thing about Julia is, packages tend to work with one another with zero effort. To demonstrate this I made a little tutorial using [Turing.jl](https://turing.ml/dev/) and [ChemometricsData.jl](https://github.com/caseykneale/ChemometricsData.jl) for a very basic incomplete analysis of some well known Bayesian regression methods. Let's get started. 
+ + +## Lets load in some data +```julia +using Turing, StatsPlots, Plots, Statistics +using DataFrames, ChemometricsData + +println( ChemometricsData.search("corn") ) +corn_data = ChemometricsData.load("Cargill_Corn") +X = Matrix(corn_data["m5_spectra.csv"]) + +xaxis = 1100:2:2498#nm + +plot( X', title = "Cargill Corn M5 Spec", xlab = "Wavelength (nm)", ylab = "Absorbance", legend = false, + xticks = (1:50:length(xaxis), xaxis[1:50:end]) ) +``` +![CornEDA](https://raw.githubusercontent.com/caseykneale/ChemometricsTools/master/images/BayesDemo/CornSpectra.png) + +Grab our property values, +```julia +Y = corn_data["property_values.csv"][!,:Moisture] +``` + +Now let's center and scale our X and Y values to keep our regression methods happy + +```julia +train, test = 1:35, 36:80 +X_train, X_test = X_processed[train,:], X_processed[test,:] +μx,σx = mean(X_train, dims = 1), std(X_train, dims = 1) +X_train = (X_train .- μx) ./ σx +X_test = (X_test .- μx) ./ σx + +Y_train, Y_test = Y[train,:], Y[test,:] +μy,σy = mean(Y_train),std(Y_train) +Y_train = (Y_train .- μy) ./ σy +Y_test = (Y_test .- μy) ./ σy +``` + + +![20folds](https://raw.githubusercontent.com/caseykneale/ChemometricsTools/master/images/CV.png) + +That's great right? but, hey that was kind of slow. Knowing what we know about ALS based models, we can do the same operation in linear time with respect to latent factors by computing the most latent variables first and only recomputing the regression coefficients. An example of this is below, + +```julia + +``` +This approach is ~5 times faster on a single core( < 2 seconds), pours through 7Gb less data, and makes 1/5th the allocations (on this dataset at least). If you wanted you could distribute the inner loop (using Distributed.jl) and see drastic speed ups! 
diff --git a/docs/src/index.md b/docs/src/index.md index 0fed777..db53102 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -11,7 +11,7 @@ Pkg.add("ChemometricsTools") ``` ## Support: -This package was written in [Julia 1.0.3](https://julialang.org/) but it runs fine on 1.1, 1.2 and later releases. That's the beauty of from scratch code with minimal dependencies. +This package was written in [Julia 1.0.3](https://julialang.org/) but it runs fine on 1.4, 1.5 and later releases. That's the beauty of from scratch code with minimal dependencies. ## Ethos Dependencies: Only base libraries (Distributions, LinearAlgebra, StatsBase, Statistics, Plots) etc will be required. This is for longevity, and to provide a fast precompilation time. As wonderful as it is that other packages exist to do some of the internal operations this one needs, we won't have to worry about a breaking change made by an external author working out the kinks in a separate package. I want this to be long-term reliable without much upkeep. I'm a busy guy working a day job; I write this to warm-up before work, and unwind afterwards.