From a8036853c389a50ed31690ed49331c78a54bc77a Mon Sep 17 00:00:00 2001 From: Joseph Abbott Date: Tue, 23 May 2023 13:49:51 +0200 Subject: [PATCH 01/12] stub script sample selection with rascaline --- examples/sample_selection/sample_selection_rascaline.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 examples/sample_selection/sample_selection_rascaline.py diff --git a/examples/sample_selection/sample_selection_rascaline.py b/examples/sample_selection/sample_selection_rascaline.py new file mode 100644 index 00000000..e69de29b From 7c681e1f86e968eeac6dc8e7ff561b503755c971 Mon Sep 17 00:00:00 2001 From: hannatuerk Date: Tue, 23 May 2023 15:11:26 +0200 Subject: [PATCH 02/12] rascaline soap powerspectrum - first part --- .../sample_selection_rascaline.py | 66 +++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/examples/sample_selection/sample_selection_rascaline.py b/examples/sample_selection/sample_selection_rascaline.py index e69de29b..f89a6c72 100644 --- a/examples/sample_selection/sample_selection_rascaline.py +++ b/examples/sample_selection/sample_selection_rascaline.py @@ -0,0 +1,66 @@ + +""" +Sample Selection with FPS and CUR (rascaline and equisolve) +============================================= + +.. start-body + +In this tutorial we generate descriptors using rascaline, then select a subset +of structures using both the farthest-point sampling (FPS) and CUR algorithms +with equisolve. +""" +# %% +# First, import all the necessary packages +import ase.io # we need to add ASE to the requirements.txt +import equistore +import equisolve +import rascaline + + +# %% +# Load molecular data +# ------------------- +# +# Load 100 example BTO structures from file, reading them using +# `ASE `_. + +# Load a subset of structures of the example dataset +n_frames = 250 +frames = ase.io.read("./dataset/input-fps.xyz", f":{n_frames}", format="extxyz") + + +# %% +# Compute SOAP descriptor using librascal +# --------------------------------------- +# +# First, define the rascaline hyperparameters used to compute SOAP. + +# Define SOAP hyperparameters in rascaline format +soap_hypers = { + "cutoff": 3.0, # Angstrom + "max_radial": 8, # Exclusive + "max_angular": 5, # Inclusive + "atomic_gaussian_width": 0.3, + "radial_basis": {"Gto": {}}, + "cutoff_function": {"ShiftedCosine": {"width": 0.5}}, + "center_atom_weight": 1.0, +} + + +# Generate a SOAP spherical expansion + +calculator = rascaline.SoapPowerSpectrum(**soap_hypers) +soap = calculator.compute(frames) + +print(soap) +# %% +# Perform structure (i.e. sample) selection +# ----------------------------------------- +# +# Using FPS and CUR algorithms implemented in equisolve using scikit-matter, select a subset of +# the structures. equisolve sample selection wraps scikit-matter sample selection algorithms +# but handles them in the equistore tensor format. +# +# For more info on the functions: `skmatter +# `_ + From cf22c287cd68cab6ae10be9ede30d06cfdff3c0b Mon Sep 17 00:00:00 2001 From: hannatuerk Date: Wed, 24 May 2023 14:41:11 +0200 Subject: [PATCH 03/12] fps with equisolve --- .../sample_selection/sample_selection_rascaline.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/examples/sample_selection/sample_selection_rascaline.py b/examples/sample_selection/sample_selection_rascaline.py index f89a6c72..5b03ed6c 100644 --- a/examples/sample_selection/sample_selection_rascaline.py +++ b/examples/sample_selection/sample_selection_rascaline.py @@ -16,6 +16,7 @@ import equisolve import rascaline +from equisolve.numpy import sample_selection # %% # Load molecular data @@ -64,3 +65,15 @@ # For more info on the functions: `skmatter # `_ +# Define the number of structures to select using FPS/CUR +n_structures = 5 + +#FPS sample selection +selector=sample_selection.FPS(n_to_select=n_structures, initialize='random').fit(soap) + +struct_fps_idxs=selector.support[0].samples['structure'] #samples selected in of first block? + +print("Structure indices obtained with FPS ", struct_fps_idxs) + + + From e87c3c5842dfc97fbef223a289f06ce61412677e Mon Sep 17 00:00:00 2001 From: Joseph Abbott Date: Wed, 14 Jun 2023 19:12:21 +0200 Subject: [PATCH 04/12] rough draft --- .../sample_selection_rascaline.py | 136 ++++++++++++++---- 1 file changed, 112 insertions(+), 24 deletions(-) diff --git a/examples/sample_selection/sample_selection_rascaline.py b/examples/sample_selection/sample_selection_rascaline.py index 5b03ed6c..44483a28 100644 --- a/examples/sample_selection/sample_selection_rascaline.py +++ b/examples/sample_selection/sample_selection_rascaline.py @@ -1,19 +1,31 @@ - """ Sample Selection with FPS and CUR (rascaline and equisolve) ============================================= .. start-body -In this tutorial we generate descriptors using rascaline, then select a subset -of structures using both the farthest-point sampling (FPS) and CUR algorithms -with equisolve. +TODO: +- Fill out context + motivation here +- Does structure selection for blocks of different chemical species make sense? +- Chemiscope +- Plot a histogram (or just report numbers) of the chemical species selected +when using a single block descriptor vs the equal number of samples for each +species when using a multi-block descriptor. + +# Motivations + +1. Selecting reference atomic envs for kernel construction +2. Selecting diverse samples for training: + * Diverse structures, i.e. configurations from an MD trajectory + to compute with high fidelity DFT and used as the training data. + * Atomic environments - i.e. for selecting atomic environments used as the + training data. Used when the target property is locally decomposed (i.e.), but + not appropriate for globally defined properties (i.e. total energies). """ # %% # First, import all the necessary packages import ase.io # we need to add ASE to the requirements.txt import equistore -import equisolve import rascaline from equisolve.numpy import sample_selection @@ -31,10 +43,11 @@ # %% -# Compute SOAP descriptor using librascal +# Compute SOAP descriptor using rascaline # --------------------------------------- # # First, define the rascaline hyperparameters used to compute SOAP. +# Then the SOAP descriptor can be computed using rascaline. # Define SOAP hyperparameters in rascaline format soap_hypers = { @@ -47,33 +60,108 @@ "center_atom_weight": 1.0, } - # Generate a SOAP spherical expansion - calculator = rascaline.SoapPowerSpectrum(**soap_hypers) -soap = calculator.compute(frames) +soap_atomic = calculator.compute(frames) -print(soap) # %% -# Perform structure (i.e. sample) selection -# ----------------------------------------- +# Perform atomic environment (i.e. sample) selection +# -------------------------------------------------- + +# Using FPS and CUR algorithms, we can perform selection of atomic environments. +# These are implemented in equisolve, which provides a wrapper around +# scikit-matter to allow for interfacing with data stored in the equistore +# format. # -# Using FPS and CUR algorithms implemented in equisolve using scikit-matter, select a subset of -# the structures. equisolve sample selection wraps scikit-matter sample selection algorithms -# but handles them in the equistore tensor format. +# Suppose we want to select the 10 most diverse environments for each chemical +# species. # -# For more info on the functions: `skmatter -# `_ - -# Define the number of structures to select using FPS/CUR -n_structures = 5 +# First, we can use the `keys_to_properties` operation in equistore to move the +# neighbour species indices to the properties of the TensorBlocks. The resulting +# descriptor will be a TensorMap comprised of three blocks, one for each +# chemical species, where the chemical species indices are solely present in the +# keys. + +# Move the neighbour species indices from keys to properties. +soap_atomic = soap_atomic.keys_to_properties( + keys_to_move=["species_neighbor_1", "species_neighbor_2"] +) +print(soap_atomic) +print(soap_atomic.block(0)) + +# %% Now let's perform sample selection on the atomic environments. We want to +# select 10 atomic environments for each chemical species. + +# Define the number of structures *per block* to select using FPS +n_envs = 10 + +# FPS sample selection +selector_atomic_fps = sample_selection.FPS(n_to_select=n_envs, initialize="random").fit( + soap_atomic +) + +# Print the selected envs for each block +print("atomic envs selected with FPS:\n") +for key, block in selector_atomic_fps.support: + print("species_center:", key, "(struct_idx, atom_idx)", block.samples) -#FPS sample selection -selector=sample_selection.FPS(n_to_select=n_structures, initialize='random').fit(soap) +# %% +# Perform structure (i.e. sample) selection with FPS/CUR +# --------------------------------------------------------- +# +# Instead of atomic environments, one can also select diverse structures. We can +# use the `sum_over_samples` operation in equistore to define features in the +# structural basis instead of the atomic basis. This is done by summing over the +# atomic environments, labeled by the 'center' index in the samples of the +# TensorMap. +# +# Alternatively, one could use the `mean_over_samples` operation, depending on +# the specific inhomogeneity of the size of the structures in the training set. -struct_fps_idxs=selector.support[0].samples['structure'] #samples selected in of first block? +# Sum over atomic environments. +soap_struct = equistore.sum_over_samples(soap_atomic, "center") +print(soap_struct) +print(soap_struct.block(0)) -print("Structure indices obtained with FPS ", struct_fps_idxs) +# Define the number of structures to select *per block* using FPS +n_structures = 5 +# FPS structure selection +selector_struct_fps = sample_selection.FPS( + n_to_select=n_structures, initialize="random" +).fit(soap_struct) +print("structures selected with FPS:", selector_struct_fps.support.block(0).samples) +# FPS structure selection +selector_struct_cur = sample_selection.FPS(n_to_select=n_structures).fit(soap_struct) +print("structures selected with CUR:", selector_struct_cur.support.block(0).samples) +# %% +# Selecting from a combined pool of atomic environments +# ----------------------------------------------------- +# +# One can also select from a combined pool of atomic environments and +# structures, instead of selecting an equal number of atomic environments for +# each chemical species. In this case, we can move all the keys to properties +# such that our descriptor is a TensorMap consisting of a single block. Upon +# sample selection, the most diverse atomic environments will be selected, +# regardless of their chemical species. + +# Using the original SOAP descriptor, move all keys to properties. +soap_atomic_single_block = soap_atomic.keys_to_properties( + keys_to_move=["species_center", "species_neighbor_1", "species_neighbor_2"] +) +print(soap_atomic_single_block) +print(soap_atomic_single_block.block(0)) # There is only one block now! + +# Define the number of structures to select using FPS +n_envs = 10 + +# FPS sample selection +selector_atomic_fps = sample_selection.FPS(n_to_select=n_envs, initialize="random").fit( + soap_atomic_single_block +) +print( + "atomic envs selected with FPS (struct_idx, atom_idx):\n", + selector_atomic_fps.support.block(0).samples, +) From 22ad4c9a050d070e22866dc932af01a980b808a7 Mon Sep 17 00:00:00 2001 From: HannaTuerk Date: Tue, 17 Oct 2023 13:57:55 +0200 Subject: [PATCH 05/12] update to new name of metatensor --- .../sample_selection_rascaline.py | 21 ++++++++++++------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/examples/sample_selection/sample_selection_rascaline.py b/examples/sample_selection/sample_selection_rascaline.py index 44483a28..0b8903fb 100644 --- a/examples/sample_selection/sample_selection_rascaline.py +++ b/examples/sample_selection/sample_selection_rascaline.py @@ -25,7 +25,7 @@ # %% # First, import all the necessary packages import ase.io # we need to add ASE to the requirements.txt -import equistore +import metatensor import rascaline from equisolve.numpy import sample_selection @@ -70,13 +70,13 @@ # Using FPS and CUR algorithms, we can perform selection of atomic environments. # These are implemented in equisolve, which provides a wrapper around -# scikit-matter to allow for interfacing with data stored in the equistore +# scikit-matter to allow for interfacing with data stored in the metatensor # format. # # Suppose we want to select the 10 most diverse environments for each chemical # species. # -# First, we can use the `keys_to_properties` operation in equistore to move the +# First, we can use the `keys_to_properties` operation in metatensor to move the # neighbour species indices to the properties of the TensorBlocks. The resulting # descriptor will be a TensorMap comprised of three blocks, one for each # chemical species, where the chemical species indices are solely present in the @@ -100,9 +100,11 @@ soap_atomic ) +#selector=sample_selection.FPS(n_to_select=n_envs, initialize='random').fit(soap_atomic) + # Print the selected envs for each block print("atomic envs selected with FPS:\n") -for key, block in selector_atomic_fps.support: +for key, block in selector_atomic_fps.support.items(): print("species_center:", key, "(struct_idx, atom_idx)", block.samples) # %% @@ -110,7 +112,7 @@ # --------------------------------------------------------- # # Instead of atomic environments, one can also select diverse structures. We can -# use the `sum_over_samples` operation in equistore to define features in the +# use the `sum_over_samples` operation in metatensor to define features in the # structural basis instead of the atomic basis. This is done by summing over the # atomic environments, labeled by the 'center' index in the samples of the # TensorMap. @@ -119,7 +121,7 @@ # the specific inhomogeneity of the size of the structures in the training set. # Sum over atomic environments. -soap_struct = equistore.sum_over_samples(soap_atomic, "center") +soap_struct = metatensor.sum_over_samples(soap_atomic, "center") print(soap_struct) print(soap_struct.block(0)) @@ -147,9 +149,12 @@ # sample selection, the most diverse atomic environments will be selected, # regardless of their chemical species. +print('keys',soap_atomic.keys) +print('blocks',soap_atomic[0]) + # Using the original SOAP descriptor, move all keys to properties. -soap_atomic_single_block = soap_atomic.keys_to_properties( - keys_to_move=["species_center", "species_neighbor_1", "species_neighbor_2"] +soap_atomic_single_block = soap_atomic.keys_to_samples( + keys_to_move=["species_center"]#, "species_neighbor_1", "species_neighbor_2"] ) print(soap_atomic_single_block) print(soap_atomic_single_block.block(0)) # There is only one block now! From eef9716b3850d8cbb43eff115eeb6a2d32c6bc86 Mon Sep 17 00:00:00 2001 From: HannaTuerk Date: Tue, 17 Oct 2023 14:54:36 +0200 Subject: [PATCH 06/12] updated documentation and output statements --- .../sample_selection_rascaline.py | 23 +++++++++---------- 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/examples/sample_selection/sample_selection_rascaline.py b/examples/sample_selection/sample_selection_rascaline.py index 0b8903fb..8f40c142 100644 --- a/examples/sample_selection/sample_selection_rascaline.py +++ b/examples/sample_selection/sample_selection_rascaline.py @@ -51,7 +51,7 @@ # Define SOAP hyperparameters in rascaline format soap_hypers = { - "cutoff": 3.0, # Angstrom + "cutoff": 5.0, # Angstrom "max_radial": 8, # Exclusive "max_angular": 5, # Inclusive "atomic_gaussian_width": 0.3, @@ -100,12 +100,10 @@ soap_atomic ) -#selector=sample_selection.FPS(n_to_select=n_envs, initialize='random').fit(soap_atomic) - # Print the selected envs for each block print("atomic envs selected with FPS:\n") for key, block in selector_atomic_fps.support.items(): - print("species_center:", key, "(struct_idx, atom_idx)", block.samples) + print("species_center:", key, "\n(struct_idx, atom_idx)\n", block.samples.values) # %% # Perform structure (i.e. sample) selection with FPS/CUR @@ -132,11 +130,11 @@ selector_struct_fps = sample_selection.FPS( n_to_select=n_structures, initialize="random" ).fit(soap_struct) -print("structures selected with FPS:", selector_struct_fps.support.block(0).samples) +print("structures selected with FPS:\n", selector_struct_fps.support.block(0).samples.values) -# FPS structure selection -selector_struct_cur = sample_selection.FPS(n_to_select=n_structures).fit(soap_struct) -print("structures selected with CUR:", selector_struct_cur.support.block(0).samples) +# CUR structure selection +selector_struct_cur = sample_selection.CUR(n_to_select=n_structures).fit(soap_struct) +print("structures selected with CUR:\n", selector_struct_cur.support.block(0).samples.values) # %% # Selecting from a combined pool of atomic environments @@ -144,17 +142,18 @@ # # One can also select from a combined pool of atomic environments and # structures, instead of selecting an equal number of atomic environments for -# each chemical species. In this case, we can move all the keys to properties +# each chemical species. In this case, we can move the 'species_center' key to samples # such that our descriptor is a TensorMap consisting of a single block. Upon # sample selection, the most diverse atomic environments will be selected, # regardless of their chemical species. print('keys',soap_atomic.keys) print('blocks',soap_atomic[0]) +print('samples in first block',soap_atomic[0].samples) # Using the original SOAP descriptor, move all keys to properties. soap_atomic_single_block = soap_atomic.keys_to_samples( - keys_to_move=["species_center"]#, "species_neighbor_1", "species_neighbor_2"] + keys_to_move=["species_center"] ) print(soap_atomic_single_block) print(soap_atomic_single_block.block(0)) # There is only one block now! @@ -167,6 +166,6 @@ soap_atomic_single_block ) print( - "atomic envs selected with FPS (struct_idx, atom_idx):\n", - selector_atomic_fps.support.block(0).samples, + "atomic envs selected with FPS: \n (struct_idx, atom_idx, species_center) \n", + selector_atomic_fps.support.block(0).samples.values, ) From 3cb291d2dc4e6796c33534e22fb324e8f05e33b4 Mon Sep 17 00:00:00 2001 From: Hanna Tuerk Date: Tue, 26 Mar 2024 15:03:17 +0100 Subject: [PATCH 07/12] Sample selection updated to equisolve --- examples/sample-selection/sample-selection.py | 394 ++++++++++++++++++ 1 file changed, 394 insertions(+) create mode 100644 examples/sample-selection/sample-selection.py diff --git a/examples/sample-selection/sample-selection.py b/examples/sample-selection/sample-selection.py new file mode 100644 index 00000000..50d6f24a --- /dev/null +++ b/examples/sample-selection/sample-selection.py @@ -0,0 +1,394 @@ +""" +Sample and Feature Selection with FPS and CUR +============================================= + +.. start-body + +In this tutorial we generate descriptors using rascaline, then select a subset +of structures using both the farthest-point sampling (FPS) and CUR algorithms +implemented in scikit-matter. Finally, we also generate a selection of +the most important features using the same techniques. + +First, import all the necessary packages +""" + +# %% + +import ase.io +import chemiscope +import numpy as np +from matplotlib import pyplot as plt +from metatensor import sum_over_samples +import metatensor +from rascaline import SoapPowerSpectrum +from sklearn.decomposition import PCA + +from equisolve.numpy import sample_selection, feature_selection + + +# %% +# Load molecular data +# ------------------- +# +# Load 500 example BTO structures from file, reading them using +# `ASE `_. + +# Load a subset of :download:`structures ` of the example dataset +n_frames = 500 +frames = ase.io.read("input-fps.xyz", f":{n_frames}", format="extxyz") + +# %% +# Compute SOAP descriptors using rascaline +# ---------------------------------------- +# +# First, define the rascaline hyperparameters used to compute SOAP. + + +# rascaline hyperparameters +hypers = { + "cutoff": 6.0, + "max_radial": 8, + "max_angular": 6, + "atomic_gaussian_width": 0.3, + "cutoff_function": {"ShiftedCosine": {"width": 0.5}}, + "radial_basis": {"Gto": {"accuracy": 1e-6}}, + "radial_scaling": {"Willatt2018": {"exponent": 4, "rate": 1, "scale": 3.5}}, + "center_atom_weight": 1.0, +} + +# Generate a SOAP power spectrum +calculator = SoapPowerSpectrum(**hypers) +rho2i = calculator.compute(frames) + + + +# Makes a dense block +atom_soap = rho2i.keys_to_properties( + ["species_neighbor_1", "species_neighbor_2"] +) + +atom_soap_single_block = atom_soap.keys_to_samples( + keys_to_move=["species_center"] +) + +#print(atom_soap_single_block) +#print(atom_soap_single_block.block(0)) # There is only one block now! + +# Sum over atomic centers to compute structure features +struct_soap = sum_over_samples(atom_soap_single_block, sample_names=["center", "species_center"]) + + +print("atom feature descriptor shape:", atom_soap.block(0).values.shape) +print("atom feature descriptor (all in one block) shape:", atom_soap_single_block.block(0).values.shape) +print("structure feature descriptor shape:", struct_soap.block(0).values.shape) + + +# %% +# Perform atomic environment (i.e. sample) selection +# ----------------------------------------- +# +# Using FPS and CUR algorithms, we can perform selection of atomic environments. +# These are implemented in equisolve, which provides a wrapper around +# scikit-matter to allow for interfacing with data stored in the metatensor +# format. +# +# Suppose we want to select the 10 most diverse environments for each chemical +# species. +# +# First, we can use the `keys_to_properties` operation in metatensor to move the +# neighbour species indices to the properties of the TensorBlocks. The resulting +# descriptor will be a TensorMap comprised of three blocks, one for each +# chemical species, where the chemical species indices are solely present in the +# keys. + + +print('----Atomic environment selection-----') +# Define the number of structures to select using FPS/CUR +n_envs = 25 + +#atom_soap = atom_soap.keys_to_properties( +# keys_to_move=["species_neighbor_1", "species_neighbor_2"] +#) +print(atom_soap) +print(atom_soap.block(0)) + +# %% Now let's perform sample selection on the atomic environments. We want to +# select 10 atomic environments for each chemical species. + +# Define the number of structures *per block* to select using FPS +n_envs = 10 + +# FPS sample selection +selector_atomic_fps = sample_selection.FPS(n_to_select=n_envs, initialize="random").fit( + atom_soap +) + +# Print the selected envs for each block +print("atomic envs selected with FPS:\n") +for key, block in selector_atomic_fps.support.items(): + print("species_center:", key, "\n(struct_idx, atom_idx)\n", block.samples.values) + +#selector_atomic_cur = sample_selection.CUR(n_to_select=n_envs).fit(atom_soap) +## Print the selected envs for each block +#print("atomic envs selected with CUR:\n") +#for key, block in selector_atomic_cur.support.items(): +# print("species_center:", key, "\n(struct_idx, atom_idx)\n", block.samples.values) + + +# %% +# Selecting from a combined pool of atomic environments +# ----------------------------------------------------- +# +# One can also select from a combined pool of atomic environments and +# structures, instead of selecting an equal number of atomic environments for +# each chemical species. In this case, we can move the 'species_center' key to samples +# such that our descriptor is a TensorMap consisting of a single block. Upon +# sample selection, the most diverse atomic environments will be selected, +# regardless of their chemical species. +print('----All atomic environment selection-----') + +print('keys',atom_soap.keys) +print('blocks',atom_soap[0]) +print('samples in first block',atom_soap[0].samples) + +# Using the original SOAP descriptor, move all keys to properties. + + +# Define the number of structures to select using FPS +n_envs = 10 + +# FPS sample selection +selector_atomic_fps = sample_selection.FPS(n_to_select=n_envs, initialize="random").fit( + atom_soap_single_block +) +print( + "atomic envs selected with FPS: \n (struct_idx, atom_idx, species_center) \n", + selector_atomic_fps.support.block(0).samples.values, +) + + + +# %% +# Perform structure (i.e. sample) selection with FPS/CUR +# --------------------------------------------------------- +# +# Instead of atomic environments, one can also select diverse structures. We can +# use the `sum_over_samples` operation in metatensor to define features in the +# structural basis instead of the atomic basis. This is done by summing over the +# atomic environments, labeled by the 'center' index in the samples of the +# TensorMap. +# +# Alternatively, one could use the `mean_over_samples` operation, depending on +# the specific inhomogeneity of the size of the structures in the training set. + +## Sum over atomic environments. #TODO MEAN? +#struct_soap = sum_over_samples(atom_soap, "center") +#print(struct_soap) +#print(struct_soap.block(0)) +print('----Structure selection-----') + +# Define the number of structures to select *per block* using FPS +n_structures = 10 + +# FPS structure selection +selector_struct_fps = sample_selection.FPS( + n_to_select=n_structures, initialize="random" +).fit(struct_soap) +struct_fps_idxs = selector_struct_fps.support.block(0).samples.values.flatten() + +print("structures selected with FPS:\n", struct_fps_idxs) +#print("Structure indices obtained with FPS ", struct_fps_idxs) + +# CUR structure selection +selector_struct_cur = sample_selection.CUR(n_to_select=n_structures).fit(struct_soap) +struct_cur_idxs = selector_struct_cur.support.block(0).samples.values.flatten() +print("structures selected with CUR:\n", struct_cur_idxs) +#print("Structure indices obtained with CUR ", struct_cur_idxs) + + +#### FPS sample selection +###struct_fps = sample_selection.FPS(n_to_select=n_structures, initialize="random").fit( +### struct_soap +###) +###struct_fps_idxs = struct_fps.selected_idx_ +### +#### CUR sample selection +###struct_cur = sample_selection.CUR(n_to_select=n_structures).fit(struct_soap) +###struct_cur_idxs = struct_cur.selected_idx_ +### +###print("Structure indices obtained with FPS ", struct_fps_idxs) +###print("Structure indices obtained with CUR ", struct_cur_idxs) +### + +# Slice structure descriptor along axis 0 to contain only the selected structures +struct_soap_fps = struct_soap.block(0).values[struct_fps_idxs,:] +struct_soap_cur = struct_soap.block(0).values[struct_cur_idxs, :] +assert struct_soap_fps.shape == struct_soap_cur.shape + +print("Structure descriptor shape before selection ", struct_soap.block(0).values.shape) +print("Structure descriptor shape after selection (FPS)", struct_soap_fps.shape) +print("Structure descriptor shape after selection (CUR)", struct_soap_cur.shape) + + + +# %% +# Visualize selected structures +# ----------------------------- +# +# sklearn can be used to perform PCA dimensionality reduction on the SOAP +# descriptors. The resulting PC coordinates can be used to visualize the the +# data alongside their structures in a chemiscope widget. +# +# Note: chemiscope widgets are not currently integrated into our sphinx gallery: +# coming soon. + + +# Generate a structure PCA +struct_soap_pca = PCA(n_components=2).fit_transform(struct_soap.block(0).values) +assert struct_soap_pca.shape == (n_frames, 2) + + +# %% +# Plot the PCA map +# ~~~~~~~~~~~~~~~~ +# +# Notice how the selected points avoid the densely-sampled area, and cover +# the periphery of the dataset + +# Matplotlib plot +fig, ax = plt.subplots(1, 1, figsize=(6, 4)) +scatter = ax.scatter(struct_soap_pca[:, 0], struct_soap_pca[:, 1], c="red") +ax.plot( + struct_soap_pca[struct_cur_idxs, 0], + struct_soap_pca[struct_cur_idxs, 1], + "kx", + label="CUR selection", +) +ax.plot( + struct_soap_pca[struct_fps_idxs, 0], + struct_soap_pca[struct_fps_idxs, 1], + "ko", + fillstyle="none", + label="FPS selection", +) +ax.set_xlabel("PCA[1]") +ax.set_ylabel("PCA[2]") +ax.legend() +fig.show() + + +# %% +# Creates a chemiscope viewer +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# Interactive viewer (only works in notebooks) + +# Selected level +selection_levels = [] +for i in range(len(frames)): + level = 0 + if i in struct_cur_idxs: + level += 1 + if i in struct_fps_idxs: + level += 2 + if level == 0: + level = "Not selected" + elif level == 1: + level = "CUR" + elif level == 2: + level = "FPS" + else: + level = "FPS+CUR" + selection_levels.append(level) + +properties = chemiscope.extract_properties(frames) + +properties.update( + { + "PC1": struct_soap_pca[:, 0], + "PC2": struct_soap_pca[:, 1], + "selection": np.array(selection_levels), + } +) + +#print(properties) + +# Display with chemiscope. This currently does not work - as raised in issue #8 +# https://github.com/lab-cosmo/software-cookbook/issues/8 +widget = chemiscope.show( + frames, + properties=properties, + settings={ + "map": { + "x": {"property": "PC1"}, + "y": {"property": "PC2"}, + "color": {"property": "energy"}, + "symbol": "selection", + "size": {"factor": 50}, + }, + "structure": [{"unitCell": True}], + }, +) + +if chemiscope.jupyter._is_running_in_notebook(): + from IPython.display import display + + display(widget) +else: + widget.save("sample-selection.json.gz") + + +# %% +# Perform feature selection +# ------------------------- +# +# Now perform feature selection. In this example we will go back to using the +# descriptor decomposed into atomic environments, as opposed to the one +# decomposed into structure environments, but only use FPS for brevity. +print('----Feature selection-----') + +# Define the number of features to select +n_features = 200 + +# FPS feature selection +feat_fps = feature_selection.FPS(n_to_select=n_features, initialize="random").fit( + atom_soap_single_block +) + +# Slice atomic descriptor along axis 1 to contain only the selected features +#atom_soap_single_block_fps = atom_soap_single_block.block(0).values[:, feat_fps_idxs] +atom_soap_single_block_fps=metatensor.slice(atom_soap_single_block, axis="properties", labels=feat_fps.support.block(0).properties) + +print("atomic descriptor shape before selection ", atom_soap_single_block.block(0).values.shape) +print("atomic descriptor shape after selection ", atom_soap_single_block_fps.block(0).values.shape) + +# %% + +# %% +# Perform feature selection (skmatter) +# ------------------------- +# +# Now perform feature selection. In this example we will go back to using the +# descriptor decomposed into atomic environments, as opposed to the one +# decomposed into structure environments, but only use FPS for brevity. +from skmatter import feature_selection +print('----Feature selection (skmatter)-----') + +# Define the number of features to select +n_features = 200 + +# FPS feature selection +feat_fps = feature_selection.FPS(n_to_select=n_features, initialize="random").fit( + atom_soap_single_block.block(0).values +) +feat_fps_idxs = feat_fps.selected_idx_ + +print("Feature indices obtained with FPS ", feat_fps_idxs) + +# Slice atomic descriptor along axis 1 to contain only the selected features +atom_dscrptr_fps = atom_soap_single_block.block(0).values[:, feat_fps_idxs] + +print("atomic descriptor shape before selection ", atom_soap_single_block.block(0).values.shape) +print("atomic descriptor shape after selection ", atom_dscrptr_fps.shape) + +# %% From c77f4bd3b5b93a19fe6cf959602f194aa1feb08e Mon Sep 17 00:00:00 2001 From: Hanna Tuerk Date: Tue, 26 Mar 2024 15:33:02 +0100 Subject: [PATCH 08/12] Cleanup sample selection with equisolve --- examples/sample-selection/sample-selection.py | 104 +----------------- 1 file changed, 5 insertions(+), 99 deletions(-) diff --git a/examples/sample-selection/sample-selection.py b/examples/sample-selection/sample-selection.py index b8f8fb20..c4581d3e 100644 --- a/examples/sample-selection/sample-selection.py +++ b/examples/sample-selection/sample-selection.py @@ -18,19 +18,12 @@ import chemiscope import numpy as np from matplotlib import pyplot as plt -<<<<<<< HEAD from metatensor import sum_over_samples import metatensor from rascaline import SoapPowerSpectrum from sklearn.decomposition import PCA from equisolve.numpy import sample_selection, feature_selection -======= -from metatensor import mean_over_samples -from rascaline import SoapPowerSpectrum -from sklearn.decomposition import PCA -from skmatter import feature_selection, sample_selection ->>>>>>> 3a073a0c2c8ead4bde227652b2a2f87d86eafb0f # %% @@ -66,7 +59,6 @@ # Generate a SOAP power spectrum calculator = SoapPowerSpectrum(**hypers) rho2i = calculator.compute(frames) -<<<<<<< HEAD @@ -114,9 +106,6 @@ # Define the number of structures to select using FPS/CUR n_envs = 25 -#atom_soap = atom_soap.keys_to_properties( -# keys_to_move=["species_neighbor_1", "species_neighbor_2"] -#) print(atom_soap) print(atom_soap.block(0)) @@ -136,11 +125,11 @@ for key, block in selector_atomic_fps.support.items(): print("species_center:", key, "\n(struct_idx, atom_idx)\n", block.samples.values) -#selector_atomic_cur = sample_selection.CUR(n_to_select=n_envs).fit(atom_soap) -## Print the selected envs for each block -#print("atomic envs selected with CUR:\n") -#for key, block in selector_atomic_cur.support.items(): -# print("species_center:", key, "\n(struct_idx, atom_idx)\n", block.samples.values) +selector_atomic_cur = sample_selection.CUR(n_to_select=n_envs).fit(atom_soap) +# Print the selected envs for each block +print("atomic envs selected with CUR:\n") +for key, block in selector_atomic_cur.support.items(): + print("species_center:", key, "\n(struct_idx, atom_idx)\n", block.samples.values) # %% @@ -237,56 +226,6 @@ print("Structure descriptor shape after selection (FPS)", struct_soap_fps.shape) print("Structure descriptor shape after selection (CUR)", struct_soap_cur.shape) -======= -# Makes a dense block -rho2i = rho2i.keys_to_samples(["species_center"]).keys_to_properties( - ["species_neighbor_1", "species_neighbor_2"] -) -# Averages over atomic centers to compute structure features -rho2i_structure = mean_over_samples(rho2i, sample_names=["center", "species_center"]) - -atom_dscrptr = rho2i.block(0).values -struct_dscrptr = rho2i_structure.block(0).values - -print("atom feature descriptor shape:", atom_dscrptr.shape) -print("structure feature descriptor shape:", struct_dscrptr.shape) - - -# %% -# Perform structure (i.e. sample) selection -# ----------------------------------------- -# -# Using FPS and CUR algorithms implemented in scikit-matter, select a subset of -# the structures. skmatter assumes that our descriptor is represented as a 2D -# matrix, with the samples along axis 0 and features along axis 1. -# -# For more info on the functions: `skmatter -# `_ - -# Define the number of structures to select using FPS/CUR -n_structures = 25 - -# FPS sample selection -struct_fps = sample_selection.FPS(n_to_select=n_structures, initialize="random").fit( - struct_dscrptr -) -struct_fps_idxs = struct_fps.selected_idx_ - -# CUR sample selection -struct_cur = sample_selection.CUR(n_to_select=n_structures).fit(struct_dscrptr) -struct_cur_idxs = struct_cur.selected_idx_ - -print("Structure indices obtained with FPS ", struct_fps_idxs) -print("Structure indices obtained with CUR ", struct_cur_idxs) - -# Slice structure descriptor along axis 0 to contain only the selected structures -struct_dscrptr_fps = struct_dscrptr[struct_fps_idxs, :] -struct_dscrptr_cur = struct_dscrptr[struct_cur_idxs, :] -assert struct_dscrptr_fps.shape == struct_dscrptr_cur.shape - -print("Structure descriptor shape before selection ", struct_dscrptr.shape) -print("Structure descriptor shape after selection ", struct_dscrptr_fps.shape) ->>>>>>> 3a073a0c2c8ead4bde227652b2a2f87d86eafb0f # %% @@ -302,13 +241,8 @@ # Generate a structure PCA -<<<<<<< HEAD struct_soap_pca = PCA(n_components=2).fit_transform(struct_soap.block(0).values) assert struct_soap_pca.shape == (n_frames, 2) -======= -struct_dscrptr_pca = PCA(n_components=2).fit_transform(struct_dscrptr) -assert struct_dscrptr_pca.shape == (n_frames, 2) ->>>>>>> 3a073a0c2c8ead4bde227652b2a2f87d86eafb0f # %% @@ -320,28 +254,10 @@ # Matplotlib plot fig, ax = plt.subplots(1, 1, figsize=(6, 4)) -<<<<<<< HEAD scatter = ax.scatter(struct_soap_pca[:, 0], struct_soap_pca[:, 1], c="red") ax.plot( struct_soap_pca[struct_cur_idxs, 0], struct_soap_pca[struct_cur_idxs, 1], -======= -scatter = ax.scatter(struct_dscrptr_pca[:, 0], struct_dscrptr_pca[:, 1], c="red") -ax.plot( - struct_dscrptr_pca[struct_cur_idxs, 0], - struct_dscrptr_pca[struct_cur_idxs, 1], ->>>>>>> 3a073a0c2c8ead4bde227652b2a2f87d86eafb0f - "kx", - label="CUR selection", -) -ax.plot( -<<<<<<< HEAD - struct_soap_pca[struct_fps_idxs, 0], - struct_soap_pca[struct_fps_idxs, 1], -======= - struct_dscrptr_pca[struct_fps_idxs, 0], - struct_dscrptr_pca[struct_fps_idxs, 1], ->>>>>>> 3a073a0c2c8ead4bde227652b2a2f87d86eafb0f "ko", fillstyle="none", label="FPS selection", @@ -380,22 +296,12 @@ properties.update( { -<<<<<<< HEAD "PC1": struct_soap_pca[:, 0], "PC2": struct_soap_pca[:, 1], -======= - "PC1": struct_dscrptr_pca[:, 0], - "PC2": struct_dscrptr_pca[:, 1], ->>>>>>> 3a073a0c2c8ead4bde227652b2a2f87d86eafb0f "selection": np.array(selection_levels), } ) -<<<<<<< HEAD -#print(properties) -======= -print(properties) ->>>>>>> 3a073a0c2c8ead4bde227652b2a2f87d86eafb0f # Display with chemiscope. This currently does not work - as raised in issue #8 # https://github.com/lab-cosmo/software-cookbook/issues/8 From 054aebeb86f47896ad33b43928bc0ec063aa26be Mon Sep 17 00:00:00 2001 From: Hanna Tuerk Date: Tue, 26 Mar 2024 15:50:21 +0100 Subject: [PATCH 09/12] clean-up sample selection with equisolve --- examples/sample-selection/environment.yml | 1 + examples/sample-selection/sample-selection.py | 96 +++++++++---------- 2 files changed, 45 insertions(+), 52 deletions(-) diff --git a/examples/sample-selection/environment.yml b/examples/sample-selection/environment.yml index 95bed51e..b35b6c7d 100644 --- a/examples/sample-selection/environment.yml +++ b/examples/sample-selection/environment.yml @@ -11,3 +11,4 @@ dependencies: - metatensor - rascaline @ git+https://github.com/Luthaf/rascaline@ca957642f512e141c7570e987aadc05c7ac71983 - skmatter + - equisolve diff --git a/examples/sample-selection/sample-selection.py b/examples/sample-selection/sample-selection.py index c4581d3e..27609a40 100644 --- a/examples/sample-selection/sample-selection.py +++ b/examples/sample-selection/sample-selection.py @@ -16,15 +16,14 @@ import ase.io import chemiscope +import metatensor import numpy as np +from equisolve.numpy import feature_selection, sample_selection from matplotlib import pyplot as plt -from metatensor import sum_over_samples -import metatensor +from metatensor import sum_over_samples from rascaline import SoapPowerSpectrum from sklearn.decomposition import PCA -from equisolve.numpy import sample_selection, feature_selection - # %% # Load molecular data @@ -61,25 +60,25 @@ rho2i = calculator.compute(frames) - # Makes a dense block -atom_soap = rho2i.keys_to_properties( - ["species_neighbor_1", "species_neighbor_2"] -) +atom_soap = rho2i.keys_to_properties(["species_neighbor_1", "species_neighbor_2"]) -atom_soap_single_block = atom_soap.keys_to_samples( - keys_to_move=["species_center"] -) +atom_soap_single_block = atom_soap.keys_to_samples(keys_to_move=["species_center"]) -#print(atom_soap_single_block) -#print(atom_soap_single_block.block(0)) # There is only one block now! +# print(atom_soap_single_block) +# print(atom_soap_single_block.block(0)) # There is only one block now! # Sum over atomic centers to compute structure features -struct_soap = sum_over_samples(atom_soap_single_block, sample_names=["center", "species_center"]) +struct_soap = sum_over_samples( + atom_soap_single_block, sample_names=["center", "species_center"] +) print("atom feature descriptor shape:", atom_soap.block(0).values.shape) -print("atom feature descriptor (all in one block) shape:", atom_soap_single_block.block(0).values.shape) +print( + "atom feature descriptor (all in one block) shape:", + atom_soap_single_block.block(0).values.shape, +) print("structure feature descriptor shape:", struct_soap.block(0).values.shape) @@ -102,7 +101,7 @@ # keys. -print('----Atomic environment selection-----') +print("----Atomic environment selection-----") # Define the number of structures to select using FPS/CUR n_envs = 25 @@ -142,11 +141,11 @@ # such that our descriptor is a TensorMap consisting of a single block. Upon # sample selection, the most diverse atomic environments will be selected, # regardless of their chemical species. -print('----All atomic environment selection-----') +print("----All atomic environment selection-----") -print('keys',atom_soap.keys) -print('blocks',atom_soap[0]) -print('samples in first block',atom_soap[0].samples) +print("keys", atom_soap.keys) +print("blocks", atom_soap[0]) +print("samples in first block", atom_soap[0].samples) # Using the original SOAP descriptor, move all keys to properties. @@ -164,7 +163,6 @@ ) - # %% # Perform structure (i.e. sample) selection with FPS/CUR # --------------------------------------------------------- @@ -178,11 +176,7 @@ # Alternatively, one could use the `mean_over_samples` operation, depending on # the specific inhomogeneity of the size of the structures in the training set. -## Sum over atomic environments. #TODO MEAN? -#struct_soap = sum_over_samples(atom_soap, "center") -#print(struct_soap) -#print(struct_soap.block(0)) -print('----Structure selection-----') +print("----Structure selection-----") # Define the number of structures to select *per block* using FPS n_structures = 10 @@ -194,31 +188,15 @@ struct_fps_idxs = selector_struct_fps.support.block(0).samples.values.flatten() print("structures selected with FPS:\n", struct_fps_idxs) -#print("Structure indices obtained with FPS ", struct_fps_idxs) # CUR structure selection selector_struct_cur = sample_selection.CUR(n_to_select=n_structures).fit(struct_soap) struct_cur_idxs = selector_struct_cur.support.block(0).samples.values.flatten() print("structures selected with CUR:\n", struct_cur_idxs) -#print("Structure indices obtained with CUR ", struct_cur_idxs) - - -#### FPS sample selection -###struct_fps = sample_selection.FPS(n_to_select=n_structures, initialize="random").fit( -### struct_soap -###) -###struct_fps_idxs = struct_fps.selected_idx_ -### -#### CUR sample selection -###struct_cur = sample_selection.CUR(n_to_select=n_structures).fit(struct_soap) -###struct_cur_idxs = struct_cur.selected_idx_ -### -###print("Structure indices obtained with FPS ", struct_fps_idxs) -###print("Structure indices obtained with CUR ", struct_cur_idxs) -### + # Slice structure descriptor along axis 0 to contain only the selected structures -struct_soap_fps = struct_soap.block(0).values[struct_fps_idxs,:] +struct_soap_fps = struct_soap.block(0).values[struct_fps_idxs, :] struct_soap_cur = struct_soap.block(0).values[struct_cur_idxs, :] assert struct_soap_fps.shape == struct_soap_cur.shape @@ -227,7 +205,6 @@ print("Structure descriptor shape after selection (CUR)", struct_soap_cur.shape) - # %% # Visualize selected structures # ----------------------------- @@ -335,7 +312,7 @@ # Now perform feature selection. In this example we will go back to using the # descriptor decomposed into atomic environments, as opposed to the one # decomposed into structure environments, but only use FPS for brevity. -print('----Feature selection-----') +print("----Feature selection-----") # Define the number of features to select n_features = 200 @@ -346,11 +323,21 @@ ) # Slice atomic descriptor along axis 1 to contain only the selected features -#atom_soap_single_block_fps = atom_soap_single_block.block(0).values[:, feat_fps_idxs] -atom_soap_single_block_fps=metatensor.slice(atom_soap_single_block, axis="properties", labels=feat_fps.support.block(0).properties) +# atom_soap_single_block_fps = atom_soap_single_block.block(0).values[:, feat_fps_idxs] +atom_soap_single_block_fps = metatensor.slice( + atom_soap_single_block, + axis="properties", + labels=feat_fps.support.block(0).properties, +) -print("atomic descriptor shape before selection ", atom_soap_single_block.block(0).values.shape) -print("atomic descriptor shape after selection ", atom_soap_single_block_fps.block(0).values.shape) +print( + "atomic descriptor shape before selection ", + atom_soap_single_block.block(0).values.shape, +) +print( + "atomic descriptor shape after selection ", + atom_soap_single_block_fps.block(0).values.shape, +) # %% @@ -362,7 +349,9 @@ # descriptor decomposed into atomic environments, as opposed to the one # decomposed into structure environments, but only use FPS for brevity. from skmatter import feature_selection -print('----Feature selection (skmatter)-----') + + +print("----Feature selection (skmatter)-----") # Define the number of features to select n_features = 200 @@ -378,7 +367,10 @@ # Slice atomic descriptor along axis 1 to contain only the selected features atom_dscrptr_fps = atom_soap_single_block.block(0).values[:, feat_fps_idxs] -print("atomic descriptor shape before selection ", atom_soap_single_block.block(0).values.shape) +print( + "atomic descriptor shape before selection ", + atom_soap_single_block.block(0).values.shape, +) print("atomic descriptor shape after selection ", atom_dscrptr_fps.shape) # %% From 6a88cf2fa29149929e30fb4d54e52711f13c7b63 Mon Sep 17 00:00:00 2001 From: Hanna Tuerk Date: Tue, 26 Mar 2024 19:38:07 +0100 Subject: [PATCH 10/12] Updated environment with equisolve --- examples/sample-selection/environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/sample-selection/environment.yml b/examples/sample-selection/environment.yml index b35b6c7d..7e2cf3b9 100644 --- a/examples/sample-selection/environment.yml +++ b/examples/sample-selection/environment.yml @@ -11,4 +11,4 @@ dependencies: - metatensor - rascaline @ git+https://github.com/Luthaf/rascaline@ca957642f512e141c7570e987aadc05c7ac71983 - skmatter - - equisolve + - equisolve @ git+https://github.com/lab-cosmo/equisolve.git@c858bedef4b2799eb445e4c92535ee387224089a From 356920ef855cbeaa93f30e434bdd0bf47e14b4ee Mon Sep 17 00:00:00 2001 From: Hanna Tuerk Date: Tue, 26 Mar 2024 19:52:22 +0100 Subject: [PATCH 11/12] Changed skmatter feat_selection import to please nox --- examples/sample-selection/sample-selection.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/examples/sample-selection/sample-selection.py b/examples/sample-selection/sample-selection.py index 27609a40..442004d4 100644 --- a/examples/sample-selection/sample-selection.py +++ b/examples/sample-selection/sample-selection.py @@ -23,6 +23,7 @@ from metatensor import sum_over_samples from rascaline import SoapPowerSpectrum from sklearn.decomposition import PCA +from skmatter import feature_selection as skfeat_selection # %% @@ -348,8 +349,6 @@ # Now perform feature selection. In this example we will go back to using the # descriptor decomposed into atomic environments, as opposed to the one # decomposed into structure environments, but only use FPS for brevity. -from skmatter import feature_selection - print("----Feature selection (skmatter)-----") @@ -357,7 +356,7 @@ n_features = 200 # FPS feature selection -feat_fps = feature_selection.FPS(n_to_select=n_features, initialize="random").fit( +feat_fps = skfeat_selection.FPS(n_to_select=n_features, initialize="random").fit( atom_soap_single_block.block(0).values ) feat_fps_idxs = feat_fps.selected_idx_ From 5edbb941591179a05d14042a08ec15d5dc77bf1d Mon Sep 17 00:00:00 2001 From: Hanna Tuerk Date: Tue, 26 Mar 2024 20:18:44 +0100 Subject: [PATCH 12/12] Fixed underline in sample-selection to be long enough for docs --- examples/sample-selection/sample-selection.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/sample-selection/sample-selection.py b/examples/sample-selection/sample-selection.py index 442004d4..ad309230 100644 --- a/examples/sample-selection/sample-selection.py +++ b/examples/sample-selection/sample-selection.py @@ -85,7 +85,7 @@ # %% # Perform atomic environment (i.e. sample) selection -# ----------------------------------------- +# --------------------------------------------------- # # Using FPS and CUR algorithms, we can perform selection of atomic environments. # These are implemented in equisolve, which provides a wrapper around @@ -344,7 +344,7 @@ # %% # Perform feature selection (skmatter) -# ------------------------- +# ------------------------------------ # # Now perform feature selection. In this example we will go back to using the # descriptor decomposed into atomic environments, as opposed to the one