diff --git a/examples/sample-selection/environment.yml b/examples/sample-selection/environment.yml index 95bed51e..7e2cf3b9 100644 --- a/examples/sample-selection/environment.yml +++ b/examples/sample-selection/environment.yml @@ -11,3 +11,4 @@ dependencies: - metatensor - rascaline @ git+https://github.com/Luthaf/rascaline@ca957642f512e141c7570e987aadc05c7ac71983 - skmatter + - equisolve @ git+https://github.com/lab-cosmo/equisolve.git@c858bedef4b2799eb445e4c92535ee387224089a diff --git a/examples/sample-selection/sample-selection.py b/examples/sample-selection/sample-selection.py index 34030c64..ad309230 100644 --- a/examples/sample-selection/sample-selection.py +++ b/examples/sample-selection/sample-selection.py @@ -16,12 +16,14 @@ import ase.io import chemiscope +import metatensor import numpy as np +from equisolve.numpy import feature_selection, sample_selection from matplotlib import pyplot as plt -from metatensor import mean_over_samples +from metatensor import sum_over_samples from rascaline import SoapPowerSpectrum from sklearn.decomposition import PCA -from skmatter import feature_selection, sample_selection +from skmatter import feature_selection as skfeat_selection # %% @@ -57,54 +59,151 @@ # Generate a SOAP power spectrum calculator = SoapPowerSpectrum(**hypers) rho2i = calculator.compute(frames) + + # Makes a dense block -rho2i = rho2i.keys_to_samples(["species_center"]).keys_to_properties( - ["species_neighbor_1", "species_neighbor_2"] +atom_soap = rho2i.keys_to_properties(["species_neighbor_1", "species_neighbor_2"]) + +atom_soap_single_block = atom_soap.keys_to_samples(keys_to_move=["species_center"]) + +# print(atom_soap_single_block) +# print(atom_soap_single_block.block(0)) # There is only one block now! + +# Sum over atomic centers to compute structure features +struct_soap = sum_over_samples( + atom_soap_single_block, sample_names=["center", "species_center"] ) -# Averages over atomic centers to compute structure features -rho2i_structure = mean_over_samples(rho2i, sample_names=["center", "species_center"]) -atom_dscrptr = rho2i.block(0).values -struct_dscrptr = rho2i_structure.block(0).values -print("atom feature descriptor shape:", atom_dscrptr.shape) -print("structure feature descriptor shape:", struct_dscrptr.shape) +print("atom feature descriptor shape:", atom_soap.block(0).values.shape) +print( + "atom feature descriptor (all in one block) shape:", + atom_soap_single_block.block(0).values.shape, +) +print("structure feature descriptor shape:", struct_soap.block(0).values.shape) # %% -# Perform structure (i.e. sample) selection -# ----------------------------------------- +# Perform atomic environment (i.e. sample) selection +# --------------------------------------------------- # -# Using FPS and CUR algorithms implemented in scikit-matter, select a subset of -# the structures. skmatter assumes that our descriptor is represented as a 2D -# matrix, with the samples along axis 0 and features along axis 1. +# Using FPS and CUR algorithms, we can perform selection of atomic environments. +# These are implemented in equisolve, which provides a wrapper around +# scikit-matter to allow for interfacing with data stored in the metatensor +# format. # -# For more info on the functions: `skmatter -# `_ +# Suppose we want to select the 10 most diverse environments for each chemical +# species. +# +# First, we can use the `keys_to_properties` operation in metatensor to move the +# neighbour species indices to the properties of the TensorBlocks. The resulting +# descriptor will be a TensorMap comprised of three blocks, one for each +# chemical species, where the chemical species indices are solely present in the +# keys. + +print("----Atomic environment selection-----") # Define the number of structures to select using FPS/CUR -n_structures = 25 +n_envs = 25 + +print(atom_soap) +print(atom_soap.block(0)) + +# %% Now let's perform sample selection on the atomic environments. We want to +# select 10 atomic environments for each chemical species. + +# Define the number of structures *per block* to select using FPS +n_envs = 10 # FPS sample selection -struct_fps = sample_selection.FPS(n_to_select=n_structures, initialize="random").fit( - struct_dscrptr +selector_atomic_fps = sample_selection.FPS(n_to_select=n_envs, initialize="random").fit( + atom_soap ) -struct_fps_idxs = struct_fps.selected_idx_ -# CUR sample selection -struct_cur = sample_selection.CUR(n_to_select=n_structures).fit(struct_dscrptr) -struct_cur_idxs = struct_cur.selected_idx_ +# Print the selected envs for each block +print("atomic envs selected with FPS:\n") +for key, block in selector_atomic_fps.support.items(): + print("species_center:", key, "\n(struct_idx, atom_idx)\n", block.samples.values) + +selector_atomic_cur = sample_selection.CUR(n_to_select=n_envs).fit(atom_soap) +# Print the selected envs for each block +print("atomic envs selected with CUR:\n") +for key, block in selector_atomic_cur.support.items(): + print("species_center:", key, "\n(struct_idx, atom_idx)\n", block.samples.values) + + +# %% +# Selecting from a combined pool of atomic environments +# ----------------------------------------------------- +# +# One can also select from a combined pool of atomic environments and +# structures, instead of selecting an equal number of atomic environments for +# each chemical species. In this case, we can move the 'species_center' key to samples +# such that our descriptor is a TensorMap consisting of a single block. Upon +# sample selection, the most diverse atomic environments will be selected, +# regardless of their chemical species. +print("----All atomic environment selection-----") + +print("keys", atom_soap.keys) +print("blocks", atom_soap[0]) +print("samples in first block", atom_soap[0].samples) + +# Using the original SOAP descriptor, move all keys to properties. + + +# Define the number of structures to select using FPS +n_envs = 10 + +# FPS sample selection +selector_atomic_fps = sample_selection.FPS(n_to_select=n_envs, initialize="random").fit( + atom_soap_single_block +) +print( + "atomic envs selected with FPS: \n (struct_idx, atom_idx, species_center) \n", + selector_atomic_fps.support.block(0).samples.values, +) + + +# %% +# Perform structure (i.e. sample) selection with FPS/CUR +# --------------------------------------------------------- +# +# Instead of atomic environments, one can also select diverse structures. We can +# use the `sum_over_samples` operation in metatensor to define features in the +# structural basis instead of the atomic basis. This is done by summing over the +# atomic environments, labeled by the 'center' index in the samples of the +# TensorMap. +# +# Alternatively, one could use the `mean_over_samples` operation, depending on +# the specific inhomogeneity of the size of the structures in the training set. + +print("----Structure selection-----") + +# Define the number of structures to select *per block* using FPS +n_structures = 10 + +# FPS structure selection +selector_struct_fps = sample_selection.FPS( + n_to_select=n_structures, initialize="random" +).fit(struct_soap) +struct_fps_idxs = selector_struct_fps.support.block(0).samples.values.flatten() + +print("structures selected with FPS:\n", struct_fps_idxs) + +# CUR structure selection +selector_struct_cur = sample_selection.CUR(n_to_select=n_structures).fit(struct_soap) +struct_cur_idxs = selector_struct_cur.support.block(0).samples.values.flatten() +print("structures selected with CUR:\n", struct_cur_idxs) -print("Structure indices obtained with FPS ", struct_fps_idxs) -print("Structure indices obtained with CUR ", struct_cur_idxs) # Slice structure descriptor along axis 0 to contain only the selected structures -struct_dscrptr_fps = struct_dscrptr[struct_fps_idxs, :] -struct_dscrptr_cur = struct_dscrptr[struct_cur_idxs, :] -assert struct_dscrptr_fps.shape == struct_dscrptr_cur.shape +struct_soap_fps = struct_soap.block(0).values[struct_fps_idxs, :] +struct_soap_cur = struct_soap.block(0).values[struct_cur_idxs, :] +assert struct_soap_fps.shape == struct_soap_cur.shape -print("Structure descriptor shape before selection ", struct_dscrptr.shape) -print("Structure descriptor shape after selection ", struct_dscrptr_fps.shape) +print("Structure descriptor shape before selection ", struct_soap.block(0).values.shape) +print("Structure descriptor shape after selection (FPS)", struct_soap_fps.shape) +print("Structure descriptor shape after selection (CUR)", struct_soap_cur.shape) # %% @@ -120,8 +219,8 @@ # Generate a structure PCA -struct_dscrptr_pca = PCA(n_components=2).fit_transform(struct_dscrptr) -assert struct_dscrptr_pca.shape == (n_frames, 2) +struct_soap_pca = PCA(n_components=2).fit_transform(struct_soap.block(0).values) +assert struct_soap_pca.shape == (n_frames, 2) # %% @@ -133,16 +232,10 @@ # Matplotlib plot fig, ax = plt.subplots(1, 1, figsize=(6, 4)) -scatter = ax.scatter(struct_dscrptr_pca[:, 0], struct_dscrptr_pca[:, 1], c="red") -ax.plot( - struct_dscrptr_pca[struct_cur_idxs, 0], - struct_dscrptr_pca[struct_cur_idxs, 1], - "kx", - label="CUR selection", -) +scatter = ax.scatter(struct_soap_pca[:, 0], struct_soap_pca[:, 1], c="red") ax.plot( - struct_dscrptr_pca[struct_fps_idxs, 0], - struct_dscrptr_pca[struct_fps_idxs, 1], + struct_soap_pca[struct_cur_idxs, 0], + struct_soap_pca[struct_cur_idxs, 1], "ko", fillstyle="none", label="FPS selection", @@ -181,13 +274,12 @@ properties.update( { - "PC1": struct_dscrptr_pca[:, 0], - "PC2": struct_dscrptr_pca[:, 1], + "PC1": struct_soap_pca[:, 0], + "PC2": struct_soap_pca[:, 1], "selection": np.array(selection_levels), } ) -print(properties) # Display with chemiscope. This currently does not work - as raised in issue #8 # https://github.com/lab-cosmo/software-cookbook/issues/8 @@ -221,22 +313,63 @@ # Now perform feature selection. In this example we will go back to using the # descriptor decomposed into atomic environments, as opposed to the one # decomposed into structure environments, but only use FPS for brevity. +print("----Feature selection-----") # Define the number of features to select n_features = 200 # FPS feature selection feat_fps = feature_selection.FPS(n_to_select=n_features, initialize="random").fit( - atom_dscrptr + atom_soap_single_block +) + +# Slice atomic descriptor along axis 1 to contain only the selected features +# atom_soap_single_block_fps = atom_soap_single_block.block(0).values[:, feat_fps_idxs] +atom_soap_single_block_fps = metatensor.slice( + atom_soap_single_block, + axis="properties", + labels=feat_fps.support.block(0).properties, +) + +print( + "atomic descriptor shape before selection ", + atom_soap_single_block.block(0).values.shape, +) +print( + "atomic descriptor shape after selection ", + atom_soap_single_block_fps.block(0).values.shape, +) + +# %% + +# %% +# Perform feature selection (skmatter) +# ------------------------------------ +# +# Now perform feature selection. In this example we will go back to using the +# descriptor decomposed into atomic environments, as opposed to the one +# decomposed into structure environments, but only use FPS for brevity. + +print("----Feature selection (skmatter)-----") + +# Define the number of features to select +n_features = 200 + +# FPS feature selection +feat_fps = skfeat_selection.FPS(n_to_select=n_features, initialize="random").fit( + atom_soap_single_block.block(0).values ) feat_fps_idxs = feat_fps.selected_idx_ print("Feature indices obtained with FPS ", feat_fps_idxs) # Slice atomic descriptor along axis 1 to contain only the selected features -atom_dscrptr_fps = atom_dscrptr[:, feat_fps_idxs] +atom_dscrptr_fps = atom_soap_single_block.block(0).values[:, feat_fps_idxs] -print("atomic descriptor shape before selection ", atom_dscrptr.shape) +print( + "atomic descriptor shape before selection ", + atom_soap_single_block.block(0).values.shape, +) print("atomic descriptor shape after selection ", atom_dscrptr_fps.shape) # %%