From b1502873e5e95b6be0a9ee874cf285c1c6c92b13 Mon Sep 17 00:00:00 2001 From: franck-simon <55919349+franck-simon@users.noreply.github.com> Date: Wed, 19 Jun 2024 16:29:31 +0200 Subject: [PATCH] Miic v2 (tmiic) (#131) Implementation of tMIIC --- .gitignore | 1 + DESCRIPTION | 45 +- NAMESPACE | 3 + NEWS.md | 71 +- R/data.R | 17 + R/discretizeMutual.R | 8 +- R/miic.R | 878 +++++++++++++++------- R/miic.plot.R | 3 - R/miic.reconstruct.R | 27 +- R/miic.utils.R | 894 +++++++++++++++------- R/parseResults.R | 5 +- R/tmiic.plot.R | 1190 ++++++++++++++++++++++++++++++ R/tmiic.utils.R | 1272 ++++++++++++++++++++++++++++++++ R/tmiic.wrapper.R | 762 +++++++++++++++++++ README.md | 15 +- data/covidCases.rda | Bin 0 -> 1977 bytes data/datalist | 1 + man/covidCases.Rd | 22 + man/discretizeMutual.Rd | 8 +- man/estimateTemporalDynamic.Rd | 82 ++ man/miic.Rd | 651 ++++++++++------ man/plot.tmiic.Rd | 145 ++++ man/tmiic.export.Rd | 115 +++ src/computation_continuous.cpp | 12 +- src/environment.h | 34 +- src/get_information.cpp | 11 + src/mutual_information.cpp | 5 +- src/orientation.cpp | 196 ++++- src/orientation.h | 1 + src/proba_orientation.cpp | 45 +- src/proba_orientation.h | 3 +- src/r_cpp_interface.cpp | 88 +++ src/reconstruct.cpp | 20 +- src/tmiic.cpp | 314 ++++++++ src/tmiic.h | 29 + 35 files changed, 6126 insertions(+), 847 deletions(-) create mode 100644 R/tmiic.plot.R create mode 100644 R/tmiic.utils.R create mode 100644 R/tmiic.wrapper.R create mode 100644 data/covidCases.rda create mode 100644 man/covidCases.Rd create mode 100644 man/estimateTemporalDynamic.Rd create mode 100644 man/plot.tmiic.Rd create mode 100644 man/tmiic.export.Rd create mode 100644 src/tmiic.cpp create mode 100644 src/tmiic.h diff --git a/.gitignore b/.gitignore index 20e06d5b..41e4d986 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ # RStudio *.Rproj *.Rproj.user +.Rproj.user # R .Rhistory .RData diff --git a/DESCRIPTION b/DESCRIPTION index f9ef36ab..62a46141 100755 --- a/DESCRIPTION +++ 
b/DESCRIPTION @@ -1,23 +1,39 @@ Package: miic Title: Learning Causal or Non-Causal Graphical Models Using Information Theory -Version: 1.9.0 +Version: 2.0.0 Authors@R: - c(person(given = "Vincent", - family = "Cabeli", + c(person(given = "Franck", + family = "Simon", role = c("aut", "cre"), - email = "vincent.cabeli@curie.fr"), + email = "franck.simon@curie.fr"), + person(given = "Tiziana", + family = "Tocci", + role = "aut", + email = "tiziana.tocci@curie.fr"), + person(given = "Nikita", + family = "Lagrange", + role = "aut", + email = "nikita.lagrange@curie.fr"), + person(given = "Orianne", + family = "Debeaupuis", + role = "aut", + email = "orianne.debeaupuis@curie.fr"), + person(given = "Louise", + family = "Dupuis", + role = "aut", + email = "louise.dupuis@curie.fr"), + person(given = "Vincent", + family = "Cabeli", + role = "aut"), person(given = "Honghao", family = "Li", - role = "aut", - email = "honghao.li@curie.fr"), + role = "aut"), person(given = "Marcel", family = "Ribeiro Dantas", - role = "aut", - email = "marcel.ribeiro-dantas@curie.fr"), + role = "aut"), person(given = "Nadir", family = "Sella", - role = "aut", - email = "nadir.sella@curie.fr"), + role = "aut"), person(given = "Louis", family = "Verny", role = "aut"), @@ -27,7 +43,7 @@ Authors@R: person(given = "Hervé", family = "Isambert", role = "aut", - email = "Herve.Isambert@curie.fr")) + email = "herve.isambert@curie.fr")) Description: We report an information-theoretic method which learns a large class of causal or non-causal graphical models from purely observational data, while including the effects of unobserved latent variables, commonly @@ -39,7 +55,10 @@ Description: We report an information-theoretic method which learns a large be applied on a wide range of datasets and provide new biological insights on regulatory networks from single cell expression data, genomic alterations during tumor development and co-evolving residues in protein structures. 
- For more information you can refer to: + Since the version 2.0, MIIC can in addition process stationary time series + to unveil temporal causal graphs. + For more information, you can refer to: + Simon et al. eLife, reviewed preprint , Cabeli et al. PLoS Comp. Bio. 2020 , Verny et al. PLoS Comp. Bio. 2017 . License: GPL (>= 2) @@ -60,4 +79,4 @@ LinkingTo: SystemRequirements: C++14 LazyData: true Encoding: UTF-8 -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.1 diff --git a/NAMESPACE b/NAMESPACE index 53d7a453..35ae61e1 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,14 +1,17 @@ # Generated by roxygen2: do not edit by hand S3method(plot,miic) +S3method(plot,tmiic) export(computeMutualInfo) export(computeThreePointInfo) export(discretizeMDL) export(discretizeMutual) +export(estimateTemporalDynamic) export(miic) export(miic.export) export(miic.write.network.cytoscape) export(miic.write.style.cytoscape) +export(tmiic.export) import(Rcpp) importFrom(stats,density) importFrom(stats,sd) diff --git a/NEWS.md b/NEWS.md index d637b348..a5055187 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,13 +1,72 @@ # Development version +# v2.0.0 + +## Features + +- tMIIC version for temporal causal discovery on stationary time series: + new mode of MIIC to reconstruct networks from temporal stationary datasets. + [Simon et al., eLife, reviewed preprint] + (https://www.biorxiv.org/content/10.1101/2024.02.06.579177v1.abstract) + +## Known issues + +- A (very) large number of contributors can lead to a memory fault. + Initial fix has been reverted due to side effects. + +# v1.8.1 + +## Fixes and improvements + +- The discretization of continuous variables has been modified when dealing + with variables having a large number of identical values. + +- Fix for memory overflow on shared memory space. + +# v1.8.0 + +## Features + +- Addition of the 'is consequence' prior knowledge. 
Consequence variables are + excluded from the possible contributors, edges between consequences are + ignored and edges between a non consequence and a consequence are pre-oriented + toward the consequence. + +# v1.7.0 + +## Features + +- iMIIC version introducing genuine vs putative causes, contextual variables + and multiple enhancements to deal with very large datasets. + [Ribeiro-Dantas et al., iScience 2024] + (https://arxiv.org/abs/2303.06423) + +# v1.6.0 + +## Features + +- Enhancement of orientations using mutual information supremum principle for + finite datasets. + [Cabeli et al., Why21 at NeurIPS 2021] + (http://kinefold.curie.fr/isambertlab/PAPERS/cabeli_Why21-NeurIPS2021.pdf) + +- By default, MIIC does not propagate orientations anymore + and allows latent variables discovery during orientation step. + +# v1.5.3 + +## Features + +- Release to CRAN + # v1.5.2 ## Fixes and improvements -- Further refactoring of the C++ code for the computation of information +- Further refactoring of the C++ code for the computation of information. -- Fix minor bugs in the continuous computation +- Fix minor bugs in the continuous computation. -- Fix incompatibility with older versions of GCC (std::align) +- Fix incompatibility with older versions of GCC (std::align). # v1.5.1 @@ -29,11 +88,11 @@ edges. ## Fixes and improvements -- Faster post processing in R +- Faster post processing in R. -- Rework plot functionality +- Rework plot functionality. -- Fix a bug in the orientation part about the log score +- Fix a bug in the orientation part about the log score. - Refactor of the C++ code base (orientation). diff --git a/R/data.R b/R/data.R index 2b2e7d88..5fcd16d8 100755 --- a/R/data.R +++ b/R/data.R @@ -101,3 +101,20 @@ NULL #' #' @keywords data NULL + +#' Covid cases +#' +#' Demo dataset of chronological series to be used in temporal mode of miic. +#' Evolution of Covid cases on a subset of EU countries from 12/31/2019 to 06/18/2020. 
+#' Source of the data : European Centre for Disease Prevention and Control. +#' +#' @docType data +#' @usage data(covidCases) +#' @format A data.frame object. +#' @keywords datasets +#' @name covidCases +#' @references ECDC (\href{https://www.ecdc.europa.eu/en/publications-data/download-todays-data-geographic-distribution-covid-19-cases-worldwide}{ECDC link}) +#' +#' @keywords data +NULL + diff --git a/R/discretizeMutual.R b/R/discretizeMutual.R index 6dc7fd39..e65d88f3 100755 --- a/R/discretizeMutual.R +++ b/R/discretizeMutual.R @@ -1,6 +1,6 @@ #' Iterative dynamic programming for (conditional) mutual information through optimized discretization. #' @description This function chooses cutpoints in the input distributions by maximizing the mutual -#' information minus a complexity cost (computed as BIC or with the Normalized Maximum Likelihood ). The +#' information minus a complexity cost (computed as BIC or with the Normalized Maximum Likelihood). The #' (conditional) mutual information computed on the optimized discretized distributions effectively approaches #' the mutual information computed on the original continuous variables. 
#' @@ -52,11 +52,13 @@ #' #' @return A list that contains : #' \itemize{ -#' \item{two vectors containing the cutpoints for each variable : \emph{cutpoints1} corresponds to /emph{myDist1}, /emph{cutpoints2} corresponds to /emph{myDist2}.} +#' \item{two vectors containing the cutpoints for each variable : +#' \emph{cutpoints1} corresponds to \emph{X}, +#' \emph{cutpoints2} corresponds to \emph{Y}.} #' \item{\emph{niterations} is the number of iterations performed before convergence of the (C)MI estimation.} #' \item{\emph{iterationN}, lists contatining the cutpoint vectors for each iteration.} #' \item{\emph{info} and \emph{infok}, the estimated (C)MI value and (C)MI minus the complexity cost.} -#' \item{if $emph{plot} == TRUE, a plot object (requires ggplot2 and gridExtra).} +#' \item{if \emph{plot} == TRUE, a plot object (requires ggplot2 and gridExtra).} #' } #' @export #' @useDynLib miic diff --git a/R/miic.R b/R/miic.R index fc0747ba..e7fd51a1 100755 --- a/R/miic.R +++ b/R/miic.R @@ -5,12 +5,21 @@ #' from indirect effects amongst correlated variables, including cause-effect #' relationships and the effect of unobserved latent causes. #' -#' @details Starting from a complete graph, the method iteratively removes +#' @details In standard mode, starting from a complete graph, the method iteratively removes #' dispensable edges, by uncovering significant information contributions from #' indirect paths, and assesses edge-specific confidences from randomization of #' available data. The remaining edges are then oriented based on the signature #' of causality in observational data. #' +#' In temporal mode, miic reorganizes the dataset using the \emph{n_layers} and +#' \emph{delta_t} parameters to transform the time steps into lagged samples. +#' As starting point, a lagged graph is created with only edges having at +#' least one node laying on the last time step. +#' Then, miic standard algorithm is applied to remove dispensable edges. 
+#' The remaining edges are then oriented by using the temporality and the +#' signature of causality in observational data. The use of temporal mode +#' is exposed in Simon \emph{et al.}, eLife reviewed preprint 2024. +#' #' The method relies on an information theoretic based (conditional) independence #' test which is described in (Verny \emph{et al.}, PLoS Comp. Bio. 2017), #' (Cabeli \emph{et al.}, PLoS Comp. Bio. 2020). It deals with both categorical @@ -28,276 +37,460 @@ #' #' @references #' \itemize{ -#' \item Verny et al., \emph{PLoS Comp. Bio. 2017.} https://doi.org/10.1371/journal.pcbi.1005662 -#' \item Cabeli et al., \emph{PLoS Comp. Bio. 2020.} https://doi.org/10.1371/journal.pcbi.1007866 -#' \item Li et al., \emph{NeurIPS 2019} http://papers.nips.cc/paper/9573-constraint-based-causal-structure-learning-with-consistent-separating-sets.pdf +#' \item{Simon et al., \emph{eLife reviewed preprint} https://www.biorxiv.org/content/10.1101/2024.02.06.579177v1.abstract } +#' \item{Ribeiro-Dantas et al., \emph{iScience 2024} https://arxiv.org/abs/2303.06423 } +#' \item{Cabeli et al., \emph{PLoS Comp. Bio. 2020.} https://doi.org/10.1371/journal.pcbi.1007866 } +#' \item{Li et al., \emph{NeurIPS 2019} http://papers.nips.cc/paper/9573-constraint-based-causal-structure-learning-with-consistent-separating-sets.pdf } +#' \item{Verny et al., \emph{PLoS Comp. Bio. 2017.} https://doi.org/10.1371/journal.pcbi.1005662 } #' } #' -#' @param input_data [a data frame] +#' @param input_data [a data frame, required] +#' #' A n*d data frame (n samples, d variables) that contains the observational data. -#' Each column corresponds to one variable and each row is a sample that gives the -#' values for all the observed variables. The column names correspond to the -#' names of the observed variables. Numeric columns will be treated as continuous -#' values, factors and character as categorical. 
-#' -#' @param black_box [a data frame] -#' An optional E*2 data frame containing E pairs of variables that will be considered -#' as independent during the network reconstruction. In practice, these edges will not -#' be included in the skeleton initialization and cannot be part of the final result. -#' Variable names must correspond to the \emph{input_data} data frame. -#' -#' @param n_eff [a positive integer] -#' The n samples given in the \emph{input_data} data frame are expected -#' to be independent. In case of correlated samples such as in time series or -#' Monte Carlo sampling approaches, the effective number of independent samples -#' \emph{n_eff} can be estimated using the decay of the autocorrelation function -#' (Verny \emph{et al.}, PLoS Comp. Bio. 2017). This \emph{effective} number \emph{n_eff} -#' of \emph{independent} samples can be provided using this parameter. -#' -#' @param cplx [a string; \emph{c("nml", "mdl")}] -#' In practice, the finite size of the input -#' dataset requires that the 2-point and 3-point information measures should be -#' \emph{shifted} by a \emph{complexity} term. The finite size corrections can be -#' based on the Minimal Description Length (MDL) criterion (set the option with "mdl"). -#' In practice, the MDL complexity criterion tends to underestimate the relevance of -#' edges connecting variables with many different categories, leading to the removal of -#' false negative edges. To avoid such biases with finite datasets, the (universal) -#' Normalized Maximum Likelihood (NML) criterion can be used (set the option with "nml"). -#' The default is "nml" (see Affeldt \emph{et al.}, UAI 2015). -#' -#' @param latent [a string; \emph{c("orientation", "no", "yes")}] -#' When set to "yes", the network reconstruction is taking into account hidden (latent) -#' variables. When set to "orientation", latent variables are not considered during the skeleton -#' reconstruction but allows bi-directed edges during the orientation. 
Dependence -#' between two observed variables due to a latent variable is indicated with a '6' in -#' the adjacency matrix and in the network edges.summary and by a bi-directed edge -#' in the (partially) oriented graph. -#' -#' @param orientation [a boolean value] -#' The miic network skeleton can be partially directed -#' by orienting and propagating edge directions, based on the sign and magnitude -#' of the conditional 3-point information of unshielded triples. The propagation -#' procedure relyes on probabilities; for more details, see Verny \emph{et al.}, PLoS Comp. Bio. 2017). -#' If set to FALSE the orientation step is not performed. -#' -#' @param ori_proba_ratio [a floating point between 0 and 1] The threshold when -#' deducing the type of an edge tip (head/tail) from the probability of -#' orientation. For a given edge tip, denote by p the probability of it being a -#' head, the orientation is accepted if (1 - p) / p < ori_proba_ratio. 0 means -#' reject all orientations, 1 means accept all orientations. -#' -#' @param ori_consensus_ratio [a floating point between 0 and 1] The threshold -#' when deducing the type of an consensus edge tip (head/tail) from the average -#' probability of orientation. For a given edge tip, denote by p the probability -#' of it being a head, the orientation is accepted if (1 - p) / p < -#' ori_consensus_ratio. 0 means reject all orientations, 1 means accept all -#' orientations. -#' -#' @param propagation [a boolean value] -#' If set to FALSE, the skeleton is partially oriented with only the -#' v-structure orientations. Otherwise, the v-structure orientations are -#' propagated to downstream undirected edges in unshielded triples following -#' the orientation method #' -#' @param state_order [a data frame] An optional data frame providing extra -#' information for variables. 
It must have d rows where d is the number of input -#' variables, and the following structure (named columns): +#' In standard mode, each column corresponds to one variable and each row is a +#' sample that gives the values for all the observed variables. +#' The column names correspond to the names of the observed variables. +#' Numeric columns with at least 5 distinct values will be treated as continuous +#' by default whilst numeric columns with less than 5 distinct values, factors +#' and characters will be considered as categorical. +#' +#' In temporal mode, the expected data frame layout is variables as columns +#' and time series/time steps as rows. +#' The time step information must be supplied in the first column and, +#' for each time series, be consecutive and in ascending order (increment of 1). +#' Multiple trajectories can be provided, miic will consider that a new trajectory +#' starts each time a smaller time step than the one of the previous row is encountered. +#' +#' @param state_order [a data frame, optional, NULL by default] +#' +#' A data frame providing extra information for variables. It must have d rows +#' where d is the number of input variables and possible columns are described +#' below. For optional columns, if they are not provided or contain missing +#' values, default values suitable for \emph{input_data} will be used. #' -#' "var_names" (required) contains the name of each variable as specified -#' by colnames(input_data). +#' \emph{"var_names"} (required) contains the name of each variable as specified +#' by colnames(input_data). In temporal mode, the time steps column should +#' not be mentioned in the variables list. #' -#' "var_type" (optional) contains a binary value that specifies if each +#' \emph{"var_type"} (optional) contains a binary value that specifies if each #' variable is to be considered as discrete (0) or continuous (1). 
#' -#' "levels_increasing_order" (optional) contains a single character string +#' \emph{"levels_increasing_order"} (optional) contains a single character string #' with all of the unique levels of the ordinal variable in increasing order, #' delimited by comma ','. It will be used during the post-processing to compute #' the sign of an edge using Spearman's rank correlation. If a variable is #' continuous or is categorical but not ordinal, this column should be NA. #' -#' "is_contextual" (optional) contains a binary value that specifies if a +#' \emph{"is_contextual"} (optional) contains a binary value that specifies if a #' variable is to be considered as a contextual variable (1) or not (0). #' Contextual variables cannot be the child node of any other variable (cannot #' have edge with arrowhead pointing to them). #' -#' "is_consequence" (optional) contains a binary value that specifies if a +#' \emph{"is_consequence"} (optional) contains a binary value that specifies if a #' variable is to be considered as a consequence variable (1) or not (0). -#' Consequence variables cannot be the parent node of any other variable -#' and cannot be used as contributors. -#' -#' @param true_edges [a data frame] -#' An optional E*2 data frame containing the E edges of the true graph for -#' computing performance after the run. -#' -#' @param n_shuffles [a positive integer] The number of shufflings of -#' the original dataset in order to evaluate the edge specific confidence -#' ratio of all inferred edges. Default is 0: no confidence cut. If the -#' number of shufflings is set to an integer > 0, the confidence threshold -#' must also be > 0 (i.e: n_shuffles=100 and conf_threshold=0.01). -#' -#' @param conf_threshold [a positive floating point] The threshold used -#' to filter the less probable edges following the skeleton step. See Verny -#' \emph{et al.}, PLoS Comp. Bio. 2017. Default is 0: no confidence cut. 
If the -#' the confidence threshold is set > 0, the number of shufflings must also -#' be defined > 0 (i.e: n_shuffles=100 and conf_threshold=0.01). -#' -#' @param sample_weights [a numeric vector] -#' An optional vector containing the weight of each observation. NULL by default. -#' If defined, it must be a vector of floats in the range [0,1] of size equal -#' to the number of samples. -#' -#' @param test_mar [a boolean value] -#' If set to TRUE, distributions with missing values will be tested with Kullback-Leibler -#' divergence : conditioning variables for the given link \eqn{X\rightarrow Y}\eqn{Z} will be -#' considered only if the divergence between the full distribution and the non-missing -#' distribution \eqn{KL(P(X,Y) | P(X,Y)_{!NA})} is low enough (with \eqn{P(X,Y)_{!NA}} as -#' the joint distribution of \eqn{X} and \eqn{Y} on samples which are not missing on Z. +#' Edges between consequence variables are ignored, consequence variables +#' cannot be the parent node of any other variable and cannot be used as +#' contributors. Edges between a non consequence and consequence variables +#' are pre-oriented toward the consequence. +#' +#' Several other columns are possible in temporal mode: +#' +#' \emph{"n_layers"} (optional) contains an integer value that specifies the +#' number of layers to be considered for the variable. +#' Note that if a \emph{"n_layers"} column is present in the \emph{state_order}, +#' its values will overwrite the function parameter. +#' +#' \emph{"delta_t"} (optional) contains an integer value that specifies the number +#' of time steps between each layer for the variable. +#' Note that if a \emph{"delta_t"} column is present in the \emph{state_order}, +#' its values will overwrite the function parameter. +#' +#' \emph{"movavg"} (optional) contains an integer value that specifies the size of +#' the moving average window to be applied to the variable. 
+#' Note that if \emph{"movavg"} column is present in the \emph{state_order}, +#' its values will overwrite the function parameter. +#' +#' @param true_edges [a data frame, optional, NULL by default] +#' +#' A data frame containing the edges of the true graph for +#' computing performance after the run.\cr +#' In standard mode, the expected layout is a two columns data frame, each row +#' representing a true edge with in each column, the variable names. +#' Variables names must exist in the \emph{input_data} data frame.\cr +#' In temporal mode, the expected layout is a three columns data frame, +#' with the first two columns being variable names and the third the lag. +#' Variables names must exist in the \emph{input_data} data frame and the lag +#' must be valid in the time unfolded graph. e.g. a row var1, var2, 3 is valid +#' with \emph{n_layers} = 4 + \emph{delta_t} = 1 or +#' \emph{n_layers} = 2 + \emph{delta_t} = 3 +#' but not for \emph{n_layers} = 2 + \emph{delta_t} = 2 as there is no matching +#' edge in the time unfolded graph. Please note that the order is important: +#' var1, var2, 3 is interpreted as var1_lag3 - var2_lag0. Please note also that, +#' for contextual variables that are not lagged, the expected value in the +#' third column for the time lag is NA. +#' +#' @param black_box [a data frame, optional, NULL by default] +#' +#' A data frame containing pairs of variables that will be considered +#' as independent during the network reconstruction. In practice, these edges +#' will not be included in the skeleton initialization and cannot be part of +#' the final result.\cr +#' In standard mode, the expected layout is a two columns data frame, each row +#' representing a forbidden edge with in each column, the variable names. +#' Variables names must exist in the \emph{input_data} data frame.\cr +#' In temporal mode, the expected layout is a three columns data frame, +#' with the first two columns being variable names and the third the lag. 
+#' Variables names must exist in the \emph{input_data} data frame and the lag +#' must be valid in the time unfolded graph. e.g. a row var1, var2, 3 is valid +#' with \emph{n_layers} = 4 + \emph{delta_t} = 1 or +#' \emph{n_layers} = 2 + \emph{delta_t} = 3 +#' but not for \emph{n_layers} = 2 + \emph{delta_t} = 2 as there is no matching +#' edge in the time unfolded graph. Please note that the order is important: +#' var1, var2, 3 is interpreted as var1_lag3 - var2_lag0. Please note also that, +#' for contextual variables that are not lagged, the expected value in the +#' third column for the time lag is NA. +#' +#' @param n_threads [a positive integer, optional, 1 by default] +#' +#' When set greater than 1, n_threads parallel threads will be used for computation. Make sure +#' your compiler is compatible with openmp if you wish to use multithreading. +#' +#' @param cplx [a string, optional, "nml" by default, possible values: +#' "nml", "mdl"] +#' +#' In practice, the finite size of the input dataset requires that +#' the 2-point and 3-point information measures should be \emph{shifted} +#' by a \emph{complexity} term. The finite size corrections can be based on +#' the Minimal Description Length (MDL) criterion. +#' However, the MDL complexity criterion tends to underestimate the +#' relevance of edges connecting variables with many different categories, +#' leading to the removal of false negative edges. To avoid such biases +#' with finite datasets, the (universal) Normalized Maximum Likelihood (NML) +#' criterion can be used (see Affeldt \emph{et al.}, UAI 2015). +#' +#' @param orientation [a boolean value, optional, TRUE by default] +#' +#' The miic network skeleton can be partially directed by orienting +#' edge directions, based on the sign and magnitude of the conditional +#' 3-point information of unshielded triples and, in temporal mode, using time. +#' If set to FALSE, the orientation step is not performed. 
+#' +#' @param ori_proba_ratio [a floating point between 0 and 1, optional, +#' 1 by default] +#' +#' The threshold when deducing the type of an edge tip (head/tail) +#' from the probability of orientation. +#' For a given edge tip, denote by p the probability of it being a head, +#' the orientation is accepted if (1 - p) / p < \emph{ori_proba_ratio}. +#' 0 means reject all orientations, 1 means accept all orientations. +#' +#' @param ori_consensus_ratio [a floating point between 0 and 1, optional, +#' NULL by default] +#' +#' The threshold when deducing the type of an consensus edge tip (head/tail) +#' from the average probability of orientation. +#' For a given edge tip, denote by p the probability of it being a head, +#' the orientation is accepted if (1 - p) / p < \emph{ori_consensus_ratio}. +#' 0 means reject all orientations, 1 means accept all orientations. +#' If not supplied, the \emph{ori_consensus_ratio} will be initialized with +#' the \emph{ori_proba_ratio} value. +#' +#' @param propagation [a boolean value, optional, FALSE by default] +#' +#' If set to FALSE, the skeleton is partially oriented with only the +#' v-structure orientations. Otherwise, the v-structure orientations are +#' propagated to downstream undirected edges in unshielded triples following +#' the propagation procedure, relying on probabilities (for more details, +#' see Verny \emph{et al.}, PLoS Comp. Bio. 2017). +#' +#' @param latent [a string, optional, "orientation" by default, possible +#' values: "orientation", "no", "yes"] +#' +#' When set to "yes", the network reconstruction is taking into account hidden +#' (latent) variables. When set to "orientation", latent variables are not +#' considered during the skeleton reconstruction but allows bi-directed edges +#' during the orientation. 
+#' Dependence between two observed variables due to a latent variable is +#' indicated with a '6' in the adjacency matrix and in the network +#' edges.summary and by a bi-directed edge in the (partially) oriented graph. +#' +#' @param n_eff [a positive integer, optional, -1 by default] +#' +#' In standard mode, the n samples given in the \emph{input_data} data frame are +#' expected to be independent. In case of correlated samples such as in +#' Monte Carlo sampling approaches, the effective number of independent samples +#' \emph{n_eff} can be estimated using the decay of the autocorrelation function +#' (Verny et al., PLoS Comp. Bio. 2017). This effective number \emph{n_eff} +#' of independent samples can be provided using this parameter. +#' +#' @param n_shuffles [a positive integer, optional, 0 by default] +#' +#' The number of shufflings of the original dataset in order to evaluate +#' the edge specific confidence ratio of all inferred edges. +#' Default is 0: no confidence cut is applied. If the number of shufflings +#' is set to an integer > 0, the confidence threshold must also be > 0 +#' (e.g. \emph{n_shuffles} = 100 and \emph{conf_threshold} = 0.01). +#' +#' @param conf_threshold [a positive floating point, optional, 0 by default] +#' +#' The threshold used to filter the less probable edges following the skeleton +#' step. See Verny \emph{et al.}, PLoS Comp. Bio. 2017. +#' Default is 0: no confidence cut is applied. If the confidence threshold +#' is set > 0, the number of shufflings must also be > 0 +#' (e.g. \emph{n_shuffles} = 100 and \emph{conf_threshold} = 0.01). +#' +#' @param sample_weights [a numeric vector, optional, NULL by default] +#' +#' A vector containing the weight of each observation. If defined, it must be +#' a vector of floats in the range [0,1] of size equal to the number of samples. 
+#' +#' @param test_mar [a boolean value, optional, TRUE by default] +#' +#' If set to TRUE, distributions with missing values will be tested with +#' Kullback-Leibler divergence: conditioning variables for the given link +#' \eqn{X - Y}, \eqn{Z} will be considered only if the divergence +#' between the full distribution and the non-missing distribution +#' \eqn{KL(P(X,Y) | P(X,Y)_{!NA})} is low enough (with \eqn{P(X,Y)_{!NA}} as +#' the joint distribution of \eqn{X} and \eqn{Y} on samples which are +#' not missing on Z. #' This is a way to ensure that data are missing at random for the considered -#' interaction and to avoid selection bias. Set to TRUE by default -#' -#' @param consistent [a string; \emph{c("no", "orientation", "skeleton")}] -#' if "orientation": iterate over skeleton and orientation steps to ensure -#' consistency of the network; -#' if "skeleton": iterate over skeleton step to get a consistent skeleton, then -#' orient edges and discard inconsistent orientations to ensure consistency of -#' the network. See (Li \emph{et al.}, NeurIPS 2019) for details. -#' -#' @param max_iteration [a positive integer] When the \emph{consistent} parameter -#' is set to "skeleton" or "orientation", the maximum number of iterations -#' allowed when trying to find a consistent graph. Set to 100 by default. -#' -#' @param consensus_threshold [a floating point between 0.5 and 1.0] When the -#' \emph{consistent} parameter is set to "skeleton" or "orientation", and when -#' the result graph is inconsistent, or is a union of more than one inconsistent -#' graphs, a consensus graph will be produced based on a pool of graphs. If the -#' result graph is inconsistent, then the pool is made of [max_iteration] graphs -#' from the iterations, otherwise it is made of those graphs in the union. In -#' the consensus graph, an edge is present when the proportion of non-zero +#' interaction and to avoid selection bias. 
+#' +#' @param consistent [a string, optional, "no" by default, possible values: +#' "no", "orientation", "skeleton"] +#' +#' If set to "orientation": iterate over skeleton and orientation steps to +#' ensure consistency of the network. +#' If set to "skeleton": iterate over skeleton step to get a consistent skeleton, +#' then orient edges and discard inconsistent orientations to ensure consistency +#' of the network (see Li \emph{et al.}, NeurIPS 2019 for details). +#' +#' @param max_iteration [a positive integer, optional, 100 by default] +#' +#' When the \emph{consistent} parameter is set to "skeleton" or "orientation", +#' the maximum number of iterations allowed when trying to find a consistent +#' graph. +#' +#' @param consensus_threshold [a floating point between 0.5 and 1.0, optional, +#' 0.8 by default] +#' +#' When the \emph{consistent} parameter is set to "skeleton" or "orientation" +#' and when the result graph is inconsistent or is a union of more than +#' one inconsistent graphs, a consensus graph will be produced based on +#' a pool of graphs. +#' If the result graph is inconsistent, then the pool is made of +#' \emph{max_iteration} graphs from the iterations, otherwise it is made of +#' those graphs in the union. +#' In the consensus graph, an edge is present when the proportion of non-zero #' status in the pool is above the threshold. For example, if the pool contains #' [A, B, B, 0, 0], where "A", "B" are different status of the edge and "0" -#' indicates the absence of the edge. Then the edge is set to connected ("1") if -#' the proportion of non-zero status (0.6 in the example) is equal to or higher -#' than [consensus_threshold]. (When set to connected, the orientation of the -#' edge will be further determined by the average probability of orientation.) -#' Set to 0.8 by default. +#' indicates the absence of the edge. 
Then the edge is set to connected ("1")
+#' if the proportion of non-zero status (0.6 in the example) is equal to
+#' or higher than \emph{consensus_threshold}. (When set to connected,
+#' the orientation of the edge will be further determined by the average
+#' probability of orientation.)
+#'
+#' @param negative_info [a boolean value, optional, FALSE by default]
+#'
+#' For test purpose only. If TRUE, negative shifted mutual information is
+#' allowed during the computation when mutual information is inferior to the
+#' complexity term.
+#' For small dataset with complicated structures, e.g. discrete variables with
+#' many levels, allowing for negative shifted mutual information may help
+#' identifying weak v-structures related to those discrete variables,
+#' as the negative three-point information in those cases will come from
+#' the difference between two negative shifted mutual information terms
+#' (expected to be negative due to the small sample size).
+#' However, under this setting, a v-structure (X -> Z <- Y) in the final graph
+#' does not necessarily imply that X is dependent on Y conditioning on Z.
+#' As a consequence, the interpretability of the final graph
+#' is hindered. In practice, it's advised to keep this parameter as FALSE.
 #'
-#' @param verbose [a boolean value] If TRUE, debugging output is printed.
+#' @param mode [a string, optional, "S" by default, possible values are
+#' "S": Standard (IID samples) or "TS": Temporal Stationary]
 #'
-#' @param n_threads [a positive integer]
-#' When set greater than 1, n_threads parallel threads will be used for computation. Make sure
-#' your compiler is compatible with openmp if you wish to use multithreading.
+#' When temporal mode is activated, the time information must be provided
+#' in the first column of \emph{input_data}. For more details about temporal
+#' stationary mode, see Simon \emph{et al.}, eLife reviewed preprint 2024.
#' -#' @param negative_info [a boolean value] For test purpose only. FALSE by -#' default. If TRUE, negative shifted mutual information is allowed during the -#' computation when mutual information is inferior to the complexity term. For -#' small dateset with complicated structures, e.g., discrete variables with many -#' levels, allowing for negative shifted mutual information may help identifying -#' weak v-structures related to those discrete variables, as the negative -#' three-point information in those cases will come from the difference between -#' two negative shifted mutual information terms (expected to be negative due to -#' the small sample size). However, under this setting, a v-structure (X -> Z <- -#' Y) in the final graph does not necessarily imply that X is dependent on Y -#' conditioning on Z, As a consequence, the interpretability of the final graph -#' is hindered. In practice, it's advised to keep this parameter as FALSE. +#' @param n_layers [an integer, optional, NULL by default, must be >= 2 +#' if supplied] +#' +#' Used only in temporal mode, \emph{n_layers} defines the number of layers +#' that will be considered for the variables in the time unfolded graph. +#' The layers will be distant of \emph{delta_t} time steps. +#' If not supplied, the number of layers is estimated from the dynamic of the +#' dataset and the maximum number of nodes \emph{max_nodes} allowed in the +#' final lagged graph. +#' +#' @param delta_t [an integer, optional, NULL by default, must be >= 1 +#' if supplied] +#' +#' Used only in temporal mode, \emph{delta_t} defines the number of time steps +#' between each layer. +#' i.e. on 1000 time steps with \emph{n_layers} = 3 and \emph{delta_t} = 7, +#' the time steps kept for the samples conversion will be 1, 8, 15 +#' for the first sample, the next sample will use 2, 9, 16 and so on. +#' If not supplied, the number of time steps between layers is estimated +#' from the dynamic of the dataset and the number of layers. 
+#' +#' @param movavg [an integer, optional, NULL by default, must be >= 2 +#' if supplied] +#' +#' Used only in temporal mode. When supplied, a moving average operation is +#' applied to all integer and numeric variables that are not contextual +#' variables. +#' +#' @param keep_max_data [a boolean value, optional, FALSE by default] +#' +#' Used only in temporal mode. If TRUE, rows where some NAs have been +#' introduced during the moving averages and lagging will be kept +#' whilst they will be dropped if FALSE. +#' +#' @param max_nodes [an integer, optional, 50 by default] +#' +#' Used only in temporal mode and if the \emph{n_layers} or \emph{delta_t} +#' parameters are not supplied. \emph{max_nodes} is used as the maximum number +#' of nodes in the final graph to compute \emph{n_layers} and/or \emph{delta_t}. +#' The default is 50 to produce quick runs and can be increased up to 200 +#' or 300 on recent computers to produce more precise results. +#' +#' @param verbose [a boolean value, optional, FALSE by default] +#' +#' If TRUE, debugging output is printed. #' #' @return A \emph{miic-like} object that contains: #' \itemize{ -#' \item{all.edges.summary:}{ a data frame with information about the relationship between +#' \item{\emph{all.edges.summary:} a data frame with information about the relationship between #' each pair of variables #' \itemize{ -#' \item \emph{x:} X node -#' \item \emph{y:} Y node -#' \item \emph{type:} contains 'N' if the edge has -#' been removed or 'P' for retained edges. If a true edges file is given, -#' 'P' becomes 'TP' (True Positive) or 'FP' (False Positive), while -#' 'N' becomes 'TN' (True Negative) or 'FN' (False Negative). -#' \item \emph{ai:} the contributing nodes found by the method which participate in -#' the mutual information between \emph{x} and \emph{y}, and possibly separate them. -#' \item \emph{info:} provides the pairwise mutual information times \emph{Nxyi} for -#' the pair (\emph{x}, \emph{y}). 
-#' \item \emph{info_cond:} provides the conditional mutual information times \emph{Nxy_ai} for -#' the pair (\emph{x}, \emph{y}) when conditioned on the collected nodes \emph{ai}. It is -#' equal to the \emph{info} column when \emph{ai} is an empty set. -#' \item \emph{cplx:} gives the computed complexity between the (\emph{x}, \emph{y}) -#' variables taking into account the contributing nodes \emph{ai}. Edges that have -#' have more conditional information \emph{info_cond} than \emph{cplx} are retained in the -#' final graph. -#' \item \emph{Nxy_ai:} gives the number of complete samples on which the information and -#' the complexity have been computed. If the input dataset has no missing value, the -#' number of samples is the same for all pairs and corresponds to the total -#' number of samples. -#' \item \emph{info_shifted:} represents the \emph{info} - \emph{cplx} value. -#' It is a way to quantify the strength of the edge (\emph{x}, \emph{y}). -#' \item \emph{confidenceRatio:} this column is present if the confidence cut -#' is > 0 and it represents the ratio between the probability to reject +#' \item{ \emph{x:} X node} +#' \item{ \emph{y:} Y node} +#' \item{ \emph{type:} contains 'N' if the edge has been removed or 'P' for +#' retained edges. If the true graph is supplied in the \emph{true_edges} +#' parameter, 'P' becomes 'TP' (True Positive) or 'FP' (False Positive), +#' while 'N' becomes 'TN' (True Negative) or 'FN' (False Negative). 
+#' Note that, as the \emph{all.edges.summary} does not contain all the
+#' negative edges, edges not present are 'TN'.}
+#' \item{ \emph{ai:} the contributing nodes found by the method which
+#' participate in the mutual information between \emph{x} and \emph{y},
+#' and possibly separate them.}
+#' \item{ \emph{raw_contributions:} describes the share of total mutual
+#' information between \emph{x} and \emph{y} explained by each contributor.}
+#' \item{ \emph{contributions:} describes the share of remaining mutual
+#' information between \emph{x} and \emph{y} explained by each successive
+#' contributors.}
+#' \item{ \emph{info:} provides the pairwise mutual information times
+#' \emph{Nxyi} for the pair (\emph{x}, \emph{y}).}
+#' \item{ \emph{info_cond:} provides the conditional mutual information
+#' times \emph{Nxy_ai} for the pair (\emph{x}, \emph{y}) when conditioned
+#' on the collected nodes \emph{ai}. It is
+#' equal to the \emph{info} column when \emph{ai} is an empty set.}
+#' \item{ \emph{cplx:} gives the computed complexity between the (\emph{x},
+#' \emph{y}) variables taking into account the contributing nodes \emph{ai}.
+#' Edges that have more conditional information \emph{info_cond}
+#' than \emph{cplx} are retained in the final graph.}
+#' \item{ \emph{Nxy_ai:} gives the number of complete samples on which the
+#' information and the complexity have been computed. If the input dataset
+#' has no missing value, the number of samples is the same for all pairs
+#' and corresponds to the total number of samples.}
+#' \item{ \emph{info_shifted:} represents the \emph{info} - \emph{cplx} value.
+#' It is a way to quantify the strength of the edge (\emph{x}, \emph{y}).}
+#' \item{ \emph{infOrt:} the orientation of the edge (\emph{x}, \emph{y}).
+#' It is the same value as in the adjacency matrix at row \emph{x} and
+#' column \emph{y} : 1 for unoriented, 2 for an edge from X to Y,
+#' -2 from Y to X and 6 for bidirectional.}
+#' \item{ \emph{trueOrt:} the orientation of the edge (\emph{x}, \emph{y})
+#' present if the true edges are provided.}
+#' \item{ \emph{isOrtOk:} information about the consistency of the inferred
+#' graph’s orientations with a reference graph is given (if true edges
+#' are provided).
+#' 'Y': the orientation is consistent; 'N': the orientation is not consistent
+#' with the PAG (Partial Ancestor Graph) derived from the given true graph.}
+#' \item{ \emph{sign:} the sign of the partial correlation between variables
+#' \emph{x} and \emph{y}, conditioned on the contributing nodes \emph{ai}.}
+#' \item{ \emph{partial_correlation:} value of the partial correlation for the
+#' edge (\emph{x}, \emph{y}) conditioned on the contributing nodes \emph{ai}.}
+#' \item{ \emph{is_causal:} details about the nature of the arrow tip for a
+#' directed edge. A directed edge in a causal graph does not necessarily imply
+#' causation but it does imply that the cause-effect relationship is not the
+#' other way around. An arrow-tip which is itself downstream of another
+#' directed edge suggests stronger causal sense and is marked by a 'Y',
+#' or 'N' otherwise.}
+#' \item{ \emph{proba:} probabilities for the inferred orientation, derived
+#' from the three-point mutual information (cf Affeldt & Isambert, UAI 2015
+#' proceedings) and noted as p(x->y);p(x<-y).}
+#' \item{ \emph{confidence:} this column is computed when the confidence cut
+#' is activated. It represents the ratio between the probability to reject
 #' the edge (\emph{x}, \emph{y}) in the dataset versus the mean probability
-#' to do the same in multiple (user defined) number of randomized datasets.
-#' \item \emph{infOrt:} the orientation of the edge (\emph{x}, \emph{y}).
It is -#' the same value as in the adjacency matrix at row \emph{x} and column \emph{y} : 1 for -#' unoriented, 2 for an edge from X to Y, -2 from Y to X and 6 for bidirectional. -#' \item \emph{trueOrt:} the orientation of the edge (\emph{x}, \emph{y}) present -#' in the true edges file if provided. -#' \item \emph{isOrtOk:} information about the consistency of the inferred graph’s -#' orientations with a reference graph is given (i.e. if true edges file is provided). -#' Y: the orientation is consistent; N: the orientation is not consistent with -#' the PAG (Partial Ancestor Graph) derived from the given true graph. -#' \item \emph{sign:} the sign of the partial correlation between variables -#' \emph{x} and \emph{y}, conditioned on the contributing nodes \emph{ai}. -#' \item \emph{partial_correlation:} value of the partial correlation for the -#' edge (\emph{x}, \emph{y}) conditioned on the contributing nodes \emph{ai}. -#' \item \emph{isCausal:} details about the nature of the arrow tip for a directed -#' edge. A directed edge in a causal graph does not necessarily imply causation but it -#' does imply that the cause-effect relationship is not the other way around. An arrow-tip -#' which is itself downstream of another directed edge suggests stronger causal sense and is -#' marked by a 'Y', or 'N' otherwise. -#' \item \emph{proba:} probabilities for the inferred orientation, derived from the three-point -#' mutual information (cf Affeldt & Isambert, UAI 2015 proceedings) and noted as p(x->y);p(x<-y). 
+#' to do the same in multiple (user defined) number of randomized datasets.} #' } #' } #' -#' \item{orientations.prob:} {this data frame lists the orientation probabilities of the two edges of all unshielded triples -#' of the reconstructed network with the structure: node1 -- mid-node -- node2:} +#' \item{\emph{orientations.prob:} this data frame lists the orientation +#' probabilities of the two edges of all unshielded triples of the +#' reconstructed network with the structure: node1 -- mid-node -- node2: #' \itemize{ -#' \item node1: node at the end of the unshielded triplet -#' \item p1: probability of the arrowhead node1 <- mid-node -#' \item p2: probability of the arrowhead node1 -> mid-node -#' \item mid-node: node at the center of the unshielded triplet -#' \item p3: probability of the arrowhead mid-node <- node2 -#' \item p4: probability of the arrowhead mid-node -> node2 -#' \item node2: node at the end of the unshielded triplet -#' \item NI3: 3 point (conditional) mutual information * N +#' \item{ \emph{node1:} node at the end of the unshielded triplet} +#' \item{ \emph{p1:} probability of the arrowhead node1 <- mid-node} +#' \item{ \emph{p2:} probability of the arrowhead node1 -> mid-node} +#' \item{ \emph{mid-node:} node at the center of the unshielded triplet} +#' \item{ \emph{p3:} probability of the arrowhead mid-node <- node2} +#' \item{ \emph{p4:} probability of the arrowhead mid-node -> node2} +#' \item{ \emph{node2:} node at the end of the unshielded triplet} +#' \item{ \emph{NI3:} 3 point (conditional) mutual information * N} +#' \item{ \emph{Conflict:} indicates if there a conflict between the +#' computed probabilities and the NI3 value} +#' } #' } #' -#' \item {adj_matrix:} the adjacency matrix is a square matrix used to represent -#' the inferred graph. The entries of the matrix indicate whether pairs of -#' vertices are adjacent or not in the graph. 
The matrix can be read as a -#' (row, column) set of couples where the row represents the source node and -#' the column the target node. Since miic can reconstruct mixed networks +#' \item {\emph{adj_matrix:} the adjacency matrix is a square matrix used to +#' represent the inferred graph. The entries of the matrix indicate whether +#' pairs of vertices are adjacent or not in the graph. The matrix can be read +#' as a (row, column) set of couples where the row represents the source node +#' and the column the target node. Since miic can reconstruct mixed networks #' (including directed, undirected and bidirected edges), we will have a #' different digit for each case: #' \itemize{ -#' \item 1: (\emph{x}, \emph{y}) edge is undirected -#' \item 2: (\emph{x}, \emph{y}) edge is directed as \emph{x} -> \emph{y} -#' \item -2: (\emph{x}, \emph{y}) edge is directed as \emph{x} <- \emph{y} -#' \item 6: (\emph{x}, \emph{y}) edge is bidirected +#' \item{ 1: (\emph{x}, \emph{y}) edge is undirected} +#' \item{ 2: (\emph{x}, \emph{y}) edge is directed as \emph{x} -> \emph{y} } +#' \item{ -2: (\emph{x}, \emph{y}) edge is directed as \emph{x} <- \emph{y} } +#' \item{ 6: (\emph{x}, \emph{y}) edge is bidirected} +#' } #' } #' -#' \item {params:} the list of parameters used for the network reconstruction. -#' The parameters not supplied are initialized to their default values. -#' Otherwise, the parameters are checked and corrected if necessary. +#' \item {\emph{proba_adj_matrix:} the probability adjacency matrix is +#' a square matrix used to represent the orientation probabilities associated +#' to the edges tips. The value at ("row", "column") is the probability, +#' for the edge between "row" and "column" nodes, of the edge tip on the "row" +#' side. A probability less than 0.5 is an indication of a possible tail +#' (cause) and a probability greater than 0.5 a possible head (effect). 
+#' } +#' +#' \item {\emph{params:} the list of parameters used for the network +#' reconstruction. The parameters not supplied are initialized to their default +#' values. Otherwise, the parameters are checked and corrected if necessary.} #' -#' \item {state_order:} the state order used for the network reconstruction. -#' If no state order is supplied, it is generated by using default values. -#' Otherwise, it is the state order checked and corrected if necessary. +#' \item {\emph{state_order:} the state order used for the network +#' reconstruction. If no state order is supplied, it is generated by using +#' default values. Otherwise, it is the state order checked and corrected +#' if necessary.} #' -#' \item {black_box:} present only if a black box has been supplied: +#' \item {\emph{black_box:} present only if a black box has been supplied, #' the black box, checked and corrected if necessary, used for the network -#' reconstruction. +#' reconstruction.} #' -#' \item {true_edges:} present only if the true edges have been supplied, +#' \item {\emph{true_edges:} present only if the true edges have been supplied, #' the true edges, checked and corrected if necessary, used for the network -#' evaluation. +#' evaluation.} +#' +#' \item {\emph{tmiic:} present only in temporal mode. +#' Named list containing the full list of edges completed by stationarity, +#' the lagged state order and, if a black box or true edges have been supplied, +#' the lagged versions of these inputs.} #' } +#' #' @export #' @useDynLib miic #' @import Rcpp @@ -360,6 +553,27 @@ #' # write graph to graphml format. Note that to correctly visualize #' # the network we created the miic style for Cytoscape (http://www.cytoscape.org/). 
#' miic.write.network.cytoscape(g = miic.res, file = file.path(tempdir(), "temp")) +#' +#' # EXAMPLE COVID CASES (time series demo) +#' data(covidCases) +#' # execute MIIC (reconstruct graph in temporal mode) +#' tmiic.res <- miic(input_data = covidCases, mode = "TS", n_layers = 3, delta_t = 1, movavg = 14) +#' +#' # to plot the default graph (compact) +#' if(require(igraph)) { +#' plot(tmiic.res) +#' } +#' +#' # to plot the raw temporal network Using igraph +#' if(require(igraph)) { +#' plot(tmiic.res, display="raw") +#' } +#' +#' # to plot the full temporal network Using igraph +#' if(require(igraph)) { +#' plot(tmiic.res, display="lagged") +#' } +#' #' } #' miic <- function(input_data, @@ -382,36 +596,110 @@ miic <- function(input_data, max_iteration = 100, consensus_threshold = 0.8, negative_info = FALSE, - verbose = FALSE) { + mode = "S", + n_layers = NULL, + delta_t = NULL, + movavg = NULL, + keep_max_data = FALSE, + max_nodes = 50, + verbose = FALSE) + { if (verbose) - cat("START miic...\n") + miic_msg ("Start MIIC...") + if ( is.null(mode) || ( ! (mode %in% MIIC_VALID_MODES) ) ) + miic_error ("parameters check", "invalid mode ", mode, + ". 
Possible modes are S (Standard), TS (Temporal Stationnary).") + if (mode %in% MIIC_TEMPORAL_MODES) + miic_msg ("Using temporal mode of MIIC") + # + # Check base inputs # - # Check inputs + input_data = check_input_data (input_data, mode) + params = check_parameters (input_data = input_data, + n_threads = n_threads, + cplx = cplx, + orientation = orientation, + ori_proba_ratio = ori_proba_ratio, + ori_consensus_ratio = ori_consensus_ratio, + propagation = propagation, + latent = latent, + n_eff = n_eff, + n_shuffles = n_shuffles, + conf_threshold = conf_threshold, + sample_weights = sample_weights, + test_mar = test_mar, + consistent = consistent, + max_iteration = max_iteration, + consensus_threshold = consensus_threshold, + mode = mode, + negative_info = negative_info, + verbose = verbose) + state_order = check_state_order (input_data, state_order, params$mode) + black_box = check_other_df (input_data, state_order, + black_box, "black box", params$mode) + true_edges = check_other_df (input_data, state_order, + true_edges, "true edges", params$mode) # - input_data = check_input_data (input_data) - list_params = check_parameters (input_data = input_data, - n_threads = n_threads, - cplx = cplx, - orientation = orientation, - ori_proba_ratio = ori_proba_ratio, - ori_consensus_ratio = ori_consensus_ratio, - propagation = propagation, - latent = latent, - n_eff = n_eff, - n_shuffles = n_shuffles, - conf_threshold = conf_threshold, - sample_weights = sample_weights, - test_mar = test_mar, - consistent = consistent, - max_iteration = max_iteration, - consensus_threshold = consensus_threshold, - negative_info = negative_info, - verbose = verbose) - state_order = check_state_order (input_data, state_order) - black_box = check_other_df (input_data, black_box, "black box") - true_edges = check_other_df (input_data, true_edges, "true edges") + # Extra steps depending on the mode # - # Extra changes on data using the state order information + if (! 
(mode %in% MIIC_TEMPORAL_MODES) ) + non_lagged_state_order = NULL + else + { + # Check temporal parameters and state_order + # + state_order = tmiic_check_state_order_part1 (state_order) + list_ret = tmiic_check_parameters (state_order = state_order, + params = params, + n_layers = n_layers, + delta_t = delta_t, + movavg = movavg, + keep_max_data = keep_max_data, + max_nodes = max_nodes) + params = list_ret$params + state_order = tmiic_check_state_order_part2 (list_ret$state_order) + list_ts = tmiic_extract_trajectories (input_data) + list_ts = tmiic_movavg (list_ts, state_order$movavg, + keep_max_data=params$keep_max_data, + verbose_level=ifelse (params$verbose, 2, 1) ) + state_order = tmiic_estimate_dynamic (list_ts, state_order, + max_nodes=params$max_nodes, + verbose_level=ifelse (params$verbose, 2, 1) ) + # + # Lag data and other inputs accordingly + # + non_lagged_state_order = state_order + non_lagged_true_edges = true_edges + non_lagged_black_box = black_box + state_order = tmiic_lag_state_order (non_lagged_state_order) + true_edges = tmiic_lag_other_df (non_lagged_state_order, true_edges) + true_edges = tmiic_check_other_df_after_lagging (state_order$var_names, + true_edges, "true edges") + black_box = tmiic_lag_other_df (non_lagged_state_order, black_box) + black_box = tmiic_check_other_df_after_lagging (state_order$var_names, + black_box, "black box") + list_ts = tmiic_lag_input_data (list_ts, state_order, + keep_max_data=params$keep_max_data) + input_data = tmiic_group_trajectories (list_ts) + # + # Check number of unique values per variable and review discrete/continuous + # after lagging as some columns may have less number of unique values + # + state_order = tmiic_check_after_lagging (input_data, state_order) + # + # Adjust n_eff if delta_t > 1 and no eff supplied by the user + # + avg_delta_t = mean (state_order$delta_t[state_order$is_contextual == 0]) + if ( (avg_delta_t > 1) && (params$n_eff == -1) ) + { + params$n_eff = trunc (nrow (input_data) 
/ avg_delta_t) + miic_msg ("Note : the n_eff has been set to ", params$n_eff, + " (nb lagged samples= ", nrow (input_data), + " / delta_t=", round(avg_delta_t, 2), ").") + } + } + # + # Convert discrete vars as factors # for ( i in 1:nrow(state_order) ) if (state_order[i, "var_type"] == 0) @@ -420,49 +708,81 @@ miic <- function(input_data, # Call C++ reconstruction # if (verbose) - cat("\t# -> START reconstruction...\n") + miic_msg ("-> Start reconstruction...") res <- miic.reconstruct (input_data = input_data, - n_threads = list_params$n_threads, - cplx = list_params$cplx, - latent = list_params$latent, - n_eff = list_params$n_eff, + n_threads = params$n_threads, + cplx = params$cplx, + latent = params$latent, + n_eff = params$n_eff, black_box = black_box, - n_shuffles = list_params$n_shuffles, - orientation = list_params$orientation, - ori_proba_ratio = list_params$ori_proba_ratio, - propagation = list_params$propagation, - conf_threshold = list_params$conf_threshold, - verbose = list_params$verbose, + n_shuffles = params$n_shuffles, + orientation = params$orientation, + ori_proba_ratio = params$ori_proba_ratio, + propagation = params$propagation, + conf_threshold = params$conf_threshold, + verbose = params$verbose, is_contextual = state_order$is_contextual, is_consequence = state_order$is_consequence, is_continuous = state_order$var_type, - sample_weights = list_params$sample_weights, - test_mar = list_params$test_mar, - consistent = list_params$consistent, - max_iteration = list_params$max_iteration, - negative_info = list_params$negative_info) + sample_weights = params$sample_weights, + test_mar = params$test_mar, + consistent = params$consistent, + mode = params$mode, + n_layers = non_lagged_state_order$n_layers, + delta_t = non_lagged_state_order$delta_t, + max_iteration = params$max_iteration, + negative_info = params$negative_info) if (res$interrupted) stop("Interupted by user") if (verbose) - cat("\t# -> END reconstruction...\n\t# --------\n") + miic_msg 
("-> End reconstruction...") # - # Post-traitement + # Post-traitment # res$all.edges.summary <- summarizeResults ( observations = input_data, results = res, true_edges = true_edges, state_order = state_order, - consensus_threshold = list_params$consensus_threshold, - ori_consensus_ratio = list_params$ori_consensus_ratio, - latent = (list_params$latent != "no"), - propagation = list_params$propagation, - verbose = list_params$verbose) + consensus_threshold = params$consensus_threshold, + ori_consensus_ratio = params$ori_consensus_ratio, + latent = (params$latent != "no"), + propagation = params$propagation, + verbose = params$verbose) + + res$params = params + if (! (mode %in% MIIC_TEMPORAL_MODES) ) + { + class(res) <- "miic" + res$state_order = state_order + res$black_box = black_box + res$true_edges = true_edges + } + else + { + class(res) <- "tmiic" + # + # clean state_order structure to remove extra columns used internally + # + non_lagged_state_order = non_lagged_state_order[, + colnames(non_lagged_state_order) %in% STATE_ORDER_TEMPORAL_VALID_COLUMNS] + res$state_order = non_lagged_state_order + res$black_box = non_lagged_black_box + res$true_edges = non_lagged_true_edges - res$params = list_params - res$state_order = state_order - res$black_box = black_box - res$true_edges = true_edges - class(res) <- "miic" + state_order = state_order[, + colnames(state_order) %in% STATE_ORDER_TEMPORAL_VALID_COLUMNS] + # + # The output of the reconstruction is the "raw" temporal graph, without + # edges identical by stationarity. To have the "real" temporal graph, + # we duplicate the edges using the stationary assumption and this "real" + # graph is stored the "all.edges.stationarity" data frame. 
+ # + edges_dup_stat = tmiic_repeat_edges_over_history (res) + res$tmiic <- list (lagged_state_order = state_order, + lagged_black_box = black_box, + lagged_true_edges = true_edges, + all.edges.stationarity = edges_dup_stat) + } return(res) -} + } diff --git a/R/miic.plot.R b/R/miic.plot.R index ab19c6f3..8d94cfab 100644 --- a/R/miic.plot.R +++ b/R/miic.plot.R @@ -188,9 +188,6 @@ getIgraph <- function(miic.res, pcor_palette = NULL) { #' \code{\link[igraph]{igraph.plotting}} #' plot.miic = function(x, method = 'igraph', pcor_palette = NULL, ...) { - if (class(x) != "miic"){ - stop("Not a miic object.") - } if (method == 'igraph'){ if (base::requireNamespace("igraph", quietly = TRUE)) { igraph_obj = miic.export (x, 'igraph', pcor_palette = pcor_palette) diff --git a/R/miic.reconstruct.R b/R/miic.reconstruct.R index d64d4719..419dc441 100755 --- a/R/miic.reconstruct.R +++ b/R/miic.reconstruct.R @@ -7,17 +7,20 @@ miic.reconstruct <- function(input_data = NULL, n_eff = -1, cplx = "nml", eta = 1, - latent = "no", + latent = "orientation", n_shuffles = 0, orientation = TRUE, ori_proba_ratio = 1, - propagation = TRUE, + propagation = FALSE, conf_threshold = 0, verbose = FALSE, sample_weights = NULL, test_mar = TRUE, consistent = "no", max_iteration = NULL, + mode = "S", + n_layers = NULL, + delta_t = NULL, negative_info = FALSE ) { n_samples <- nrow(input_data) @@ -71,6 +74,7 @@ miic.reconstruct <- function(input_data = NULL, "ori_proba_ratio" = ori_proba_ratio, "propagation" = propagation, "test_mar" = test_mar, + "mode" = mode, "negative_info" = negative_info, "max_bins" = min(50, n_samples), "var_names" = var_names, @@ -83,23 +87,23 @@ miic.reconstruct <- function(input_data = NULL, black_box[] <- black_box[stats::complete.cases(black_box),] arg_list[["black_box"]] <- as.vector(as.matrix(t(black_box))) } - if (!is.null(sample_weights)) { + if (!is.null(sample_weights)) arg_list[["sample_weights"]] <- sample_weights - } - if (!is.null(is_contextual)) { + if 
(!is.null(is_contextual)) arg_list[["is_contextual"]] <- is_contextual - } - if (!is.null(is_consequence)) { + if (!is.null(is_consequence)) arg_list[["is_consequence"]] <- is_consequence - } + if (!is.null(n_layers)) + arg_list[["n_layers"]] <- n_layers + if (!is.null(delta_t)) + arg_list[["delta_t"]] <- delta_t cpp_input <- list("factor" = input_factor, "double" = input_double, "order" = input_order) # Call C++ function res <- reconstruct(cpp_input, arg_list) - if (res$interrupted) { + if (res$interrupted) return(list(interrupted = TRUE)) - } # R-formalize returned object # table of edges infomation @@ -130,10 +134,9 @@ miic.reconstruct <- function(input_data = NULL, rownames(res$proba_adj_matrix) <- var_names # adj_matrices (when consistent parameter is turned on) - if (length(res$adj_matrices) > 0) { + if (length(res$adj_matrices) > 0) res$adj_matrices <- matrix(unlist(res$adj_matrices), ncol = length(res$adj_matrices)) - } # proba_adj_matrices (when consistent parameter is turned on) if (length(res$proba_adj_matrices) > 0) { diff --git a/R/miic.utils.R b/R/miic.utils.R index 8b5dbeef..bbbf4e1b 100755 --- a/R/miic.utils.R +++ b/R/miic.utils.R @@ -1,13 +1,19 @@ #=============================================================================== # CONSTANTS #=============================================================================== +MIIC_VALID_MODES = c ("S", "TS") +MIIC_TEMPORAL_MODES = c ("TS") + MIIC_VALID_LATENT <- c ("orientation", "yes", "no") MIIC_VALID_CONSISTENT <- c ("no", "orientation", "skeleton") MIIC_CONTINUOUS_TRESHOLD <- 5 -STATE_ORDER_VALID_COLUMS <- c ("var_names", "var_type", - "levels_increasing_order", "is_contextual", "is_consequence") +STATE_ORDER_STANDARD_VALID_COLUMS <- c ("var_names", "var_type", + "levels_increasing_order", "is_contextual", "is_consequence", + "group", "group_color") +STATE_ORDER_TEMPORAL_VALID_COLUMNS = c (STATE_ORDER_STANDARD_VALID_COLUMS, + "n_layers", "delta_t", "movavg") 
#=============================================================================== # FUNCTIONS @@ -32,6 +38,16 @@ list_to_str <- function (list, n_max=NULL) { return (ret) } +#------------------------------------------------------------------------------- +# miic_error +#------------------------------------------------------------------------------- +# Utility function to raise an error and stop +#------------------------------------------------------------------------------- +miic_error <- function (context, ...) + { + stop (paste0 ("Error in ", context, ": ", ...), call.=FALSE) + } + #------------------------------------------------------------------------------- # miic_warning #------------------------------------------------------------------------------- @@ -41,33 +57,78 @@ miic_warning <- function (context, ...) { warning (paste0 ("Warning in ", context, ": ", ...), call.=FALSE) } +#------------------------------------------------------------------------------- +# miic_msg +#------------------------------------------------------------------------------- +# Utility function to display a message +#------------------------------------------------------------------------------- +miic_msg <- function (...) 
+ { + cat (paste0 (..., "\n") ) + } + #------------------------------------------------------------------------------- # Check input data #------------------------------------------------------------------------------- -# input_data: a dataframe with variables as columns and rows as samples +# params: +# - input_data: a dataframe with variables as columns and rows as samples +# - mode : MIIC mode +# return: +# - input_data: the input data dataframe, eventually without full of NAs rows #------------------------------------------------------------------------------- -check_input_data <- function (input_data) { +check_input_data <- function (input_data, mode) + { if ( is.null(input_data) ) - stop ("The input data is required.", call.=FALSE) + miic_error ("input data", "The input data is required.") if ( ! is.data.frame (input_data) ) - stop ("The input data must be a dataframe.", call.=FALSE) - if (nrow (input_data) == 0) - stop ("The input data is empty.", call.=FALSE) - if (ncol (input_data) == 0) - stop ("The input data has no variable.", call.=FALSE) - rows_only_na <- rowSums (is.na (input_data)) == ncol (input_data) - input_data <- input_data [!rows_only_na, ] + miic_error ("input data", "The input data must be a dataframe.") if (nrow (input_data) == 0) - stop ("The input data contains only NAs", call.=FALSE) - if ( any (rows_only_na) ) - miic_warning ("input data", "the input data contains ", sum(rows_only_na), - " row(s) with only NAs. 
These row(s) will be removed.") - - n_unique_vals <- unlist (lapply (input_data, function (x) { - length (unique (x[!is.na(x)] ) ) } ) ) + miic_error ("input data", "The input data is empty.") + if ( (ncol (input_data) == 0) + || ( (mode %in% MIIC_TEMPORAL_MODES) && (ncol (input_data) == 1) ) ) + miic_error ("input data", "The input data has no variable.") + # + # Check variables full of NAs + # + cols_only_na <- colSums (is.na (input_data)) == nrow (input_data) + input_data <- input_data [, !cols_only_na] + if ( (ncol (input_data) == 0) + || ( (mode %in% MIIC_TEMPORAL_MODES) && (ncol (input_data) == 1) ) ) + miic_error ("input data", "The input data contains only NAs") + if ( any (cols_only_na) ) + miic_warning ("input data", "the input data contains ", sum(cols_only_na), + " variables(s) with only NAs. These variables(s) will be removed.") + # + # Check about rows with only NAs (only if mode is nottemporal, as NAs check + # in temporal mode will be done after data lagging) + # + if ( ! (mode %in% MIIC_TEMPORAL_MODES) ) + { + rows_only_na <- rowSums (is.na (input_data)) == ncol (input_data) + input_data <- input_data [!rows_only_na, ] + if ( any (rows_only_na) ) + miic_warning ("input data", "the input data contains ", sum(rows_only_na), + " row(s) with only NAs. 
These row(s) will be removed.") + } + # + # Check constant variables + # + if (mode %in% MIIC_TEMPORAL_MODES) + { + col_names = colnames (input_data[,2:ncol(input_data)]) + n_unique_vals <- unlist (lapply (input_data[,2:ncol(input_data)], + function (x) { length (unique (x[!is.na(x)] ) ) } ) ) + } + else + { + col_names = colnames (input_data) + n_unique_vals <- unlist (lapply (input_data, + function (x) { length (unique (x[!is.na(x)] ) ) } ) ) + } vars_constant = (n_unique_vals <= 1) - if ( any (vars_constant) ) { - msg_str <- list_to_str (colnames (input_data)[vars_constant], n_max=10) + if ( any (vars_constant) ) + { + msg_str <- list_to_str (col_names[vars_constant], n_max=10) if (sum (vars_constant) == 1) miic_warning ("input data", "the variable ", msg_str, " is constant.", " Such variable can not be connected and should be removed.") @@ -75,9 +136,9 @@ check_input_data <- function (input_data) { miic_warning ("input data", sum(vars_constant), " variables (", msg_str, ") are constant. Such variables can not be connected", " and should be removed.") - } + } return (input_data) -} + } #------------------------------------------------------------------------------- # check_state_order @@ -100,9 +161,18 @@ check_input_data <- function (input_data) { # can be NA or the full ordered list of the unique values. 
(default NA) # * is_contextual: 0=not contextual, 1=contextual (default 0) # * is_consequence: 0=not consequence, 1=consequence (default 0) +# additional possible columns in temporal mode are: +# * n_layers: the number of layers in the time unfolded graph +# * delta_t: the number of time steps between layers +# * movavg: if a moving average must be applied on some variables +# NB: is_consequence is not allowed in temporal mode +# - mode: the MIIC mode # Return: the checked and eventually generated or completed state order dataframe #------------------------------------------------------------------------------- -check_state_order <- function (input_data, state_order) { +check_state_order <- function (input_data, state_order, mode) + { + if (mode %in% MIIC_TEMPORAL_MODES) + input_data <- input_data[,2:ncol(input_data)] data_var_names <- colnames (input_data) n_vars <- length (data_var_names) # @@ -110,23 +180,37 @@ check_state_order <- function (input_data, state_order) { # if ( is.null (state_order) ) state_order <- data.frame ("var_names"=data_var_names, stringsAsFactors=F) - if ( ! is.data.frame(state_order) ) { + if ( ! is.data.frame(state_order) ) + { miic_warning ("state order", "the supplied state_order is not a dataframe and will be ignored.") state_order <- data.frame ("var_names"=data_var_names, stringsAsFactors=F) - } - - if ( ! ("var_names" %in% colnames (state_order)) ) { + } + # + # Factors lead to wrong test results + # + factor_cols <- which (unlist (lapply (state_order, is.factor) ) ) + for (i in factor_cols) + state_order[,i] <- as.character (state_order[,i]) + # + # Check content + # + if ( ! 
("var_names" %in% colnames (state_order)) ) + { miic_warning ("state order", "the column var_names is missing,", " the supplied state_order will be ignored.") - state_order <- data.frame ("var_names"=data_var_names) - } + state_order <- data.frame ("var_names"=data_var_names, stringsAsFactors=F) + } # # Check if the state_order columns are valid # - valid_cols <- STATE_ORDER_VALID_COLUMS + if (mode %in% MIIC_TEMPORAL_MODES) + valid_cols <- STATE_ORDER_TEMPORAL_VALID_COLUMNS + else + valid_cols <- STATE_ORDER_STANDARD_VALID_COLUMS mismatch <- is.na (match (colnames (state_order), valid_cols)) - if ( any (mismatch) ) { + if ( any (mismatch) ) + { msg_str <- list_to_str (colnames (state_order)[mismatch], n_max=10) if (sum (mismatch) == 1) miic_warning ("state order", "the column ", msg_str, @@ -135,7 +219,7 @@ check_state_order <- function (input_data, state_order) { miic_warning ("state order", sum (mismatch), " columns (", msg_str, ") are not valid and will be ignored.") state_order <- state_order[, !mismatch, drop=FALSE] - } + } # # We ensure that the var_names column is the first # @@ -147,7 +231,8 @@ check_state_order <- function (input_data, state_order) { # Check variables in state_order not in data # mismatch <- is.na (match (state_order$var_names, data_var_names)) - if ( any (mismatch) ) { + if ( any (mismatch) ) + { msg_str <- list_to_str (state_order$var_names[mismatch], n_max=10) if (sum (mismatch) == 1) miic_warning ("state order", "the variable ", msg_str, @@ -156,11 +241,12 @@ check_state_order <- function (input_data, state_order) { miic_warning ("state order", sum (mismatch), " variables (", msg_str, ") do not match any name in input data and will be ignored.") state_order <- state_order[!mismatch, ] - } + } # # Before checking variables in data not in the state_order # if var_type, is_contextual or is_consequence are present, we flag NA # in these columns as "NA" to be able to display correct warnings later. 
+ # The same applies for the specific columns of the temporal modes. # ( !! this change the column type to character, even if no NA is detected !! ) # if ("var_type" %in% colnames (state_order) ) @@ -169,11 +255,21 @@ check_state_order <- function (input_data, state_order) { state_order$is_contextual[ is.na (state_order$is_contextual) ] <- "NA" if ("is_consequence" %in% colnames (state_order) ) state_order$is_consequence[ is.na (state_order$is_consequence) ] <- "NA" + if (mode %in% MIIC_TEMPORAL_MODES) + { + if ("n_layers" %in% colnames (state_order) ) + state_order$n_layers[ is.na (state_order$n_layers) ] <- "NA" + if ("delta_t" %in% colnames (state_order) ) + state_order$delta_t[ is.na (state_order$delta_t) ] <- "NA" + if ("movavg" %in% colnames (state_order) ) + state_order$movavg[ is.na (state_order$movavg) ] <- "NA" + } # # Check variable in data not in the state_order # not_found <- is.na (match (data_var_names, state_order$var_names)) - if ( any (not_found) ) { + if ( any (not_found) ) + { msg_str <- list_to_str (data_var_names[not_found], n_max=10) if ( sum (not_found) == 1) miic_warning ("state order", "the variables ", msg_str, @@ -189,7 +285,7 @@ check_state_order <- function (input_data, state_order) { na_vals <- rep (NA, ncol(state_order) - 1) for (i in which (not_found)) state_order[nrow(state_order)+1,] <- c(data_var_names[i], na_vals) - } + } # # The state_order rows are ordered as the variables in the data # @@ -203,21 +299,25 @@ check_state_order <- function (input_data, state_order) { n_unique_vals <- unlist (lapply (input_data, function (x) { length (unique (x[!is.na(x)] ) ) } ) ) var_type_specified <- rep (F, n_vars) - if ( ! ("var_type" %in% colnames (state_order) ) ) { + if ( ! 
("var_type" %in% colnames (state_order) ) ) + { state_order$var_type <- as.integer (data_is_num) # # Continuous Variables with less than MIIC_CONTINUOUS_TRESHOLD are # considered as discrete # state_order$var_type [ n_unique_vals < MIIC_CONTINUOUS_TRESHOLD ] = 0 - } else { + } + else + { var_type_specified <- rep (T, n_vars) # # Exclude NAs from the warning (NA = row added because var name missing) # non_valid <- ( ( ! (is.na (state_order$var_type) ) ) & ( ! (state_order$var_type %in% c(0,1)) ) ) - if ( any (non_valid) ) { + if ( any (non_valid) ) + { msg_str <- list_to_str (state_order$var_names[non_valid], n_max=10) if ( sum (non_valid) == 1) miic_warning ("state order", "the variable ", msg_str, @@ -227,15 +327,16 @@ check_state_order <- function (input_data, state_order) { miic_warning ("state order", sum(non_valid), " variables (", msg_str, ") do not have a valid value in the var_type column,", " the invalid values will be ignored and types determined from data.") - } + } # # All non 0 or 1 need to be fixed # non_valid <- ! (state_order$var_type %in% c(0,1)) - if ( any (non_valid) ) { + if ( any (non_valid) ) + { state_order$var_type[non_valid] <- as.integer(data_is_num)[non_valid] var_type_specified[non_valid] <- F - } + } # # Ensure the type of var_type is numerical # (because when looking for NAs present before, the column type has been @@ -246,7 +347,8 @@ check_state_order <- function (input_data, state_order) { # Check var_type against data # pb_continuous <- (state_order$var_type == 1) & (!data_is_num) - if ( any (pb_continuous) ) { + if ( any (pb_continuous) ) + { msg_str <- list_to_str (state_order$var_names[pb_continuous], n_max=10) if ( sum (pb_continuous) == 1) miic_warning ("state order", "the variable ", msg_str, @@ -257,23 +359,31 @@ check_state_order <- function (input_data, state_order) { ") are declared continuous in the var_type column but these variables", " are not numeric. 
These variables will be considered as discrete.") state_order$var_type[pb_continuous] <- 0 + } + # + # In temporal mode, we store if var_type was specified by the user for a + # future use + # + if (mode %in% MIIC_TEMPORAL_MODES) + state_order$var_type_specified <- var_type_specified } - } # # Check the number of unique values versus var_type # - for (i in 1:n_vars) { - if (state_order[i, "var_type"] == 1) { # Continuous - # + for (i in 1:n_vars) + { + if (state_order[i, "var_type"] == 1) # Continuous + { # Less than 3 unique values does not make sense for a continuous variable # - if (n_unique_vals[[i]] <= 2) { + if (n_unique_vals[[i]] <= 2) + { if (var_type_specified[[i]]) miic_warning ("state order", "variable ", data_var_names[[i]], " specified as continuous has only ", n_unique_vals[[i]], " non-NA unique values. It will be processed as discrete.") state_order$var_type[[i]] <- 0 - } + } # # Less than MIIC_CONTINUOUS_TRESHOLD unique variables can be discretized # but may not be truly continuous @@ -282,24 +392,27 @@ check_state_order <- function (input_data, state_order) { miic_warning ("state order", "numerical variable ", data_var_names[[i]], " is treated as continuous but has only ", n_unique_vals[[i]], " non-NA unique values.") - } else { # discrete var + } + else # discrete var + { if ( data_is_num[[i]] && (n_unique_vals[[i]] >= MIIC_CONTINUOUS_TRESHOLD * 2) ) miic_warning ("state order", "numerical variable ", data_var_names[[i]], " is treated as discrete but has ", n_unique_vals[[i]], " levels.") + } } - } # # is_contextual # if ( ! ("is_contextual" %in% colnames (state_order) ) ) state_order$is_contextual <- rep (0, n_vars) - else { - # + else + { # Exclude NAs from the warning (NA = row added because var name missing) # non_valid <- ( ( ! (is.na (state_order$is_contextual) ) ) & ( ! 
(state_order$is_contextual %in% c(0,1)) ) ) - if (any (non_valid)) { + if (any (non_valid)) + { msg_str <- list_to_str (state_order$var_names[non_valid], n_max=10) if (sum (non_valid) == 1) miic_warning ("state order", "the variable ", msg_str, @@ -309,7 +422,7 @@ check_state_order <- function (input_data, state_order) { miic_warning ("state order", sum (non_valid), " variables (", msg_str, ") do not have a valid value in the is_contextual column,", " these variables will be considered as not contextual.") - } + } # # All non 0 or 1 are not valid => set to not contextual # @@ -326,71 +439,103 @@ check_state_order <- function (input_data, state_order) { # Stop if all variables are contextual # if (all (state_order$is_contextual == 1)) - stop (paste0 ("All variables have been defined as contextual in the", - " state_order. No network can be infered with these settings."), call.=FALSE) - } + miic_error ("state order", "All variables have been defined as", + " contextual. No network can be infered with these settings.") + } # # is_consequence # if ( ! ("is_consequence" %in% colnames (state_order) ) ) state_order$is_consequence <- rep (0, n_vars) - else { - # - # Exclude NAs from warnings (NA = row added because var name missing) - # - non_valid <- ( ( ! (is.na (state_order$is_consequence) ) ) - & ( ! 
(state_order$is_consequence %in% c(0,1)) ) ) - if (any (non_valid)) { - msg_str <- list_to_str (state_order$var_names[non_valid], n_max=10) - if (sum (non_valid) == 1) - miic_warning ("state order", "the variable ", msg_str, - " does not have a valid value in the is_consequence column,", - " this variable will be considered as not consequence") - else - miic_warning ("state order", sum (non_valid), " variables (", msg_str, - ") do not have a valid value in the is_consequence column,", - " these variables will be considered as not consequence") + else + { + if (mode %in% MIIC_TEMPORAL_MODES) + { + # Exclude NAs from warnings (NA = row added because var name missing) + # => Look of anything not NA and != 0 + # + conseq_def <- ( ( ! (is.na (state_order$is_consequence) ) ) + & ( is.null (state_order$is_consequence) + | (state_order$is_consequence != 0) ) ) + if (any (conseq_def)) + { + msg_str <- list_to_str (state_order$var_names[conseq_def], n_max=10) + if (sum (conseq_def) == 1) + miic_warning ("state order", "the variable ", msg_str, + " is defined as consequence but consequence prior is not compatible", + " with temporal mode. This variable will be considered as not", + " consequence") + else + miic_warning ("state order", sum (conseq_def), " variables (", msg_str, + ") are defined as consequence but consequence prior is not compatible", + " with temporal mode. These variables will be considered as not", + " consequence") + state_order$is_consequence <- rep (0, n_vars) + } + } + else # Not temporal mode + { + # Exclude NAs from warnings (NA = row added because var name missing) + # + non_valid <- ( ( ! (is.na (state_order$is_consequence) ) ) + & ( ! 
(state_order$is_consequence %in% c(0,1)) ) ) + if (any (non_valid)) + { + msg_str <- list_to_str (state_order$var_names[non_valid], n_max=10) + if (sum (non_valid) == 1) + miic_warning ("state order", "the variable ", msg_str, + " does not have a valid value in the is_consequence column,", + " this variable will be considered as not consequence") + else + miic_warning ("state order", sum (non_valid), " variables (", msg_str, + ") do not have a valid value in the is_consequence column,", + " these variables will be considered as not consequence") + } + # + # All non 0 or 1 are not valid => set to not consequence + # + non_valid <- ! (state_order$is_consequence %in% c(0,1)) + if (any (non_valid)) + state_order$is_consequence[non_valid] <- 0 + # + # Ensure the type of is_consequence is numerical + # (because when looking for NAs present before, the column type has been + # shifted to character. Now, we are sure that we have only O and 1 => as.int + # + state_order$is_consequence = as.integer(state_order$is_consequence) + # + # Stop if all variables are consequences + # + if (all (state_order$is_consequence == 1)) + miic_error ("state order", "All variables have been defined as", + " consequences. No network can be infered with these settings.") + } } - # - # All non 0 or 1 are not valid => set to not consequence - # - non_valid <- ! (state_order$is_consequence %in% c(0,1)) - if (any (non_valid)) - state_order$is_consequence[non_valid] <- 0 - # - # Ensure the type of is_consequence is numerical - # (because when looking for NAs present before, the column type has been - # shifted to character. Now, we are sure that we have only O and 1 => as.int - # - state_order$is_consequence = as.integer(state_order$is_consequence) - # - # Stop if all variables are consequences - # - if (all (state_order$is_consequence == 1)) - stop (paste0 ("All variables have been defined as consequences in the", - " state_order. 
No network can be infered with these settings."), call.=FALSE) - } # # levels_increasing_order # if ( ! ("levels_increasing_order" %in% colnames (state_order) ) ) state_order$levels_increasing_order <- NA - else { - for (i in 1:n_vars) { + else + { + for (i in 1:n_vars) + { order_str <- state_order[i, "levels_increasing_order"] if ( is.na (order_str) ) next - if (order_str == "") { + if (order_str == "") + { state_order[i, "levels_increasing_order"] <- NA next - } - if (state_order[i, "var_type"] == 1) { + } + if (state_order[i, "var_type"] == 1) + { miic_warning ("state order", "variable ", state_order[i, "var_names"], " is considered as a continuous variable,", " the provided levels order will be ignored.") state_order[i, "levels_increasing_order"] <- NA next - } + } # # Discrete var, check the match of unique values in data and values # in levels_increasing_order @@ -403,18 +548,21 @@ check_state_order <- function (input_data, state_order) { # If the values comming from the state_order can not be converted, # leave the value unchanged to display a meaningful warning laterly # - if (is.logical (values)) { + if (is.logical (values)) + { suppressWarnings ( { orders_log <- as.logical(orders) } ) orders[!is.na (orders_log)] <- orders_log[!is.na (orders_log)] - } - else if (is.integer (values)) { + } + else if (is.integer (values)) + { suppressWarnings ( { orders_int <- as.integer(orders) } ) orders[!is.na (orders_int)] <- orders_int[!is.na (orders_int)] - } - else if (is.numeric (values)) { + } + else if (is.numeric (values)) + { suppressWarnings ( { orders_num <- as.numeric(orders) } ) orders[!is.na (orders_num)] <- orders_num[!is.na (orders_num)] - } + } orders <- as.character(orders) values <- as.character (values) # @@ -425,21 +573,24 @@ check_state_order <- function (input_data, state_order) { # by default when using read.table or read.csv, other #NA, N/A, ... # will however be discarded later with a less specific warning # - if ( (! 
("NA" %in% values)) && ("NA" %in% orders) ) { + if ( (! ("NA" %in% values)) && ("NA" %in% orders) ) + { miic_warning ("state order", "variable ", state_order[i, "var_names"], " has a NA value in the provided levels order. NA can not be used to", " order levels and should not be included in the provided levels order.") orders <- orders[ orders != "NA"] - if ( length (orders) == 0 ) { + if ( length (orders) == 0 ) + { state_order[i, "levels_increasing_order"] <- NA next + } } - } # # Check if some provided levels are not in the data # not_in_data <- is.na (match (orders, values) ) - if ( any (not_in_data) ) { + if ( any (not_in_data) ) + { msg_str <- list_to_str (orders[not_in_data], n_max=10) if (sum (not_in_data) == 1) miic_warning ("state order", "variable ", state_order[i, "var_names"], @@ -450,16 +601,18 @@ check_state_order <- function (input_data, state_order) { " has values ", msg_str, " in the provided levels order not present", " in the data. These values will be ignored.") orders <- orders[!not_in_data] - if ( length (orders) == 0 ) { + if ( length (orders) == 0 ) + { state_order[i, "levels_increasing_order"] <- NA next + } } - } # # Check if missing levels compared to data # absent <- is.na (match (values, orders) ) - if ( any (absent) ) { + if ( any (absent) ) + { msg_str <- list_to_str (values[absent], n_max=10) if (sum (absent) == 1) miic_warning ("state order", "variable ", state_order[i, "var_names"], @@ -473,7 +626,7 @@ check_state_order <- function (input_data, state_order) { " The provided levels order for this variable will be ignored.") state_order[i, "levels_increasing_order"] <- NA next - } + } # # If the levels_increasing_order was not turned into NA, # update the levels_increasing_order to have a clean string without @@ -482,14 +635,15 @@ check_state_order <- function (input_data, state_order) { # state_order will be converted as TRUE/FALSE ) # state_order[i, "levels_increasing_order"] <- paste0 (orders, collapse=",") + } } - } # # Cross 
checks : check that no var is both contextual and consequence # ctx_and_csq = state_order$is_contextual + state_order$is_consequence ctx_and_csq = (ctx_and_csq >= 2) - if (any (ctx_and_csq)) { + if (any (ctx_and_csq)) + { msg_str <- list_to_str (state_order$var_names[ctx_and_csq], n_max=10) if (sum (ctx_and_csq) == 1) miic_warning ("state order", "the variable ", msg_str, @@ -501,9 +655,9 @@ check_state_order <- function (input_data, state_order) { " variables will be considered as neither contextual nor consequence.") state_order$is_contextual[ctx_and_csq] = 0 state_order$is_consequence[ctx_and_csq] = 0 - } + } return (state_order) -} + } #------------------------------------------------------------------------------- # check_other_df @@ -512,197 +666,431 @@ check_state_order <- function (input_data, state_order) { # - df: the datafame to check, expected to be a 2 columns dataframe. All values # in the dataframe are expected to be variables names. # An invalid dataframe will be ignored, Invalid rows will be discarded +# - state_order: the dataframe returned by check_state_order # - df_name: the datafame name (i.e. :"black box", "true edges") # This value is used only to display messages +# - mode: the MIIC mode # return: the dataframe checked #------------------------------------------------------------------------------- -check_other_df <- function (input_data, df, df_name) { +check_other_df <- function (input_data, state_order, df, df_name, mode) + { if ( is.null(df) ) return (NULL) # # Basic checks # - if ( ! is.data.frame(df) ) { + if ( ! is.data.frame(df) ) + { miic_warning (df_name, "The ", df_name, " parameter, if provided,", " must be a dataframe. 
The ", df_name, " will be ignored.") return (NULL) - } - - n_cols <- 2 - if (ncol(df) != n_cols) { + } + # + # Factors lead to wrong test results + # + factor_cols <- which (unlist (lapply (df, is.factor) ) ) + for (i in factor_cols) + df[,i] <- as.character (df[,i]) + # + # Check number of cols + # + if (mode %in% MIIC_TEMPORAL_MODES) + { + input_data = input_data[,2:ncol(input_data)] + n_cols <- 3 + } + else + n_cols <- 2 + if (ncol(df) != n_cols) + { miic_warning (df_name, "The expected dataframe must have ", n_cols, " columns but the provided one has ", ncol(df), " and will be ignored.") return (NULL) - } - - if (nrow(df) == 0) { + } + if (nrow(df) == 0) + { miic_warning (df_name, "The provided dataframe is empty.") return (df) - } + } data_var_names <- colnames (input_data) rows_with_warning <- c() - for ( row_idx in 1:nrow(df) ) { - for (col_idx in 1:2) { + for ( row_idx in 1:nrow(df) ) + { + for (col_idx in 1:2) + { one_var_name <- df[row_idx, col_idx] - if (! (one_var_name %in% data_var_names) ) { + if (! (one_var_name %in% data_var_names) ) + { miic_warning (df_name, "The variable ", one_var_name, " is not present in the input data. The row ", row_idx, " will be ignored.") rows_with_warning[[length(rows_with_warning)+1]] <- row_idx + } } - } if ( (length(rows_with_warning) > 0) && (rows_with_warning[[length(rows_with_warning)]] == row_idx) ) next - if (df[row_idx, 1] == df[row_idx, 2]) { + if ( ( ! (mode %in% MIIC_TEMPORAL_MODES) ) + && (df[row_idx, 1] == df[row_idx, 2]) ) + { miic_warning (df_name, "the variables must be different for each row (found ", df[row_idx, 1], " two times at row ", row_idx, "). This row will be ignored.") rows_with_warning[[length(rows_with_warning)+1]] <- row_idx + } } - } rows_ok <- unlist (lapply (1:nrow(df), FUN=function (x) { ! 
(x %in% rows_with_warning) } ) ) df <- df [rows_ok, ] + if (nrow(df) == 0) + { + miic_warning (df_name, "The provided dataframe is empty.") + return (df) + } + # + # In temporal mode, check that the 3rd columns is integer >= 0 (lags) + # + if (mode %in% MIIC_TEMPORAL_MODES) + { + wrong_lags = unlist (lapply (df[,3], FUN=function(x) { + if ( is.null (x) ) # NULL: KO + return (TRUE) + if ( is.na (x) ) # NA: OK for now + return (FALSE) + else if ( is.na ( suppressWarnings (as.numeric(x)) ) ) # Not num: KO + return (TRUE) + else if ( round(as.numeric(x),0) != as.numeric(x) ) # Not int: KO + return (TRUE) + else if ( (as.numeric(x) < 0) ) # <0: KO + return (TRUE) + else + return (FALSE) # OK + } ) ) + if ( any (wrong_lags) ) + { + msg_str <- list_to_str (which(wrong_lags), n_max=10) + if (sum (wrong_lags) == 1) + miic_warning (df_name, "lag is incorrect at row ", msg_str, + ", this line will be ignored.") + else + miic_warning (df_name, "lag is incorrect for multiple rows (", msg_str, + "), these rows will be ignored.") + df <- df [!wrong_lags, ] + } + if (nrow(df) == 0) + { + miic_warning (df_name, "The provided dataframe is empty.") + return (df) + } + # + # Check that contextual lag are NA + # + contextuals = unlist ( apply ( df, MARGIN=1, FUN=function (x) { + orig_idx = which (state_order$var_names == x[[1]]) + dest_idx = which (state_order$var_names == x[[2]]) + return ( (state_order[orig_idx, "is_contextual"] == 1) + || (state_order[dest_idx, "is_contextual"] == 1) ) } ) ) + wrongs_ctx= ( contextuals & ( ! is.na (df[,3]) ) ) # NA: OK for now + if ( any (wrongs_ctx) ) + { + if (sum (wrongs_ctx) == 1) + miic_warning (df_name, "lags for contextual variables must be NA.", + " The line ", df[wrongs_ctx, 1], " - ", df[wrongs_ctx, 2], " lag ", + df[wrongs_ctx, 3], " will be ignored.") + else + miic_warning (df_name, "lags for contextual variables must be NAs. 
", + sum (wrongs_ctx), " wrong line will be ignored.") + } + # + # Check that lag >= 0 if not contextual + # + wrongs_lagged = ( (!contextuals) & is.na (df[,3]) ) + if ( any (wrongs_lagged) ) + { + if (sum (wrongs_lagged) == 1) + miic_warning (df_name, "lag for non contextual variables must be >= 0.", + " The line ", df[wrongs_lagged, 1], " - ", df[wrongs_lagged, 2], " lag ", + df[wrongs_lagged, 3], " will be ignored.") + else + miic_warning (df_name, "lags for non contextual variables must be >= 0. ", + sum (wrongs_lagged), " wrong lines will be ignored.") + } + # + # The self loops need a lag > 0 + # + wrongs_selfs = ( (!contextuals) & (df[,1] == df[,2]) & (df[,3] == 0) ) + if ( any (wrongs_selfs) ) + { + if (sum (wrongs_selfs) == 1) + miic_warning (df_name, "lag for self loops must be > 0.", + " The line ", df[wrongs_selfs, 1], " - ", df[wrongs_selfs, 2], " lag ", + df[wrongs_selfs, 3], " will be ignored.") + else + miic_warning (df_name, "lags for self loops must be > 0. ", + sum (wrongs_selfs), " wrong lines will be ignored.") + } + + df <- df [ (!wrongs_ctx) & (!wrongs_lagged) & (!wrongs_selfs), ] + if (nrow(df) == 0) + { + miic_warning (df_name, "The provided dataframe is empty.") + return (df) + } + } # # Remove duplicate row + # n_rows_sav = nrow(df) + # # Equal rows + # df = unique (df) rownames(df) = NULL + # # Equal rows but with variable names swapped + # rows_kept = rep (T, nrow(df)) for (i in 1:nrow(df)) { if ( ! 
rows_kept[[i]] ) next - dup_inverse = ( (df[,1] == df[i,2]) - & (df[,2] == df[i,1]) - & (rownames(df) != i) ) + if (mode %in% MIIC_TEMPORAL_MODES) + { + # In temporal mode, lag != 0 with variable swapped are not duplicate + # + if ( (!is.na(df[i,3])) && (df[i,3] != 0) ) + next + dup_inverse = ( (df[,1] == df[i,2]) + & (df[,2] == df[i,1]) + & (rownames(df) != i) + & (is.na(df[,3]) | (df[,3] == 0)) ) + } + else + dup_inverse = ( (df[,1] == df[i,2]) + & (df[,2] == df[i,1]) + & (rownames(df) != i) ) rows_kept = rows_kept & (!dup_inverse) } df = df[rows_kept,] - if ( n_rows_sav != nrow(df) ) { + if ( n_rows_sav != nrow(df) ) + { if (n_rows_sav - nrow(df) == 1) miic_warning (df_name, "1 row is duplicated. Only one instance", " of the row will be used.") else miic_warning (df_name, n_rows_sav - nrow(df), " rows are duplicated.", " Only one instance of these rows will be used.") - } + } if (nrow(df) == 0) miic_warning (df_name, "The provided dataframe is empty.") return (df) -} + } #------------------------------------------------------------------------------- -# Check_parameters +# check_param_string #------------------------------------------------------------------------------- -# Check all input parameters that are not df. 
-# Params : input_data + all possible parameters of miic method -# Returns: a list with all the parameters, eventually modified or initialized +# Params : +# - value: the parameter to check +# - name: the name of the parameter +# - list: the possible values +# Returns: the checked parameter, eventually reset to its default value #------------------------------------------------------------------------------- -check_parameters <- function (input_data, n_threads, cplx, - orientation, ori_proba_ratio, ori_consensus_ratio, propagation, latent, - n_eff, n_shuffles, conf_threshold, sample_weights, test_mar, - consistent, max_iteration, consensus_threshold, negative_info, verbose) { - list_ret = list() - if ( is.null (n_threads) || (!is.numeric(n_threads)) - || (round(n_threads,0) != n_threads) || (n_threads < 1) ) { - miic_warning ("parameters", "supplied value ", n_threads, - " for the n_threads parameter is invalid. It must be an integer >= 1.", - " The default value (1) will be used.") - n_threads = 1 +check_param_string <- function (value, name, possibles) + { + if ( is.null(value) + || (length (value) != 1) + || is.na(value) + || (!is.character(value)) + || (!(value %in% possibles)) ) + { + msg_str = paste0 (paste0 ("'", possibles, "'"), collapse=", ") + if ( is.null (value) ) + val_str = "NULL" + else + val_str = list_to_str (value) + miic_warning ("parameters", "supplied value '", val_str, + "' for the ", name, " parameter is invalid. Possible values are: ", + msg_str, ". The default value ('", possibles[[1]], "') will be used.") + value = possibles[[1]] + } + return (value) } - list_ret$n_threads = n_threads - if ( is.null (cplx) - || ( ! is.character(cplx) ) - || ( ! (cplx %in% c("nml", "mdl") ) ) ) { - miic_warning ("parameters", "supplied value ", cplx, - " for the complexity parameter is invalid. 
It must be 'nml' or 'mdl'.", - " The default value ('nml') will be used.") - cplx = "nml" +#------------------------------------------------------------------------------- +# check_param_logical +#------------------------------------------------------------------------------- +# Params : +# - value: the parameter to check +# - name: the name of the parameter +# - default: the default value +# Returns: the checked parameter, eventually reset to its default value +#------------------------------------------------------------------------------- +check_param_logical <- function (value, name, default) + { + if ( is.null (value) + || (length (value) != 1) + || is.na (value) + || (!is.logical(value)) ) + { + if ( is.null (value) ) + val_str = "NULL" + else + val_str = list_to_str (value) + miic_warning ("parameters", "supplied value ", val_str, + " for the ", name, " parameter is invalid. It must be TRUE/FALSE.", + " The default value (", default, ") will be used.") + value = default + } + return (value) + } + +#------------------------------------------------------------------------------- +# test_param_wrong_int +#------------------------------------------------------------------------------- +# Params : +# - value: the parameter to check +# - min: the min value, NA if none +# - max: the max values, NA if none +# Returns: TRUE if the value is not an int or not in the range, FALSE otherwise +#------------------------------------------------------------------------------- +test_param_wrong_int <- function (value, min=NA, max=NA) + { + return ( is.null (value) + || (length (value) != 1) + || is.na (value) + || (!is.numeric(value)) + || (round(value,0) != value) + || ((!is.na (min)) && (value < min)) + || ((!is.na (max)) && (value > max)) ) + } + +#------------------------------------------------------------------------------- +# check_param_int +#------------------------------------------------------------------------------- +# Params : +# - value: the parameter to check 
+# - name: the name of the parameter +# - min_max: a tuple with min and max values. NA if no min and/or no max +# - default: the default value +# Returns: the checked parameter, eventually reset to its default value +#------------------------------------------------------------------------------- +check_param_int <- function (value, name, default, min=NA, max=NA) + { + if ( test_param_wrong_int (value, min, max) ) + { + msg_str = " It must be an integer." + if ( (!is.na(min)) && (!is.na(max)) ) + msg_str = paste0 (" It must be an integer in the range [", + min, ", ", max, "].") + else if ( ! is.na (min) ) + msg_str = paste0 (" It must be an integer >= ", min, ".") + else if ( ! is.na (max) ) + msg_str = paste0 (" It must be an integer <= ", max, ".") + if ( is.null (value) ) + val_str = "NULL" + else + val_str = list_to_str (value) + miic_warning ("parameters", "supplied value ", val_str, + " for the ", name, " parameter is invalid." , msg_str, + " The default value (", default, ") will be used.") + value = default + } + return (value) } - list_ret$cplx = cplx - if ( is.null (orientation) || (!is.logical(orientation)) ) { - miic_warning ("parameters", "supplied value ", orientation, - " for the orientation parameter is invalid. 
It must be TRUE/FALSE.", - " The default value (TRUE) will be used.") - orientation = TRUE +#------------------------------------------------------------------------------- +# test_param_wrong_float +#------------------------------------------------------------------------------- +# Params : +# - value: the parameter to check +# - min: the min value, NA if none +# - max: the max values, NA if none +# Returns: TRUE if the value is not a float or not in the range, FALSE otherwise +#------------------------------------------------------------------------------- +test_param_wrong_float <- function (value, min=NA, max=NA) + { + return ( is.null (value) + || (length (value) != 1) + || is.na (value) + || (!is.numeric(value)) + || ((!is.na (min)) && (value < min)) + || ((!is.na (max)) && (value > max)) ) } - list_ret$orientation = orientation - if ( is.null (ori_proba_ratio) || (!is.numeric(ori_proba_ratio)) - || (ori_proba_ratio < 0) || (ori_proba_ratio > 1) ) { +#------------------------------------------------------------------------------- +# check_parameters +#------------------------------------------------------------------------------- +# Check all input parameters that are not df and not temporal +# Params : +# - input_data: a dataframe with input data +# - all possible parameters of miic method +# Returns: a list with all the parameters, eventually modified or initialized +#------------------------------------------------------------------------------- +check_parameters <- function (input_data, n_threads, cplx, + orientation, ori_proba_ratio, ori_consensus_ratio, propagation, latent, + n_eff, n_shuffles, conf_threshold, sample_weights, test_mar, + consistent, max_iteration, consensus_threshold, + mode, negative_info, verbose) { + list_ret = list ("mode" = mode) + list_ret$n_threads = check_param_int (n_threads, "n_threads", 1, min=1, max=NA) + list_ret$cplx = check_param_string (cplx, "complexity", c("nml", "mdl")) + list_ret$orientation = check_param_logical 
(orientation, "orientation", TRUE) + + if ( test_param_wrong_float (ori_proba_ratio, min=0, max=1) ) + { miic_warning ("parameters", "supplied value ", ori_proba_ratio, " for the orientation probabilty ratio parameter is invalid.", " It must be a floating point between 0 and 1.", " The default value (1) will be used.") ori_proba_ratio = 1 - } + } list_ret$ori_proba_ratio = ori_proba_ratio if ( is.null (ori_consensus_ratio) ) - ori_consensus_ratio = ori_proba_ratio - else if ( (!is.numeric (ori_consensus_ratio)) - || (ori_consensus_ratio < 0) || (ori_consensus_ratio > 1) ) { + ori_consensus_ratio = list_ret$ori_proba_ratio + else if ( test_param_wrong_float (ori_consensus_ratio, min=0, max=1) ) + { miic_warning ("parameters", "supplied value ", ori_consensus_ratio, " for the orientation concensus ratio parameter is invalid.", " It must be a floating point between 0 and 1.", " The default value (same as orientation probabilty ratio: ", ori_proba_ratio, ") will be used.") - ori_consensus_ratio = ori_proba_ratio - } + ori_consensus_ratio = list_ret$ori_proba_ratio + } list_ret$ori_consensus_ratio = ori_consensus_ratio - if ( is.null (propagation) || (!is.logical(propagation)) ) { - miic_warning ("parameters", "supplied value ", propagation, - " for the propagation parameter is invalid. It must be TRUE/FALSE.", - " The default value (FALSE) will be used.") - propagation = FALSE - } - list_ret$propagation = propagation - - if ( is.null(latent) || (!(latent %in% MIIC_VALID_LATENT)) ) { - msg_str = paste (MIIC_VALID_LATENT, collapse=", ") - miic_warning ("parameters", "supplied value ", latent, - " for the latent parameter is invalid. Possible values are: ", msg_str, - ". 
The default value (", MIIC_VALID_LATENT[[1]], ") will be used.") - latent = MIIC_VALID_LATENT[[1]] - } - list_ret$latent = latent + list_ret$propagation = check_param_logical (propagation, "propagation", FALSE) + list_ret$latent = check_param_string (latent, "latent", MIIC_VALID_LATENT) - if ( is.null(n_eff) || (!is.numeric(n_eff)) || (round(n_eff,0) != n_eff) - || ((n_eff <= 0) && (n_eff != -1)) || (n_eff > nrow(input_data)) ) { + if ( test_param_wrong_int (n_eff, min=-1, max=nrow(input_data) ) + || (n_eff == 0) ) + { miic_warning ("parameters", "supplied value ", n_eff, " for the number of effective samples is invalid.", " The number of effective samples must be an integer that can be -1", " for an automatic assignment or a positive number less or", " equal to the number of samples. The default value (-1) will be used.") n_eff = -1 - } + } list_ret$n_eff = n_eff - if ( is.null(n_shuffles) || (!is.numeric(n_shuffles)) - || (round(n_shuffles,0) != n_shuffles) || (n_shuffles < 0) ) { - miic_warning ("parameters", "supplied value ", n_shuffles, - " for the number of shufflings is invalid. 
It must be an integer >= 0.", - " The default value (0) will be used.") - n_shuffles = 0 - } - - if (n_shuffles == 0) { - if ( (!is.null (conf_threshold)) && (conf_threshold != 0) ) - miic_warning ("parameters", "supplied value ", conf_threshold, + n_shuffles = check_param_int (n_shuffles, "number of shufflings", 0, min=0, max=NA) + if (n_shuffles == 0) + { + if ( (length(conf_threshold) > 1) + || ( (!is.null (conf_threshold)) + && (!is.na (conf_threshold)) + && (conf_threshold != 0) ) ) + miic_warning ("parameters", "supplied value ", list_to_str (conf_threshold), " for the confidence threshold parameter will be ignored", " as the number of shufflings is set to 0.", " To activate the confidencence cut, both the number of shufflings", + " and the confidence threshold must be > 0 (i.e.: n_shuffles = 100", " and conf_threshold = 0.01).") conf_threshold = 0 - } else { - if ( is.null (conf_threshold) - || (!is.numeric(conf_threshold)) - || (conf_threshold < 0) ) { + } + else + { + if ( test_param_wrong_float (conf_threshold, min=0, max=NA) ) + { miic_warning ("parameters", "supplied value ", conf_threshold, " for the confidence threshold parameter is invalid.", " When confidence cut is activated (when n_shuffles > 0),", @@ -711,7 +1099,9 @@ check_parameters <- function (input_data, n_threads, cplx, " for the number of shufflings (0) and the confidence threshold (0).") n_shuffles = 0 conf_threshold = 0 - } else if (conf_threshold == 0) { + } + else if (conf_threshold == 0) + { miic_warning ("parameters", "the confidence threshold parameter is 0", " but it must be > 0 when confidence cut is activated", " (when n_shuffles > 0). The confidence cut will be desactivated.", @@ -719,87 +1109,73 @@ check_parameters <- function (input_data, n_threads, cplx, " and the confidence threshold must be > 0 (i.e.: n_shuffles = 100", " and conf_threshold = 0.01).") n_shuffles = 0 + } } - } list_ret$n_shuffles = n_shuffles list_ret$conf_threshold = conf_threshold if ( ( ! 
is.null (sample_weights) ) && ( (length(sample_weights) != nrow(input_data)) + || (any(is.na (sample_weights))) || (any(!is.numeric(sample_weights))) || (any(sample_weights < 0)) - || (any(sample_weights > 1)) ) ) { + || (any(sample_weights > 1)) ) ) + { miic_warning ("parameters", "supplied value for the sample_weights parameter", " is invalid. It must be a vector of the same size as the number of", " samples in the input data and all weights must be floating points", " in the [0,1] range. The parameter will be ignored.") sample_weights = NULL - } + } list_ret$sample_weights = sample_weights - if ( is.null (test_mar) || (!is.logical(test_mar)) ) { - miic_warning ("parameters", "Supplied value ", test_mar, - " for the missing at random test parameter is invalid.", - " It must be TRUE/FALSE. The default value (TRUE) will be used.") - test_mar = TRUE - } - list_ret$test_mar = test_mar - - if ( is.null(consistent) || (!(consistent %in% MIIC_VALID_CONSISTENT)) ) { - msg_str = paste (MIIC_VALID_CONSISTENT, collapse=", ") - miic_warning ("parameters", "supplied value ", consistent, - " for the consistent parameter is invalid. Possible values are: ", msg_str, - ". 
The default value (", MIIC_VALID_CONSISTENT[[1]], ") will be used.") - consistent = MIIC_VALID_CONSISTENT[[1]] - } - list_ret$consistent = consistent + list_ret$test_mar = check_param_logical (test_mar, "missing at random test", TRUE) + list_ret$consistent = check_param_string (consistent, "consistent", MIIC_VALID_CONSISTENT) - if (consistent == "no") { - if ( (!is.null (max_iteration)) && (max_iteration != 100) ) - miic_warning ("parameters", "supplied value ", max_iteration, + if (list_ret$consistent == "no") + { + if ( (length (max_iteration) > 1) + || ( (!is.null (max_iteration)) + && (!is.na (max_iteration)) + && (max_iteration != 100) ) ) + miic_warning ("parameters", "supplied value ", list_to_str(max_iteration), " for the maximum iteration parameter will not be used", " as consistency is off.") max_iteration = 100 - if ( (!is.null (consensus_threshold)) && (consensus_threshold != 0.8) ) - miic_warning ("parameters", "Supplied value ", consensus_threshold, + if ( (length (consensus_threshold) > 1) + || ( (!is.null (consensus_threshold)) + && (!is.na (consensus_threshold)) + && (consensus_threshold != 0.8) ) ) + miic_warning ("parameters", "Supplied value ", list_to_str(consensus_threshold), " for the consensus threshold parameter will not be used", " as consistency is off.") consensus_threshold = 0.8 - } else { # Consistency on - if ( is.null (max_iteration) || (!is.numeric(max_iteration)) - || (round(max_iteration,0) != max_iteration) || (max_iteration <= 0) ) { + } + else # Consistency on + { + if ( test_param_wrong_int (max_iteration, min=1, max=NA) ) + { miic_warning ("parameters", "supplied value ", max_iteration, " for the maximum iteration parameter is invalid.", " It must be a stricly positive integer when consistency is activated.", " The default value (100) will be used.") max_iteration = 100 - } - if ( is.null (consensus_threshold) || (!is.numeric(consensus_threshold)) - || (consensus_threshold < 0.5) || (consensus_threshold > 1) ) { + } + if ( 
test_param_wrong_float (consensus_threshold, min=0.5, max=1) ) + { miic_warning ("parameters", "supplied value ", consensus_threshold, " for the consensus threshold parameter is invalid.", " It must be a floating point between 0.5 and 1 when consistency is", " activated. The default value (0.8) will be used.") consensus_threshold = 0.8 + } } - } list_ret$max_iteration = max_iteration list_ret$consensus_threshold = consensus_threshold - if ( is.null (negative_info) || (!is.logical(negative_info)) ) { - miic_warning ("parameters", "supplied value ", negative_info, " for", - " parameter allowing/disallowing negative shifted mutual information is", - " invalid. It must be TRUE/FALSE. The default value (FALSE) will be used.") - negative_info = FALSE - } - list_ret$negative_info = negative_info + list_ret$negative_info = check_param_logical (negative_info, + "allowing/disallowing negative shifted mutual information", FALSE) + list_ret$verbose = check_param_logical (verbose, "verbose", FALSE) - if ( is.null (verbose) || (!is.logical(verbose)) ) { - miic_warning ("parameters", "supplied value ", verbose, - " for the verbose parameter is invalid. 
It must be TRUE/FALSE.", - " The default value (FALSE) will be used.") - verbose = FALSE - } - list_ret$verbose = verbose return (list_ret) -} + } diff --git a/R/parseResults.R b/R/parseResults.R index 81d74dd4..071236e2 100644 --- a/R/parseResults.R +++ b/R/parseResults.R @@ -18,7 +18,7 @@ summarizeResults <- function(observations = NULL, results = NULL, predicted_edges <- apply(predicted_edges, 2, function(x) { var_names[x] }) # Add to summarized edges list if(length(predicted_edges > 0)) summarized_edges <- predicted_edges - + edges <- results$edges # List of negative edges with non null conditioning set conditioned_edges <- as.matrix(edges[(!is.na(edges$ai.vect)) & @@ -29,7 +29,7 @@ summarizeResults <- function(observations = NULL, results = NULL, # Add to summarized edges list summarized_edges <- rbind(summarized_edges, conditioned_edges) #indep_null_cond_set_edges) - + if (!is.null(true_edges)) { # List of False Negative edges false_negative_edges <- data.frame( @@ -51,7 +51,6 @@ summarizeResults <- function(observations = NULL, results = NULL, ) } - n <- nrow(summarized_edges) summary <- data.frame( x = character(n), y = character(n), type = character(n), ai = character(n), diff --git a/R/tmiic.plot.R b/R/tmiic.plot.R new file mode 100644 index 00000000..acb66542 --- /dev/null +++ b/R/tmiic.plot.R @@ -0,0 +1,1190 @@ +#******************************************************************************* +# Filename : tmiic.plot.R Creation date: 24 march 2020 +# +# Description: Plotting for temporal miic (tmiic) +# +# Author : Franck SIMON +#******************************************************************************* + +#------------------------------------------------------------------------------- +# tmiic.export +#------------------------------------------------------------------------------- +#' Export temporal miic (tmiic) result to different plotting methods +#' +#' @description This function creates an object built from the result returned +#' by 
\code{\link{miic}} executed in temporal mode that is ready to be fed to
+#' different plotting methods.
+#'
+#' @param tmiic_res [a tmiic object]
+#' The object returned by the \code{\link{miic}} execution in temporal mode.
+#'
+#' @param display [a string]. Optional, default value "compact".
+#' Possible values are \emph{"raw"}, \emph{"lagged"}, \emph{"compact"},
+#' \emph{"combine"}, \emph{"unique"}, \emph{"drop"}:
+#' \itemize{
+#' \item When \emph{display} = \emph{"raw"}, the export function will
+#' use the tmiic graph object as is, leading to the return of a lagged
+#' graph.
+#' \item When \emph{display} = \emph{"lagged"}, the export function will
+#' repeat the edges over history assuming stationarity and return a lagged
+#' graph.
+#' \item When \emph{display} = \emph{"compact"}, the default, nodes
+#' and edges are converted into a flattened version to produce a compact
+#' view of the temporal network whilst still presenting all the information
+#' in the export.\cr
+#' i.e.: X_lag1->Y_lag0, X_lag2<-Y_lag0 become respectively X->Y lag=1,
+#' X<-Y lag=2.
+#' \item When \emph{display} = \emph{"combine"}, prior to the export,
+#' a pre-processing will be applied to keep only one edge
+#' per couple of nodes. The info_shifted will be the highest one
+#' of the summarized edges whilst the lag and orientation of the
+#' summarized edge will be an aggregation.\cr
+#' i.e.: X_lag2->Y_lag0, X_lag0<-Y_lag1 will become X<->Y lag=1-2 with
+#' the info_shifted of X_lag2->Y_lag0 if info_shifted of
+#' X_lag2->Y_lag0 > X_lag0<-Y_lag1.
+#' \item When \emph{display} = \emph{"unique"}, prior to the export,
+#' a pre-processing will be applied to keep only the edges having the
+#' highest info_shifted for a couple of nodes.
+#' If several edges between the same nodes have the same
+#' info_shifted, then the edge kept is the one with the minimum lag.\cr
+#' i.e.: X_lag1->Y_lag0, X_lag0<-Y_lag2 with info_shifted of
+#' X_lag1->Y_lag0 > X_lag0<-Y_lag2 become X->Y lag=1.
+#' \item When \emph{display} = \emph{"drop"}, prior to the export,
+#' a pre-processing will be applied to keep only the edges having the
+#' highest info_shifted for a couple of nodes.
+#' If several edges between the same nodes have the same
+#' info_shifted, then the edge kept is the one with the minimum lag.\cr
+#' i.e.: X_lag1->Y_lag0, X_lag0<-Y_lag2 with info_shifted of
+#' X_lag1->Y_lag0 > X_lag0<-Y_lag2 become X->Y.
+#' The lag information is dropped during the preprocessing and
+#' will not be exported.
+#' }
+#'
+#' @param show_self_loops [a boolean] Optional, TRUE by default.
+#' When TRUE, the edges like X_lag0-X_lag1 are exported.
+#' When FALSE, only edges having different nodes are exported.
+#'
+#' @param method A string representing the plotting method.
+#' Currently only "igraph" is supported.
+#'
+#' @param pcor_palette Optional. The color palette used to represent the partial
+#' correlations (the color of the edges). See \code{\link{getIgraph}} for details.
+#'
+#' @return A graph object adapted to the method.
+#' +#' @export +#' +#' @examples +#' \donttest{ +#' library(miic) +#' data(covidCases) +#' # execute MIIC (reconstruct graph in temporal mode) +#' tmiic_res <- miic(input_data = covidCases, mode = "TS", n_layers = 3, delta_t = 1, movavg = 14) +#' +#' # Plot default compact temporal network Using igraph +#' if(require(igraph)) { +#' g = tmiic.export(tmiic_res, method="igraph") +#' plot(g) # Default visualisation, calls igraph::plot.igraph() +#' +#' # Plot raw temporal network Using igraph +#' g = tmiic.export(tmiic_res, display="raw", method="igraph") +#' plot(g) # Default visualisation, calls igraph::plot.igraph() +#' +#' # Plot full temporal network Using igraph +#' g = tmiic.export(tmiic_res, display="lagged", method="igraph") +#' plot(g) # Default visualisation, calls igraph::plot.igraph() +#' +#' # Specifying layout (see ?igraph::layout_) +#' l <- layout_on_grid(g, width = 5, height = 3, dim = 2) +#' plot(g, layout=l) +#' +#' # Override some graphical parameters +#' plot(g, edge.arrow.size = 0.75) +#' plot(g, vertex.shape="none", edge.color="gray85", vertex.label.color="gray10") +#' +#' # For compact graphs, please be aware that the rendering of +#' # igraph::plot.igraph() is not optimal when the graph contains +#' # multiple edges between the same nodes. 
+#' # So the recommend way to plot a compact graph is to use tmiic plotting: +#' plot(tmiic_res) +#' } +#' +#' } +#------------------------------------------------------------------------------- +tmiic.export <- function (tmiic_res, display="compact", show_self_loops=TRUE, + method="igraph", pcor_palette=NULL) + { + if (is.null(tmiic_res$all.edges.summary)) + stop("Error: The inferred network does not exist") + if (is.null(method)) + stop("Error: Plotting method is required") + if (method != "igraph") + stop("Error: Method not supported") + return(tmiic_getIgraph(tmiic_res, display=display, + show_self_loops=show_self_loops, + pcor_palette=pcor_palette)) + } + +#------------------------------------------------------------------------------- +# tmiic_getIgraph +#------------------------------------------------------------------------------- +# Igraph plotting function for tmiic (temporal mode of miic) +# +# This functions returns an igraph object built from the result of the miic +# execution in temporal mode +# +# Edges attributes are passed to the igraph graph and can be accessed with +# e.g. E(g)$partial_correlation. See miic() for more details on edge parameters. +# By default, edges are colored according to the partial correlation between +# two nodes conditioned on the conditioning set (negative is blue, +# null is gray and positive is red) and their width is based on the +# conditional mutual information minus the complexity cost. +# +# params: +# - tmiic_res: a tmiic object, returned by the miic execution in temporal mode +# +# - display: string. Optional, default value "compact". +# Possible values are "raw", "lagged", "compact", "combine", "unique", "drop": +# * "raw": the function will use the tmiic graph object as it, +# leading to the return of a lagged graph. +# * "lagged", the function will use the repeated the edges over history +# assuming stationarity and return a lagged graph. 
+# * "compact", the default, nodes and edges are converted into a flattened
+# version to produce a compact view of the temporal network
+# whilst still presenting all the information.
+# i.e.: X_lag1->Y_lag0, X_lag0<-Y_lag2 become respectively X->Y lag=1,
+# X<-Y lag=2.
+# * "combine", a pre-processing will be applied to keep only one edge
+# per couple of nodes. The info_shifted will be the highest one
+# of the summarized edges whilst the lag and orientation of the
+# summarized edge will be an aggregation.
+# i.e.: X_lag2->Y_lag0, X_lag0<-Y_lag1 will become X<->Y lag=1,2 with
+# the info_shifted of X_lag2->Y_lag0 if info_shifted of
+# X_lag2->Y_lag0 > X_lag0<-Y_lag1.
+# * "unique", a pre-processing will be applied to keep only the edges
+# having the highest info_shifted for a couple of nodes.
+# If several edges between the same nodes have the same
+# info_shifted, then the edge kept is the one with the minimum lag.
+# i.e.: X_lag1->Y_lag0, X_lag0<-Y_lag2 with info_shifted of
+# X_lag1->Y_lag0 > X_lag0<-Y_lag2 become X->Y lag=1.
+# * "drop", prior to the plotting, a pre-processing will be applied
+# to keep only the edges having the highest info_shifted for a couple
+# of nodes.
+# If several edges between the same nodes have the same
+# info_shifted, then the edge kept is the one with the minimum lag.
+# i.e.: X_lag1->Y_lag0, X_lag0<-Y_lag2 with info_shifted of
+# X_lag1->Y_lag0 > X_lag0<-Y_lag2 become X->Y.
+# The lag information is dropped during the preprocessing.
+#
+# - show_self_loops: boolean, optional, TRUE by default.
+# When TRUE, the edges like X_lag0-X_lag1 are included in the iGraph object.
+# When FALSE, only edges having different nodes are present in the iGraph
+# object.
+#
+# - pcor_palette: optional. The color palette used to represent the partial
+# correlations (the color of the edges). See getIgraph for details.
+#
+# returns: an igraph graph object.
+#------------------------------------------------------------------------------- +tmiic_getIgraph <- function (tmiic_res, display="compact", + show_self_loops=TRUE, pcor_palette=NULL) + { + if (display == "lagged") + tmiic_res$all.edges.summary = tmiic_res$tmiic$all.edges.stationarity + else if (display != "raw") + tmiic_res <- tmiic_flatten_network (tmiic_res, flatten_mode=display, + keep_edges_on_same_node=show_self_loops) + + graph <- getIgraph (tmiic_res, pcor_palette=pcor_palette) + + if (display %in% c("raw", "lagged") ) + { + igraph::V(graph)$label.dist = 1 + igraph::V(graph)$label.degree = pi/2 + igraph::E(graph)$curved = TRUE + } + else + { + igraph::E(graph)$curved = FALSE + if ( "lag" %in% colnames(tmiic_res$all.edges.summary) ) + igraph::E(graph)$label <- tmiic_res$all.edges.summary$lag + } + return(graph) + } + +#------------------------------------------------------------------------------- +# tmiic_prepare_edges_for_plotting +#------------------------------------------------------------------------------- +# Prepare the edges for plotting +# +# This function firstly filters the edges in the summary to keep +# only the ones detected by miic and adds to every edge an id constructed +# using the couple of nodes ordered alphabetically ("node1" < "node2") +# +# params: the tmiic object returned by the miic execution in temporal mode, +# eventually flattened +# +# @return tmiic_res [a tmiic object] The modified tmiic object +#----------------------------------------------------------------------------- +tmiic_prepare_edges_for_plotting <- function (tmiic_res) + { + df_edges <- tmiic_res$all.edges.summary[tmiic_res$all.edges.summary$type %in% c('P', 'TP', 'FP'), ] + if (nrow(df_edges) <= 0) + df_edges$xy = character(0) + else + { + # Ensure all edges have an id xy where x < y + # + df_edges$xy = NULL + for (edge_idx in 1:nrow(df_edges)) + { + one_edge <- df_edges[edge_idx,] + if (one_edge$x < one_edge$y) + df_edges[edge_idx, "xy"] <- paste (one_edge$x, 
"-", one_edge$y, sep="") + else + df_edges[edge_idx, "xy"] <- paste (one_edge$y, "-", one_edge$x, sep="") + } + # + # Order the edges so that all orientations goes from x to y + # + for(row in 1:nrow(df_edges)) + { + if(df_edges[row, "infOrt"] == -2) + { + df_edges[row, c("x","y")] = df_edges[row, c("y","x")] + df_edges[row, "infOrt"] = 2 + if(!is.na(df_edges[row, "proba"])) + df_edges[row, "proba"] = paste0(rev( + strsplit(df_edges[row, "proba"], ";")[[1]]), collapse=";") + if(!is.na(df_edges[row, "trueOrt"])) + df_edges[row, "trueOrt"] = 2 + } + } + } + tmiic_res$all.edges.summary <- df_edges + return (tmiic_res) + } + +#------------------------------------------------------------------------------- +# tmiic_get_multiple_edges_for_plotting +#------------------------------------------------------------------------------- +# Look for mutiple edges (that needs specific plotting) +# +# @description This function identifies the couple of nodes having mutiples +# edges +# +# @param [a tmiic graph object] +# The graph object returned by the miic execution in temporal mode and +# flattened (if the tmiic object is not flattened, the function does nothing) +# +# @return df_mult [a dataframe] The dataframe containing the multiple edges +#------------------------------------------------------------------------------- +tmiic_get_multiple_edges_for_plotting <- function (tmiic_res) + { + df_mult <- tmiic_res$all.edges.summary + if (nrow(df_mult) <= 0) + df_mult$count <- numeric(0) + else + { + df_mult$count <- 1 + df_mult <- stats::aggregate(data.frame(count = df_mult$count), + by = list(xy = df_mult$xy), sum) + } + df_mult <- df_mult[df_mult$count > 1,] + return (df_mult) + } + +#------------------------------------------------------------------------------- +# tmiic_compute_row_layout_greedy_base +#------------------------------------------------------------------------------- +# Internal function to compute an optimized raw layout used to construct +# a grid layout for the 
display of raw and lagged graphs +# +# The function starts by choosing two nodes: the node with the +# maximum degree and the one sharing the most of edges with the first. +# Then, the other nodes are placed recurvely using these two nodes. +# If some nodes are still not positioned after the recursion, +# the whole process is done over until all nodes are placed. +# +# @param list_nodes [a list] The list of nodes to be positioned +# @param df_edges [a dataframe] The list of edges, with the edges nodes +# stored in columns x and y and count columns (>1 when edge exists with +# multiple lags between the two nodes) +# +# @return [a list] The list of nodes ordered to avoid crossing edges when +# the network is displayed as raw or lagged graphs +#------------------------------------------------------------------------------- +tmiic_compute_row_layout_greedy_base <- function (list_nodes, df_edges) + { + if (length (list_nodes) <= 0) + return ( list() ) + if (nrow(df_edges) <= 0) + return (list_nodes) + # + # Count nodes degrees + # + df_nodes <- data.frame (nodes=unlist(list_nodes) ) + df_nodes$degree <- 0 + for (i in 1:nrow(df_nodes)) + df_nodes[i,2] <- sum ( (df_edges$x == df_nodes[i,1]) + | (df_edges$y == df_nodes[i,1]) ) + # + # Select first node with the max degree + # + max_degree <- max (df_nodes$degree) + node_chosen <- df_nodes[ (df_nodes$degree == max_degree), 1] + node_chosen <- node_chosen[[1]] + # + # Select node having the maximum number of edges with the max degree one + # + cond_rel_with_chosen <- ( (df_edges$x == node_chosen) + | (df_edges$y == node_chosen) ) + df_rel_with_chosen <- df_edges[cond_rel_with_chosen,] + max_rel_with_chosen <- max (df_rel_with_chosen$count) + edges_max_rel_with_chosen <- df_rel_with_chosen[ df_rel_with_chosen$count == max_rel_with_chosen,] + edge_max_rel_with_chose <- edges_max_rel_with_chosen[1,] + if (edge_max_rel_with_chose$x == node_chosen) + other_node = edge_max_rel_with_chose$y + if (edge_max_rel_with_chose$y == 
node_chosen) + other_node = edge_max_rel_with_chose$x + # + # Remove the two selected nodes from the lists of nodes and edges + # + cond_edge_chosen_other = ( (df_edges$x == node_chosen) & (df_edges$y == other_node) + | (df_edges$y == node_chosen) & (df_edges$x == other_node) ) + df_edges <- df_edges[(!cond_edge_chosen_other),] + + cond_node_chosen_other <- ( (df_nodes$nodes == node_chosen) + | (df_nodes$nodes == other_node) ) + df_nodes <- df_nodes[ (!cond_node_chosen_other), ] + # + # Compute recursively the positions of nodes in regard of the two selected + # + ret <- tmiic_compute_row_layout_greedy_recurs (node_chosen, other_node, + df_nodes$nodes, df_nodes, df_edges) + # + # If some nodes are still not positioned, do separate graph(s) beside + # + while (! is.null (ret$nodes_no_care) ) + { + # + # Remove all edgees havbing their two nodes positioned + # + cond <- ( (df_edges$x %in% ret$nodes_positioned) + & (df_edges$y %in% ret$nodes_positioned) ) + df_edges <- df_edges[ (!cond), ] + # + # Construct a separate layout with remaining edges + # + ret_next <- tmiic_compute_row_layout_greedy_base (ret$nodes_no_care, df_edges) + ret$nodes_positioned <- c(ret$nodes_positioned, ret_next$nodes_positioned) + ret$nodes_no_care <- ret_next$nodes_no_care + } + return (ret) + } + +#------------------------------------------------------------------------------- +# tmiic_compute_row_layout_greedy_recurs +#------------------------------------------------------------------------------- +# Internal recursive function to compute an optimized raw layout used to +# construct a grid layout for the display of raw and lagged graphs +# +# The function starts by using two nodes as separators: +# the other nodes are placed in sets depending on how they are placed +# regarding the two separators: left, center and rigth. 
These sets are
# processed in a recursive way until becoming empty, then the backtrack
# generates the list of nodes representing the raw layout with minimal
# crossing
#
# params:
# - node_left: string, the left separator node
# - node_right: string, the right separator node
# - list_nodes_to_affect: list, the nodes still to be positioned
# - df_nodes: dataframe, the list of nodes with columns nodes and degree
# - df_edges: dataframe, the list of edges with columns x, y containing
#   the nodes and a count column (> 1 when an edge exists with multiple
#   lags between the two nodes)
#
# returns: list with 2 items:
# - nodes_positioned: the nodes ordered to avoid crossing edges
# - nodes_no_care: the nodes not connected to a separator, still unplaced
#-------------------------------------------------------------------------------
tmiic_compute_row_layout_greedy_recurs <- function (node_left, node_right,
                                    list_nodes_to_affect, df_nodes, df_edges)
  {
  #
  # The two separators are considered positioned: drop them from the nodes
  # still to affect, drop the edge joining them and remove them from df_nodes
  #
  is_separator <- ( (list_nodes_to_affect == node_left)
                  | (list_nodes_to_affect == node_right) )
  list_nodes_to_affect <- list_nodes_to_affect[!is_separator]

  is_edge_between_seps <- ( (df_edges$x == node_left) & (df_edges$y == node_right)
                          | (df_edges$y == node_left) & (df_edges$x == node_right) )
  df_edges <- df_edges[!is_edge_between_seps, ]

  df_nodes <- df_nodes[ !( (df_nodes$nodes == node_left)
                         | (df_nodes$nodes == node_right) ), ]
  #
  # Split the remaining nodes depending on their connection to the separators:
  # both (center), left only, right only or none (no care)
  #
  nodes_left <- list()
  nodes_center <- list()
  nodes_right <- list()
  nodes_no_care <- list()
  for (one_node in list_nodes_to_affect)
    {
    on_edge <- (df_edges$x == one_node) | (df_edges$y == one_node)
    linked_left  <- any (on_edge & ( (df_edges$x == node_left)
                                   | (df_edges$y == node_left) ) )
    linked_right <- any (on_edge & ( (df_edges$x == node_right)
                                   | (df_edges$y == node_right) ) )
    if (linked_left && linked_right)
      nodes_center[[length (nodes_center) + 1]] <- one_node
    else if (linked_left)
      nodes_left[[length (nodes_left) + 1]] <- one_node
    else if (linked_right)
      nodes_right[[length (nodes_right) + 1]] <- one_node
    else
      nodes_no_care[[length (nodes_no_care) + 1]] <- one_node
    }
  #
  # End of recursion when no remaining node is connected to a separator
  #
  if (length (nodes_left) + length (nodes_center) + length (nodes_right) == 0)
    return ( list (nodes_positioned=unlist (c(node_left, node_right)),
                   nodes_no_care=unlist (nodes_no_care) ) )
  #
  # Helper: first node having the maximal degree among the candidates
  #
  pick_max_degree_node <- function (candidates, df_nodes)
    {
    sub_nodes <- df_nodes[df_nodes$nodes %in% candidates, ]
    best <- sub_nodes[sub_nodes$degree == max (sub_nodes$degree), 1]
    return (best[[1]])
    }

  placed_left <- list()
  placed_center_left <- list()
  placed_center_right <- list()
  placed_right <- list()
  #
  # Recurse on the left side, its highest degree node as new separator
  #
  if (length (nodes_left) > 0)
    {
    new_sep <- pick_max_degree_node (nodes_left, df_nodes)
    ret <- tmiic_compute_row_layout_greedy_recurs (new_sep, node_left,
                              append (nodes_left, nodes_no_care),
                              df_nodes, df_edges)
    placed_left <- ret$nodes_positioned
    nodes_no_care <- ret$nodes_no_care
    df_nodes <- df_nodes[ !(df_nodes$nodes %in% placed_left), ]
    }
  #
  # Recurse on the center, in two steps around its highest degree node
  #
  if (length (nodes_center) > 0)
    {
    new_sep <- pick_max_degree_node (nodes_center, df_nodes)
    ret <- tmiic_compute_row_layout_greedy_recurs (node_left, new_sep,
                              append (nodes_center, nodes_no_care),
                              df_nodes, df_edges)
    placed_center_left <- ret$nodes_positioned
    df_nodes <- df_nodes[ !(df_nodes$nodes %in% placed_center_left), ]

    ret <- tmiic_compute_row_layout_greedy_recurs (new_sep, node_right,
                              ret$nodes_no_care,
                              df_nodes, df_edges)
    placed_center_right <- ret$nodes_positioned
    nodes_no_care <- ret$nodes_no_care
    df_nodes <- df_nodes[ !(df_nodes$nodes %in% placed_center_right), ]
    }
  #
  # Recurse on the right side, its highest degree node as new separator
  #
  if (length (nodes_right) > 0)
    {
    new_sep <- pick_max_degree_node (nodes_right, df_nodes)
    ret <- tmiic_compute_row_layout_greedy_recurs (node_right, new_sep,
                              append (nodes_right, nodes_no_care),
                              df_nodes, df_edges)
    placed_right <- ret$nodes_positioned
    nodes_no_care <- ret$nodes_no_care
    }
  #
  # Backtrack: concatenate the positioned nodes around the two separators
  #
  all_placed <- c(placed_left, node_left, placed_center_left,
                  placed_center_right, node_right, placed_right)
  return ( list (nodes_positioned=unlist (all_placed),
                 nodes_no_care=unlist (nodes_no_care) ) )
  }

#-------------------------------------------------------------------------------
# tmiic_compute_row_layout_greedy
#-------------------------------------------------------------------------------
# Internal function to compute an optimized grid layout for the display
# of raw and lagged graphs
#
# The function counts edges per couple of nodes whatever their lags are and
# excludes self loops. Then it calls tmiic_compute_row_layout_greedy_base
# with the nodes having at least one edge to compute a layer 0 layout.
# The layout is completed with nodes without edges to produce the final
# layer 0 layout.
#
# param: tmiic_res, the object returned by the miic execution in temporal mode
#
# returns: a list, the position along an axis for each node
#-------------------------------------------------------------------------------
tmiic_compute_row_layout_greedy <- function (tmiic_res)
  {
  all_nodes <- tmiic_res$state_order$var_names
  #
  # Summarize edges over all lags, excluding self loops; ordering each edge
  # so that x < y lets aggregate() group the symmetric edges together
  #
  tmiic_flat <- tmiic_flatten_network (tmiic_res)
  df_edges <- tmiic_flat$all.edges.summary
  df_edges <- df_edges[df_edges$x != df_edges$y, ]
  if (nrow (df_edges) == 0)
    df_edges$count <- integer()
  else
    {
    for (i in seq_len (nrow (df_edges)) )
      {
      one_edge <- df_edges[i, ]
      if (one_edge$x >= one_edge$y)
        df_edges[i, c("x", "y")] <- c(one_edge$y, one_edge$x)
      }
    df_edges$count <- 1
    df_edges <- stats::aggregate (data.frame (count=df_edges$count),
                                  by=list (x=df_edges$x, y=df_edges$y), sum)
    }
  #
  # Separate nodes involved in at least one edge from the isolated ones
  #
  has_edge <- (all_nodes %in% df_edges$x) | (all_nodes %in% df_edges$y)
  nodes_with_edge <- all_nodes[has_edge]
  #
  # Compute the layer 0 layout on connected nodes only
  #
  ret_recurs <- tmiic_compute_row_layout_greedy_base (nodes_with_edge, df_edges)
  placed <- unique (ret_recurs$nodes_positioned)
  #
  # Isolated nodes are appended after the positioned ones
  #
  next_free_row <- length (placed) + 1
  rows <- list()
  for (one_node in all_nodes)
    {
    pos <- which (placed == one_node)
    if (length (pos) == 0)
      {
      rows[[length (rows) + 1]] <- next_free_row
      next_free_row <- next_free_row + 1
      }
    else
      rows[[length (rows) + 1]] <- pos[[1]]
    }
  return ( unlist (rows) )
  }

#-------------------------------------------------------------------------------
# tmiic_compute_row_layout_layers
#-------------------------------------------------------------------------------
# Internal function to precompute a layout suited for the display of raw and
# lagged graphs
#
# This function computes the layout so that the fewer layers
# a node has, the more to the exterior it will be placed.
#
# param: tmiic_res, a tmiic object returned by the execution of miic
# in temporal mode ("raw" graph_type)
#
# returns: a list, the position along an axis for each node
#-------------------------------------------------------------------------------
tmiic_compute_row_layout_layers <- function (tmiic_res)
  {
  n_nodes <- nrow (tmiic_res$state_order)
  layers_back <- tmiic_res$state_order$n_layers - 1
  #
  # Assign rows layer count by layer count: the first half (rounded up) of
  # each group goes to the current top row(s), the rest to the current bottom
  # row(s), so nodes with few layers end up on the exteriors of the grid
  #
  rows_of_nodes <- list()
  next_top <- 1
  next_bottom <- n_nodes
  for (one_count in 0:max (layers_back))
    {
    group_idx <- which (layers_back == one_count)
    if (length (group_idx) == 0)
      next
    n_top <- (length (group_idx) + 1) %/% 2
    n_bottom <- length (group_idx) - n_top
    for (j in seq_len (n_top))
      {
      rows_of_nodes[[ group_idx[[j]] ]] <- next_top
      next_top <- next_top + 1
      }
    # walk the tail of the group backward so the original node order is kept
    if (n_bottom > 0)
      for (j in seq (from=length (group_idx), by=-1, length.out=n_bottom))
        {
        rows_of_nodes[[ group_idx[[j]] ]] <- next_bottom
        next_bottom <- next_bottom - 1
        }
    }
  return ( unlist (rows_of_nodes) )
  }

#-------------------------------------------------------------------------------
# tmiic_compute_row_layout_sugiyama
#-------------------------------------------------------------------------------
# Internal function to precompute a layout suited for the display of raw and
# lagged graphs
# This function computes the layout using the Sugiyama algorithm to
# minimize crossing edges
#
# param: tmiic_res, a tmiic object returned by the execution of miic
# in temporal mode ("raw" graph_type)
#
# returns: a
list, the position along an axis for each node
#-------------------------------------------------------------------------------
tmiic_compute_row_layout_sugiyama <- function (tmiic_res)
  {
  node_names <- tmiic_res$state_order$var_names
  n_nodes <- length (node_names)
  #
  # Summarize edges over all lags, excluding self loops; ordering each edge
  # so that x < y lets aggregate() group the symmetric edges together
  #
  tmiic_flat <- tmiic_flatten_network (tmiic_res)
  df_edges <- tmiic_flat$all.edges.summary
  df_edges <- df_edges[df_edges$x != df_edges$y, ]
  if (nrow (df_edges) == 0)
    df_edges$count <- integer()
  else
    {
    for (i in seq_len (nrow (df_edges)) )
      {
      one_edge <- df_edges[i, ]
      if (one_edge$x > one_edge$y)
        df_edges[i, c("x", "y")] <- c(one_edge$y, one_edge$x)
      }
    df_edges$count <- 1
    df_edges <- stats::aggregate (data.frame (count=df_edges$count),
                                  by=list (x=df_edges$x, y=df_edges$y), sum)
    }
  #
  # Build a dummy single layer graph and let the Sugiyama algorithm place
  # the nodes; the edge counts are used as weights
  #
  g_tmp <- igraph::graph_from_data_frame (df_edges, vertices=node_names)
  ret_sugiyama <- igraph::layout_with_sugiyama (g_tmp,
                        layers=rep (1, n_nodes),
                        weights=df_edges$count,
                        attributes="all")
  #
  # Shift by 1 to get strictly positive positions
  #
  return (ret_sugiyama$layout[, 1] + 1)
  }

#-------------------------------------------------------------------------------
# tmiic_compute_grid_layout
#-------------------------------------------------------------------------------
# Internal function to compute a grid layout to display raw and lagged
# graphs
#
# params:
# - tmiic_res, a tmiic object returned by the miic's execution in temporal mode.
#
# - display: string. optional, default value "raw".
#   Possible values are "raw" and "lagged".
#
# - positioning: string, optional, default:"greedy".
#   The method used to position nodes.
# Possible values are "none", "alphabetical", "layers",
# "greedy" and "sugiyama":
# * When positioning = "none":
#   The nodes are positioned as they appear in the miic result
# * When positioning = "alphabetical":
#   The nodes are positioned alphabetically in ascending order
# * When positioning = "layers":
#   The nodes with the fewest lags will be placed on the exteriors
#   while the nodes having the most lags are in the center
# * When positioning = "greedy":
#   A greedy algorithm will be used to place the nodes in a way minimizing
#   the crossing edges
# * When positioning = "sugiyama":
#   The sugiyama algorithm will be used to place the nodes in a way
#   minimizing the crossing edges
#
# - orientation: character, optional, default:"L".
#   The orientation of the draw.
#   Possible values are landscape:"L" or portrait: "P".
#
# returns: a matrix, the layout to use for drawing
#-------------------------------------------------------------------------------
tmiic_compute_grid_layout <- function (tmiic_res, display="raw",
                                       positioning="greedy", orientation="L")
  {
  if (! display %in% c("raw", "lagged") )
    stop ("Error: Invalid display parameter")
  if (! positioning %in% c("none", "alphabetical", "layers", "greedy", "sugiyama") )
    stop ("Error: Invalid positioning parameter")
  if (! orientation %in% c("L", "P") )
    stop ("Error: Invalid orientation parameter")

  nodes_not_lagged <- tmiic_res$state_order$var_names
  n_nodes <- length (nodes_not_lagged)
  #
  # Row of each node on layer 0; switch() evaluates lazily, so only the
  # requested positioning method is computed
  #
  node_rows <- switch (positioning,
    "none"         = 1:n_nodes,
    "alphabetical" = match (nodes_not_lagged, sort (nodes_not_lagged)),
    "layers"       = tmiic_compute_row_layout_layers (tmiic_res),
    "greedy"       = tmiic_compute_row_layout_greedy (tmiic_res),
    "sugiyama"     = tmiic_compute_row_layout_sugiyama (tmiic_res) )
  if ( is.null (node_rows) )
    stop ("Error: Layout can not be infered")
  #
  # Contextual nodes are placed in an extra column/row when display is "raw":
  # spread them evenly and shift the other nodes to keep a "nice" display
  #
  is_contextual <- tmiic_res$state_order$is_contextual
  if ( (display == "raw") & (sum (is_contextual) > 0) )
    {
    rows_upd <- node_rows
    list_ctx_idx <- which (is_contextual != 0)
    n_ctx <- length (list_ctx_idx)
    # process contextual nodes in the order of their current rows
    ctx_rows_sorted <- sort (node_rows[list_ctx_idx])
    # even spacing between contextual nodes
    max_pos <- n_nodes - n_ctx
    ctx_spacing <- max (1, max_pos / (n_ctx + 1) )
    for (i in 1:n_ctx)
      {
      one_pos <- ctx_rows_sorted[[i]]
      node_idx <- which (node_rows == one_pos)
      rows_upd[node_idx] <- round (i * ctx_spacing, 0)
      # shift the non contextual nodes placed at or after this row
      to_shift <- which ( (is_contextual == 0)
                        & (rows_upd >= one_pos - (i - 1)) )
      rows_upd[to_shift] <- rows_upd[to_shift] - 1
      }
    node_rows <- rows_upd
    }
  #
  # iGraph draws from bottom to top: mirror rows to display top to bottom
  #
  node_rows <- max (node_rows) + 1 - node_rows
  #
  # Columns: contextual nodes get an extra column for the "raw" display,
  # the other layer 0 nodes sit on the rightmost lag column
  #
  layers_back <- tmiic_res$state_order$n_layers - 1
  delta_t <- tmiic_res$state_order$delta_t
  max_lags <- max (layers_back * delta_t)

  df_layout <- data.frame (col=integer(), row=integer() )
  for (i in 1:n_nodes)
    {
    if (is_contextual[[i]])
      {
      if (display == "raw")
        col_display <- max_lags + (max_lags / max (layers_back))
      else
        col_display <- 0
      }
    else
      col_display <- max_lags
    df_layout[i, ] <- c(col_display, node_rows[[i]])
    }
  #
  # Lagged nodes: column derived from the lag (layers back * delta t)
  #
  for (lag_idx in 1:max (layers_back))
    for (node_idx in 1:n_nodes)
      if (lag_idx <= layers_back[[node_idx]])
        df_layout[nrow (df_layout) + 1, ] <- c(max_lags - lag_idx * delta_t[[node_idx]],
                                               node_rows[[node_idx]])
  #
  # Portrait orientation: swap the axes and mirror both of them
  #
  if (orientation == "P")
    {
    df_layout <- df_layout[, c(2, 1)]
    for (j in 1:2)
      df_layout[, j] <- max (df_layout[, j]) + 1 - df_layout[, j]
    }
  return ( as.matrix (df_layout) )
  }

#-------------------------------------------------------------------------------
# plot.tmiic
#-------------------------------------------------------------------------------
#' Basic plot function of a temporal
miic (tmiic) network inference result
#'
#' @description This function calls \code{\link{tmiic.export}} to build a
#' plottable object from the result returned by \code{\link{miic}} in
#' temporal mode and plot it.
#'
#' @details See the documentation of \code{\link{tmiic.export}} for further
#' details.
#'
#' @param x [a tmiic graph object]
#' The graph object returned by \code{\link{miic}} in temporal mode
#'
#' @param display [a string]. Optional, default value "compact".
#' Possible values are \emph{"raw"}, \emph{"lagged"}, \emph{"compact"},
#' \emph{"combine"}, \emph{"unique"}, \emph{"drop"}:
#' \itemize{
#' \item When \emph{display} = \emph{"raw"}, the plot function will
#' use the tmiic graph object as is, leading to the display of a lagged
#' graph. Unless a specific layout is specified, nodes will be positioned
#' on a grid.
#' \item When \emph{display} = \emph{"lagged"}, the function will
#' repeat the edges over history assuming stationarity and plot a lagged
#' graph. Unless a specific layout is specified, nodes will be positioned
#' on a grid.
#' \item When \emph{display} = \emph{"compact"}, the default, nodes
#' and edges are converted into a flattened version to produce a compact
#' view of the temporal network whilst still presenting all the information
#' in the plotting.\cr
#' i.e.: X_lag1->Y_lag0, X_lag2<-Y_lag0 become respectively X->Y lag=1,
#' X<-Y lag=2.
#' \item When \emph{display} = \emph{"combine"}, prior to the plotting,
#' a preprocessing will be applied to keep only one edge
#' per couple of nodes. The info_shifted will be the highest one
#' of the summarized edges whilst the lag and orientation of the
#' summarized edge will be an aggregation.\cr
#' i.e.: X_lag1->Y_lag0, X_lag2<-Y_lag0 will become X<->Y lag=1,2 with
#' the info_shifted of X_lag1->Y_lag0 if info_shifted of
#' X_lag1->Y_lag0 > X_lag2<-Y_lag0.
#' \item When \emph{display} = \emph{"unique"}, prior to the plotting,
#' a preprocessing will be applied to keep only the edges having the
#' highest info_shifted for a couple of nodes.
#' If several edges between the same nodes have the same
#' info_shifted, then the edge kept is the one with the minimum lag.\cr
#' i.e.: X_lag1->Y_lag0, X_lag2<-Y_lag0 with info_shifted of
#' X_lag1->Y_lag0 > X_lag2<-Y_lag0 become X->Y lag=1.
#' \item When \emph{display} = \emph{"drop"}, prior to the plotting,
#' a preprocessing will be applied to keep only the edges having the
#' highest info_shifted for a couple of nodes.
#' If several edges between the same nodes have the same
#' info_shifted, then the edge kept is the one with the minimum lag.\cr
#' i.e.: X_lag1->Y_lag0, X_lag2<-Y_lag0 with info_shifted of
#' X_lag1->Y_lag0 > X_lag2<-Y_lag0 become X->Y.
#' The lag information is dropped during the preprocessing and
#' will not be displayed on the final plotting.
#' }
#'
#' @param show_self_loops [a boolean] Optional, TRUE by default.
#' When TRUE, the edges like X_lag0-X_lag1 are included in the iGraph object.
#' When FALSE, only edges having different nodes are present in the iGraph
#' object.
#'
#' @param positioning_for_grid [a string] Optional, "greedy" by default.
#' Used only when the display is "raw" or "lagged" and no layout is supplied.
#' Possible values are \emph{"none"}, \emph{"alphabetical"}, \emph{"layers"},
#' \emph{"greedy"} and \emph{"sugiyama"}:
#' \itemize{
#' \item When \emph{positioning_for_grid} = \emph{"none"}
#' The nodes are positioned as they appear in the miic result
#' \item When \emph{positioning_for_grid} = \emph{"alphabetical"}
#' The nodes are positioned alphabetically in ascending order
#' \item When \emph{positioning_for_grid} = \emph{"layers"}
#' The nodes with the fewest lags will be placed on the exteriors
#' while the nodes having the most lags are in the center
#' \item When \emph{positioning_for_grid} = \emph{"greedy"}
#' A greedy algorithm will be used to place the nodes in a way minimizing
#' the crossing edges
#' \item When \emph{positioning_for_grid} = \emph{"sugiyama"}
#' The sugiyama algorithm will be used to place the nodes in a way
#' minimizing the crossing edges
#' }
#'
#' @param orientation_for_grid [a string] Optional, "L" by default.
#' Used only when the display is "raw" or "lagged" and no layout is supplied.
#' Indicates the orientation of the draw, possible values are landscape: "L"
#' or portrait: "P".
#'
#' @param method A string representing the plotting method. Default to "igraph".
#' Currently only "igraph" is supported.
#'
#' @param pcor_palette Optional. The color palette used to represent the partial
#' correlations (the color of the edges). See \code{\link{getIgraph}} for details.
#'
#' @param \dots Additional plotting parameters. See the corresponding plot function
#' for the complete list.
#'
#' For igraph, see \code{\link[igraph]{igraph.plotting}}.
#'
#' @export
#'
#' @seealso \code{\link{tmiic.export}} for generic exports,
#' \code{\link[igraph]{igraph.plotting}}
#'
#' @examples
#' \donttest{
#' library(miic)
#'
#' # EXAMPLE COVID CASES (time series demo)
#' data(covidCases)
#' # execute MIIC (reconstruct graph in temporal mode)
#' tmiic_res <- miic(input_data = covidCases, mode = "TS", n_layers = 3, delta_t = 1, movavg = 14)
#'
#' # to plot the default compact graph
#' if(require(igraph)) {
#'   plot(tmiic_res)
#' }
#'
#' # to plot the raw temporal network using igraph
#' if(require(igraph)) {
#'   plot(tmiic_res, display="raw")
#' }
#'
#' # to plot the full temporal network using igraph
#' if(require(igraph)) {
#'   plot(tmiic_res, display="lagged")
#' }
#'
#' }
#-------------------------------------------------------------------------------
plot.tmiic = function(x, display="compact", show_self_loops=TRUE,
                      positioning_for_grid="greedy", orientation_for_grid="L",
                      method = 'igraph', pcor_palette=NULL, ...)
  {
  if (method != 'igraph')
    stop("Error: Method not supported. See ?tmiic.export for supported methods.")
  if ( !base::requireNamespace("igraph", quietly = TRUE) )
    stop("Error: Package 'igraph' is required.")
  if ( is.null (x$adj_matrix) )
    stop ("Error: The learnt graphical model adjacency matrix does not exist")
  #
  # Set a layout if none supplied by user: grid like for raw/lagged,
  # layout_with_kk for flattened displays
  #
  local_layout <- NULL
  if ( ! ( "layout" %in% names(list(...)) ) )
    {
    if (display %in% c("raw", "lagged") )
      local_layout <- tmiic_compute_grid_layout (x, display=display,
                                        positioning=positioning_for_grid,
                                        orientation=orientation_for_grid)
    else
      local_layout <- igraph::layout_with_kk
    }
  #
  # Export the graph to a graphical object
  #
  graph <- tmiic.export (x, display=display, method=method,
                         pcor_palette=pcor_palette)
  #
  # Helper factorizing all the igraph calls below: the layout argument is
  # added only when we computed one (when the user supplied a layout,
  # it is already present in ... and passing layout=NULL would not be
  # equivalent to omitting it)
  #
  plot_igraph_opt_layout <- function (...)
    {
    if ( is.null (local_layout) )
      igraph::plot.igraph (graph, ...)
    else
      igraph::plot.igraph (graph, layout=local_layout, ...)
    }
  #
  # Look if we have cases with multiple edges between two nodes or multiple
  # self loops, because these cases need to be plotted iteratively
  #
  df_mult <- data.frame (count=integer(), stringsAsFactors = FALSE)
  if (! display %in% c("raw", "lagged") )
    {
    x <- tmiic_flatten_network (x, flatten_mode=display,
                                keep_edges_on_same_node=show_self_loops)
    x <- tmiic_prepare_edges_for_plotting (x)
    df_mult <- tmiic_get_multiple_edges_for_plotting (x)
    }

  if (nrow (df_mult) <= 0)
    {
    # No case of multiple edges between the same nodes: draw in one go
    #
    plot_igraph_opt_layout (...)
    }
  else
    {
    # At least one case of multiple edges between the same nodes:
    # draw iteratively.
    # First step: draw all the graph except the multiple edges, which are
    # drawn with the invisible color "#FF000000" and no labels
    #
    df_edges <- x$all.edges.summary
    edges_colors_iter <- igraph::E(graph)$color
    edges_labels_iter <- igraph::E(graph)$label
    is_mult <- df_edges$xy %in% df_mult$xy
    edges_colors_iter[is_mult] <- "#FF000000"
    edges_labels_iter[is_mult] <- NA
    plot_igraph_opt_layout (edge.color=edges_colors_iter,
                            edge.label=edges_labels_iter, ...)
    #
    # Draw each group of multiple edges
    #
    for ( mult_idx in 1:nrow(df_mult) )
      {
      one_mult <- df_mult[mult_idx,]
      nodes_of_mult <- strsplit (one_mult$xy, "-")[[1]]
      is_self_loop <- (nodes_of_mult[[1]] == nodes_of_mult[[2]])
      if (is_self_loop)
        {
        # for self loops, we will go over 2*pi around the node
        #
        step_pos <- 0
        step_inc <- (2 * pi) / one_mult$count
        }
      else if (one_mult$count > 4)
        {
        # more than 4 parallel edges: curve more, from -1 to +1
        #
        step_pos <- -1
        step_inc <- 2.0 / (one_mult$count - 1)
        }
      else
        {
        # curve the parallel edges from -0.5 to +0.5
        #
        step_pos <- -0.5
        step_inc <- 1.0 / (one_mult$count - 1)
        }
      #
      # Draw the multiple edges one by one. To avoid the additive effect
      # when using transparent color for nodes, the color of nodes is NA
      #
      list_to_draw = which (df_edges[, "xy"] == one_mult$xy)
      for (edge_to_draw in list_to_draw)
        {
        # Hide all edges except the one to draw
        #
        edges_colors_iter <- rep ("#FF000000", nrow (df_edges) )
        edges_labels_iter <- rep (NA, nrow (df_edges) )
        edges_colors_iter[[edge_to_draw]] <- igraph::E(graph)[[edge_to_draw]]$color
        edges_labels_iter[[edge_to_draw]] <- igraph::E(graph)[[edge_to_draw]]$label

        if (is_self_loop)
          plot_igraph_opt_layout (add=TRUE, vertex.color=NA,
                                  edge.color=edges_colors_iter,
                                  edge.label=edges_labels_iter,
                                  edge.loop.angle=step_pos, ...)
        else
          {
          # curve the two directions on opposite sides so they don't overlap
          #
          if (df_edges[edge_to_draw,]$x < df_edges[edge_to_draw,]$y)
            edge_curve <- step_pos
          else
            edge_curve <- -step_pos
          plot_igraph_opt_layout (add=TRUE, vertex.color=NA,
                                  edge.color=edges_colors_iter,
                                  edge.label=edges_labels_iter,
                                  edge.curved=edge_curve, ...)
          }
        # Update position for next edge
        #
        step_pos <- step_pos + step_inc
        }
      }
    }
  }

#*******************************************************************************
# Filename   : tmiic.utils.R                     Creation date: 10 may 2023
#
# Description: Utility functions for temporal MIIC
#
# Author     : Franck SIMON
#*******************************************************************************

#-------------------------------------------------------------------------------
# tmiic_check_state_order_part1
#-------------------------------------------------------------------------------
# This function performs the first part checks of the state order columns
# specific to temporal mode: n_layers, delta_t and movavg.
# In most cases, these columns will not be present at this stage,
# as these information will be likely provided as parameters
# (cf tmiic_check_parameters to see how the n_layers, delta_t and movavg
# parameters are moved into the state_order).
# Checks here are basic and cover NULL, integer type and minimal values only.
# NAs are excluded from warnings (NA = row added because var name missing)
#
# Params:
# - state_order: a dataframe, the state order returned by check_state_order
#
# Returns: the state_order dataframe, with the temporal columns validated
# (invalid entries replaced by NA and a warning emitted) and cast to integer
#-------------------------------------------------------------------------------
tmiic_check_state_order_part1 <- function (state_order)
  {
  #
  # Validate one temporal column of the state_order: values must be NA or
  # integers accepted by value_ok(); invalid entries trigger a warning and
  # are replaced by NA, then the whole column is cast to integer.
  # - col_name: string, the column to check (e.g. "n_layers")
  # - label: string, human readable name used in warnings
  # - value_ok: function(numeric) -> logical, domain specific range check
  #
  check_one_column <- function (state_order, col_name, label, value_ok)
    {
    if (! col_name %in% colnames (state_order) )
      return (state_order)
    wrongs <- unlist (lapply (state_order[[col_name]], FUN=function(x) {
      if (is.na (x)) # NA: OK (missing row added before)
        return (FALSE)
      x_num <- suppressWarnings (as.numeric (x))
      if (is.na (x_num)) # Not num: KO
        return (TRUE)
      if (round (x_num, 0) != x_num) # Not int: KO
        return (TRUE)
      return (! value_ok (x_num)) # Out of domain: KO
      } ) )
    if ( any (wrongs) )
      {
      msg_str <- list_to_str (state_order$var_names[wrongs], n_max=10)
      if (sum (wrongs) == 1)
        miic_warning ("state order", "the ", label, " is incorrect for",
          " the variable ", msg_str, ", this value will be ignored.")
      else
        miic_warning ("state order", "the ", label, " are incorrect for",
          " several variables (", msg_str, "), these values will be ignored.")
      state_order[[col_name]][wrongs] <- NA
      }
    state_order[[col_name]] <- as.integer (state_order[[col_name]])
    return (state_order)
    }
  #
  # n_layers: must be >= 1
  #
  state_order <- check_one_column (state_order, "n_layers", "number of layers",
                                   function (x) (x >= 1) )
  #
  # delta_t: must be >= 0 (0 is the value used for contextual variables)
  #
  state_order <- check_one_column (state_order, "delta_t", "delta t",
                                   function (x) (x >= 0) )
  #
  # movavg: must be 0 (no moving average) or >= 2
  #
  state_order <- check_one_column (state_order, "movavg", "moving average",
                                   function (x) ( (x == 0) || (x >= 2) ) )
  return (state_order)
  }

#-------------------------------------------------------------------------------
# tmiic_check_parameters
#-------------------------------------------------------------------------------
# Checks on parameters for temporal mode
#
# As the temporal parameters n_layers, delta_t, movavg need to take different
# values depending on the type (discrete/continuous) or contextual,
# these parameters are moved in the state_order to have a value defined
# for each variable (unless these information are already in the
state_order, +# in such case, the parameters are ignored). +# The other temporal parameters, having one value not tuned per variable +# are added in the list of parameters. +# +# Params: +# - state_order: the dataframe returned by tmiic_check_state_order_part1 +# - params: the list of parameters (used only to add temporal parameters) +# - all possible temporal parameters of miic method +# Returns: a list with 2 items: +# - state_order: the state_order, with temporal parameters eventually added +# - params: the list of parameters with temporal parameters added +#------------------------------------------------------------------------------- +tmiic_check_parameters <- function (state_order, params, + n_layers, delta_t, movavg, keep_max_data, max_nodes) + { + # Check number of layers parameter + # + if ( ! is.null (n_layers) ) + { + if ( test_param_wrong_int (n_layers, min=2, max=NA) ) + { + if ( "n_layers" %in% colnames(state_order) ) + miic_warning ("parameters", "supplied value ", list_to_str (n_layers), + " for the number of layers is invalid,", + " if not NULL, it must be an integer >= 2.", + " This issue has no impact as the number of layers is provided", + " in the state_order.") + else + miic_warning ("parameters", "supplied value ", list_to_str (n_layers), + " for the number of layers is invalid,", + " if not NULL, it must be an integer >= 2.", + " The number of layers will be estimated from the data.") + } + else # valid n_layers + { + if ( ! ("n_layers" %in% colnames(state_order)) ) + { + state_order$n_layers = n_layers + state_order$n_layers[state_order$is_contextual == 1] = 1 + } + else # n_layers in state_order + { + na_in_so = is.na (state_order$n_layers) + if ( any (na_in_so) ) + { + miic_warning ("parameters", "the number of layers is both supplied", + " in the state_order and as a parameter. 
As some values are missing", + " in the state_order, the parameter will be used to fill these", + " missing values.") + state_order$n_layers[na_in_so & (state_order$is_contextual == 0)] = n_layers + } + else + miic_warning ("parameters", "the number of layers is both supplied", + " in the state_order and as parameter. The parameter will be", + " ignored.") + } + } + } + # + # Check delta_t + # + if ( ! is.null (delta_t) ) + { + if ( test_param_wrong_int (delta_t, min=1, max=NA) ) + { + if ( "delta_t" %in% colnames(state_order) ) + miic_warning ("parameters", "supplied value ", list_to_str (delta_t), + " for the delta t parameter is invalid,", + " if not NULL, it must be an integer >= 1.", + " This issue has no impact as the delta t is provided", + " in the state_order.") + else + miic_warning ("parameters", "supplied value ", list_to_str (delta_t), + " for the delta t parameter is invalid,", + " if not NULL, it must be an integer >= 1.", + " The delta t will be estimated from the data.") + } + else # valid delta_t + { + if ( ! ("delta_t" %in% colnames(state_order)) ) + { + state_order$delta_t = delta_t + state_order$delta_t[state_order$is_contextual == 1] = 0 + } + else # delta_t in state_order + { + na_in_so = is.na (state_order$delta_t) + if ( any (na_in_so) ) + { + miic_warning ("parameters", "the delta t is both supplied", + " in the state_order and as a parameter. As some values are missing", + " in the state_order, the parameter will be used to fill these", + " missing values.") + state_order$delta_t[na_in_so & (state_order$is_contextual == 0)] = delta_t + } + else + miic_warning ("parameters", "the delta t is both supplied", + " in the state_order and as a parameter. The parameter will be", + " ignored.") + } + } + } + # + # Check movavg + # + if ( ! 
is.null (movavg) ) + { + if ( test_param_wrong_int (movavg, min=0, max=NA) + || (movavg == 1) ) + { + if ( "movavg" %in% colnames(state_order) ) + miic_warning ("parameters", "supplied value ", list_to_str (movavg), + " for the moving average parameter is invalid,", + " if not NULL or 0, it must be an integer >= 2.", + " This issue has no impact as the moving average is provided", + " in the state_order.") + else + miic_warning ("parameters", "supplied value ", list_to_str (movavg), + " for the moving average parameter is invalid,", + " if not NULL or 0, it must be an integer >= 2.", + " The moving average parameter will be ignored.") + } + else # valid movavg + { + if ( ! ("movavg" %in% colnames(state_order)) ) + { + state_order$movavg = movavg + # No movavg on discrete or contextual vars + state_order$movavg[state_order$var_type == 0] = 0 + state_order$movavg[state_order$is_contextual == 1] = 0 + } + else # movavg in state_order + { + na_in_so = is.na (state_order$movavg) + if ( any (na_in_so) ) + { + miic_warning ("parameters", "the moving average is both supplied", + " in the state_order and as a parameter. As some values are missing", + " in the state_order, the parameter will be used to fill these", + " missing values.") + state_order$movavg[ na_in_so + & (state_order$var_type == 1) + & (state_order$is_contextual == 0)] = movavg + } + else + miic_warning ("parameters", "the moving average is both supplied", + " in the state_order and as a parameter. 
The parameter will be", + " ignored.") + } + } + } + + params$keep_max_data = check_param_logical (keep_max_data, "keep_max_data", FALSE) + params$max_nodes = check_param_int (max_nodes, "maximum number of lagged nodes", + default=50, min=nrow(state_order)+1) + + return (list ("params"=params, "state_order"=state_order)) + } + +#------------------------------------------------------------------------------- +# tmiic_check_state_order_part2 +#------------------------------------------------------------------------------- +# Second part of the check state order for temporal mode. +# This function is designed to be called after the check_parameters_temporal +# function has moved (if needed) the n_layers, delta_t and movavg parameters +# into the state_order. +# This function will try to fill possible missing values and will check/fix +# the temporal settings against the var_type and is_contextual information. +# +# Params : +# - state_order: a dataframe, the state order returned by tmiic_check_parameters +# Returns: a list with 2 items: +# - state_order: the state_order, with temporal parameters eventually modified +#------------------------------------------------------------------------------- +tmiic_check_state_order_part2 <- function (state_order) + { + # Check the n_layers column in the state order. + # + # The check_state_order function has already checked NULL, not integer and <1 + # values, they were turned into NA. So, no need to check here these cases + # + if ( ! 
("n_layers" %in% colnames(state_order)) ) + { + # Add n_layers column with 1 for is_contextual, NA otherwise + # + state_order$n_layers = NA + state_order$n_layers[ state_order$is_contextual == 1] = 1 + } + else + { + # Replace NA values if any + # + na_in_so = is.na (state_order$n_layers) + are_contextual = (state_order$is_contextual == 1) + if ( any (na_in_so & are_contextual) ) + { + # For contextual, replace NAs with 1 + # + msg_str = list_to_str (state_order$var_names[na_in_so & are_contextual], + n_max=10) + miic_warning ("temporal checks", "the missing number of layers have been", + " set to 1 for contextual variables (", msg_str, ").") + state_order$n_layers[na_in_so & are_contextual] = 1 + } + # + # Contextual vars done, look if still NAs on not contextual + # + na_in_so = is.na (state_order$n_layers) + if ( any (na_in_so) ) + { + # For non contextual vars with n_layers equal to NA: + # - if no other var has a n_layers, go for automatic estimate + # - if there is an unique n_layers, apply this value to all + # - if there is multiple n_layers values, stop + # + uniq_vals = unique (state_order$n_layers[(!na_in_so) & (!are_contextual)]) + if (length (uniq_vals) == 0) + { + msg_str = list_to_str (state_order$var_names[na_in_so], n_max=10) + miic_warning ("temporal checks", "the missing number of layers will be ", + " determined from data for variables ", msg_str, ".") + } + else if (length (uniq_vals) > 1) + { + miic_error ("temporal checks", + "some number of layers are missing and they can not be completed", + " automatically as multiple values are already present.") + } + else + { + msg_str = list_to_str (state_order$var_names[na_in_so], n_max=10) + miic_warning ("temporal checks", "the missing number of layers will be ", + " set to ", uniq_vals, " for variables ", msg_str, ".") + state_order$n_layers[na_in_so & (!are_contextual)] = uniq_vals + } + } + # + # Check/fix invalid values: for contextual vars, n_layers must be 1 + # + wrongs = ( ( ! 
is.na (state_order$n_layers) ) + & (state_order$n_layers != 1) + & (state_order$is_contextual == 1) ) + if ( any (wrongs) ) + { + msg_str = list_to_str (state_order$var_names[wrongs], n_max=10) + if (sum (wrongs) == 1) + miic_warning ("temporal checks", "the variable ", msg_str, ", as", + " contextual, has an invalid number of layers. It will be set to 1.") + else + miic_warning ("temporal checks", "several variables (", msg_str, "), as", + " contextual, have an invalid number of layers. They will be set to 1.") + state_order$n_layers[wrongs] = 1 + } + # + # Warning if multiple values of n_layers excluding contextual + # + uniq_vals = unique (state_order$n_layers[ (!is.na (state_order$n_layers)) + & (state_order$is_contextual == 0) ]) + if (length (uniq_vals) > 1) + { + msg_str = list_to_str (uniq_vals) + miic_warning ("temporal checks", "different values (", msg_str, + ") have be defined for the number of layers.", + " Such setting should be avoided unless \"specific\" reason", + " as the result will likely not be accurate.") + } + # + # Stop if all nb layers == 1 + # + if ( (!any (is.na (state_order$n_layers))) + && (all (state_order$n_layers <= 1)) ) + miic_error ("temporal checks", "there must be one variable", + " at least with a number of layers > 1.") + } + # + # Check state order delta_t (idem as n_layers) + # + if ( ! 
("delta_t" %in% colnames(state_order)) ) + { + # Add delta_t column with 0 for contextual, NA otherwise + # + state_order$delta_t = NA + state_order$delta_t[ state_order$is_contextual == 1] = 0 + } + else + { + # Replace NA vals if possible: + # + na_in_so = is.na (state_order$delta_t) + are_contextual = (state_order$is_contextual == 1) + if ( any (na_in_so & are_contextual) ) + { + # For contextual, replace NAs with 0 + # + msg_str = list_to_str (state_order$var_names[na_in_so & are_contextual], + n_max=10) + miic_warning ("temporal checks", "the missing delta t have been", + " set to 0 for contextual variables (", msg_str, ").") + state_order$delta_t[na_in_so & are_contextual] = 0 + } + # + # Contextual vars done, look if still NAs on not contextual + # + na_in_so = is.na (state_order$delta_t) + if ( any (na_in_so) ) + { + # For non contextual vars with delta_t equal to NA: + # - if no other var has a delta_t, go for automatic estimate + # - if there is an unique delta_t, apply this value to all + # - if there is multiple delta_t values, stop + # + uniq_vals = unique (state_order$delta_t[(!na_in_so) & (!are_contextual)]) + if (length (uniq_vals) == 0) + { + msg_str = list_to_str (state_order$var_names[na_in_so], n_max=10) + miic_warning ("state order", "the missing delta t will be ", + " determined from data for variables ", msg_str, ".") + } + else if (length (uniq_vals) > 1) + { + miic_error ("state_order", "the state order contains NAs", + " for the delta t and it can not be completed", + " automatically as multiple values are already present.") + } + else + { + msg_str = list_to_str (state_order$var_names[na_in_so], n_max=10) + miic_warning ("state order", "the missing delta t will be ", + " set to ", uniq_vals, " for variables ", msg_str, ".") + state_order$delta_t[na_in_so & (!are_contextual)] = uniq_vals + } + } + # + # Check/fix invalid values: for contextual vars, delta_t must be 0 + # + wrongs = ( (!is.na (state_order$delta_t)) + & (state_order$delta_t != 
0) + & (state_order$is_contextual == 1) ) + if ( any (wrongs) ) + { + msg_str = list_to_str (state_order$var_names[wrongs], n_max=10) + if (sum (wrongs) == 1) + miic_warning ("temporal checks", "the variable ", msg_str, ", as", + " contextual, has an invalid delta t. It will be set to 0.") + else + miic_warning ("temporal checks", "several variables (", msg_str, "), as", + " contextual, have an invalid delta_t. They will be set to 0.") + state_order$delta_t[wrongs] = 0 + } + # + # Warning if multiple values of delta_t (excluding contextual) are present + # + uniq_vals = unique (state_order$delta_t[ (!is.na (state_order$delta_t)) + & (state_order$is_contextual == 0) ]) + if (length (uniq_vals) > 1) + { + msg_str = list_to_str (uniq_vals) + miic_warning ("temporal checks", "different values (", msg_str, + ") have be defined for the delta t.", + " Such setting should be avoided unless \"specific\" reason", + " as the result will likely not be accurate.") + } + # + # Stop if all delta t == 0 + # + if ( (!any (is.na (state_order$delta_t))) + && (all (state_order$delta_t <= 0)) ) + miic_error ("temporal checks", + "there must be one variable at least with a delta t > 0.") + } + # + # Check state order movavg + # + if ( ! 
("movavg" %in% colnames(state_order)) ) + { + # Add movavg column with 0 for all vars + # + state_order$movavg = 0 + } + else + { + # Replace NA vals by 0 + # + na_in_so = is.na (state_order$movavg) + if ( any (na_in_so) ) + { + msg_str = list_to_str (state_order$var_names[na_in_so], n_max=10) + miic_warning ("state order", "the missing moving average have been", + " set to 0 for variables ", msg_str) + state_order$movavg[na_in_so] = 0 + } + # + # Check/fix invalid values: for discrete vars, no moving average + # + wrongs = ( (state_order$movavg != 0) & (state_order$var_type == 0) ) + if ( any (wrongs) ) + { + msg_str = list_to_str (state_order$var_names[wrongs], n_max=10) + if (sum (wrongs) == 1) + miic_warning ("temporal checks", "a moving average cannot be applied", + " on a discrete variable ", msg_str, ".") + else + miic_warning ("temporal checks", "moving average operations cannot", + " be applied on discrete variables (", msg_str, ").") + state_order$movavg[wrongs] = 0 + } + # + # Check/fix invalid values: for contextual vars, no moving average + # + wrongs = ( (state_order$movavg != 0) & (state_order$is_contextual == 1) ) + if ( any (wrongs) ) + { + msg_str = list_to_str (state_order$var_names[wrongs], n_max=10) + if (sum (wrongs) == 1) + miic_warning ("temporal checks", "a moving average can not be applied", + " on the contextual variable ", msg_str, ".") + else + miic_warning ("temporal checks", "moving average operations can not", + " be applied on contextual variables (", msg_str, ").") + state_order$movavg[wrongs] = 0 + } + # + # Warning if multiple values of moving average excluding discrete and contextual + # + uniq_vals = unique (state_order$movavg[ (!is.na (state_order$movavg)) + & (state_order$var_type == 1) + & (state_order$is_contextual == 0) ]) + if (length (uniq_vals) > 1) + { + msg_str = list_to_str (uniq_vals) + miic_warning ("temporal checks", "different values (", msg_str, + ") have been defined for the moving averages.") + } + } + # + # 
Cross checks + # + if ( ( "n_layers" %in% colnames(state_order) ) + && ( "delta_t" %in% colnames(state_order) ) ) + { + if ( (!any (is.na (state_order$n_layers))) + && (!any (is.na (state_order$delta_t))) ) + { + t_max = state_order$n_layers * state_order$delta_t + t_max = t_max[state_order$is_contextual == 0] + if ( all (t_max < 2) ) + miic_error ("temporal checks", "there must be one variable", + " at least with 2 layers and a delta t >= 1.") + } + } + return (state_order) + } + +#------------------------------------------------------------------------------- +# tmiic_check_after_lagging +#------------------------------------------------------------------------------- +# Check the data and the lagged state order: in the state_order, the var_type +# may need to be re-evaluated after lagging as some numerical lagged variables +# can have less unique values and are no more considered as discrete +# +# Params : +# - lagged_data: a dataframe, the lagged input data +# - lagged_so : a dataframe, the lagged state order +# Returns: +# - a dataframe: the lagged state_order, eventually modified +#------------------------------------------------------------------------------- +tmiic_check_after_lagging <- function (lagged_data, lagged_so) + { + cols_only_na <- colSums (is.na (lagged_data)) == nrow (lagged_data) + if ( any (cols_only_na) ) + { + if ( sum (cols_only_na) == 1) + miic_warning ("lagged data", "the variable ", colnames(lagged_data)[cols_only_na], + " contains only NAs after lagging.") + else + miic_warning ("lagged data", sum(cols_only_na), " variables (", + list_to_str (colnames(lagged_data)[cols_only_na], n_max=10), + ") contains only NAs after lagging.") + } + + n_unique_vals <- unlist (lapply (lagged_data, function (x) { + length (unique (x[!is.na(x)] ) ) } ) ) + for (i in 1:nrow (lagged_so)) + { + if (n_unique_vals[[i]] == 1) + { + miic_warning ("lagged data", "the variable ", lagged_so[i, "var_names"], + " is constant after lagging.") + lagged_so[i, 
"var_type"] = 0 + next + } + if (lagged_so[i, "var_type"] == 0) + next + if (n_unique_vals[[i]] == 2) + { + if ( ("var_type_specified" %in% colnames(lagged_so) ) + && (lagged_so[i, "var_type_specified"]) ) + miic_warning ("lagged data", "the variable ", lagged_so[i, "var_names"], + " was specified as continuous but contains only two values", + " after lagging. It will be considered as discrete.") + lagged_so[i, "var_type"] <- 0 + next + } + if (n_unique_vals[[i]] < MIIC_CONTINUOUS_TRESHOLD) + { + if ( (! ("var_type_specified" %in% colnames(lagged_so) ) ) + || (!lagged_so[i, "var_type_specified"]) ) + lagged_so[i, "var_type"] <- 0 + } + } + lagged_so$var_type_specified <- NULL + return (lagged_so) + } + +#------------------------------------------------------------------------------- +# tmiic_check_other_df_after_lagging +#------------------------------------------------------------------------------- +# Check the optional dataframe true edges or black box after lagging +# +# Params : +# - var_names: a list, the list of lagged variables names +# - lagged_df: a dataframe, the lagged true edges or black box +# - df_name: the dataframe name, "true edges" or "black box" +# Returns: +# - a dataframe: the lagged dataframe, eventually modified +#------------------------------------------------------------------------------- +tmiic_check_other_df_after_lagging <- function (var_names, lagged_df, df_name) + { + all_varnames_in_df = unique (c (lagged_df[,1], lagged_df[,2]) ) + vars_absent = ( ! 
(all_varnames_in_df %in% var_names) ) + if (any (vars_absent)) + { + if (sum (vars_absent) == 1) + miic_warning (df_name, "the variable ", all_varnames_in_df[vars_absent], + " is not present in the lagged data.", + " Row(s) with this variable will be ignored.") + else + miic_warning (df_name, "several variables (", + list_to_str (all_varnames_in_df[vars_absent], n_max=10), + ") are not present in the lagged data.", + " Row(s) with these variables will be ignored.") + } + rows_ok = ( (lagged_df[,1] %in% var_names) + & (lagged_df[,2] %in% var_names) ) + lagged_df = lagged_df[rows_ok, ] + return (lagged_df) + } + +#------------------------------------------------------------------------------- +# tmiic_extract_trajectories +#------------------------------------------------------------------------------- +# Extract the trajectories from a data frame and return them in a list +# - input_data: a data frame with the time steps in the 1st column +# A new trajectory is identified when time step < previous time step +# - check: optional, default=T. Emit warnings when: +# * there is a gap between 2 consecutive time steps +# * the time step value is not incremented between 2 consecutive rows +# * the 1st time step of a trajectory is not 1 +# Returns: +# - a list: the list of trajectories +# Note that the time step information in each trajectory is renumbered from 1 +# to number of time steps of the trajectory (so no gap, no unchanged time step) +#------------------------------------------------------------------------------- +tmiic_extract_trajectories <- function (input_data, check=T) + { + timesteps = input_data[, 1] + if ( any ( is.na (timesteps) ) ) + miic_error ("trajectories check", "the time step column (column 1) contains NA(s)") + if ( ! all (is.numeric (timesteps)) ) + miic_error ("trajectories check", "the time step column (column 1) is not numeric") + if ( ! 
all (round (timesteps, 0) == timesteps) ) + miic_error ("trajectories check", "the time step column (column 1) is not integer") + timesteps = as.integer(timesteps) + timesteps_next = c (timesteps[2:length(timesteps)], 0) + breaks = which (timesteps_next < timesteps) + + list_ts <- list() + row_prev = 1 + for ( i in 1:length (breaks) ) + { + row_new = breaks[[i]] + list_ts[[i]] = input_data[row_prev:row_new,] + row_prev = row_new + 1 + } + + if (check) + { + no_inc = which (timesteps_next == timesteps) + if (length (no_inc) > 0) + miic_warning ("check trajectories", "time step value unchanged at ", + length (no_inc), " position(s)") + gaps = which (timesteps_next > timesteps + 1) + if (length (gaps) > 0) + miic_warning ("check trajectories", "gap in time step values at ", + length (gaps), " position(s)") + wrong_starts = which ( unlist (lapply (list_ts, + FUN=function (x) { return (x[1,1] != 1) } ) ) ) + if (length (wrong_starts) > 0) + miic_warning ("check trajectories", length (wrong_starts), + " trajectorie(s) don't start with 1 as first time step value") + max_nb_ts = max (unlist (lapply (list_ts, FUN=nrow) ) ) + if (max_nb_ts == 1) + miic_error ("trajectories check", + "all trajectories have only 1 time step.") + } + for ( i in 1:length (list_ts) ) + list_ts[[i]][,1] = 1:nrow (list_ts[[i]]) + return (list_ts) + } + +#------------------------------------------------------------------------------- +# tmiic_group_trajectories +#------------------------------------------------------------------------------- +# Merge a list of trajectories into a data frame +# - list_ts: the list of trajectories +# - drop_timestep: boolean, FALSE by default. 
Drop the time step information +# (the 1st column) in the returned data frame +# Returns: +# - a dataframe: data frame with all the trajectories +#------------------------------------------------------------------------------- +tmiic_group_trajectories = function (list_ts, drop_timestep=FALSE) + { + # Pre-allocate the data frame with the same structure as trajectories + # and the same number of rows as all the trajectories + # + df = list_ts[[1]][FALSE,] + n_row_tot = sum (unlist (lapply(list_ts, nrow))) + df <- df[seq_len(n_row_tot),] + rownames(df) <- NULL + + row_idx = 1 + for (i in 1:length(list_ts) ) + { + df[row_idx:(row_idx-1+nrow(list_ts[[i]])),] = list_ts[[i]] + row_idx = row_idx + nrow(list_ts[[i]]) + } + if (drop_timestep) + df = df[,-1] + return (df) + } + +#------------------------------------------------------------------------------- +# tmiic_movavg_onecol +#------------------------------------------------------------------------------- +# Utility function to a apply a moving average over a list +# params: +# - x: the list +# - w: the length of the window +# This moving average is centered, so the first (w-1) %/% 2 and the last +# (w-1) - low_shift items will be filled with NA_real_ +#------------------------------------------------------------------------------- +tmiic_movavg_onecol = function (x, w) + { + low_shift = (w-1) %/% 2 + high_shift = (w-1) - low_shift + ret = rep(-1, length(x)) + ret[1:low_shift] = NA_real_ + + start_idx = low_shift+1 + end_idx = length(x) - high_shift + for (i in start_idx:end_idx) + { + idx_low = i - low_shift + idx_high = i + high_shift + ret[i] <- mean (x[idx_low:idx_high], na.rm=TRUE) + } + ret[(end_idx+1):length(ret)] = NA_real_ + return (ret) + } + +#------------------------------------------------------------------------------- +# tmiic_movavg +#------------------------------------------------------------------------------- +# Apply moving averages on data +# - list_ts: a list of dataframe, each item representing 
a trajectory. +# Each dataframe must contain the time step information in the 1st column +# and the variables in the other columns. +# - movavg: the list of moving average to be applied, optional, NULL by defaut. +# The length of the movavg list is the number of columns of the dataframes - 1 +# (because the 1st column in dataframes is the time step). +# When the movavg item value is >= 2, a moving average using this value as +# window size is applied on the corresponding column: +# movavg item 1 is applied data column 2, moavg item 2 to data column 3, ... +# - keep_max_data: boolean flag, optional, FALSE by default +# When FALSE, the rows containing NA introduced by the moving average(s) +# are deleted, otherwise when TRUE, the rows are kept +# - verbose_level: integer in the range [0,2], 1 by default. The level of +# verbosity: 0 = no display, 1 = summary display, 2 = maximum display. +# Returns: +# - list_ts: the list trajectories with moving averages applied +#------------------------------------------------------------------------------- +tmiic_movavg = function (list_ts, movavg=NULL, keep_max_data=F, verbose_level=0) + { + if ( is.null (movavg) || all (movavg < 2) ) + return (list_ts) + if (verbose_level >= 1) + miic_msg ("Applying moving averages...") + # Apply movavg on each trajectory and variable of the dataset + # + n_vars = ncol(list_ts[[1]])-1 + var_names = colnames (list_ts[[1]])[-1] + for (i in 1:length(list_ts) ) + for (j in 1:n_vars) + if (movavg[[j]] >= 2) + { + # print (paste0 (j, " => movavg = ", movavg[[j]])) + list_ts[[i]][,j+1] = tmiic_movavg_onecol (list_ts[[i]][,j+1], movavg[[j]]) + if (verbose_level == 2) + miic_msg ("- ", var_names[[j]], ": moving average of window size ", + movavg[[j]], " applied") + } + # + # Remove starting and ending rows where NAs were introduced + # + if (!keep_max_data) + { + movavg_max = max(movavg) + low_shift = (movavg_max-1) %/% 2 + high_shift = (movavg_max-1) - low_shift + start_idx = 1 + if (low_shift > 0) + 
start_idx = start_idx + low_shift + i = 1 + for (i in 1:length(list_ts) ) + { + end_idx = nrow(list_ts[[i]]) - high_shift + list_ts[[i]] = list_ts[[i]][start_idx:end_idx,] + } + } + return (list_ts) + } + +#----------------------------------------------------------------------------- +# tmiic_ajust_window_for_nb_samples +#----------------------------------------------------------------------------- +# Reduce the window size (n_layers or delta_t) if the foreseen n_layers and +# delta_t would lead to too few samples after data lagging +# params: +# - list_ts: a list of dataframe, each item representing a trajectory. +# - n_layers: a list, the n_layers in the state_order column +# - delta_t: a list, the delta_t in the state_order column +# - reduced_param: a string, can be "n_layers" or "delta_t". Indicates with +# parameter will be reduced if the number of samples is too small +# - verbose: boolean, if TRUE, display a message if the window size is reduced +# returns: +# - a list: the n_layers or delta_t, depending of the reduced_param value. 
+# The value are possibly decreased to reduce the window size +#----------------------------------------------------------------------------- +tmiic_ajust_window_for_nb_samples <- function (list_ts, n_layers, delta_t, + reduced_param, verbose) + { + tau_per_var = (n_layers - 1) * delta_t + tau_max = max (tau_per_var) + ts_lengths = unlist ( lapply (list_ts, nrow) ) + tot_ts = sum (ts_lengths) + nb_samples = sum ( unlist (lapply (ts_lengths, FUN=function (x) { + max (0, x - tau_max) } ) ) ) + target = min (1000, tot_ts / 10) + + if (nb_samples < target) + { + # Look for the best value to reach the target recursively + # At each iteration, we keep the half part where the best value is + # until we can not divide in half further + # + recurs_eval = function (target, tau_low, tau_high, ts_lengths) + { + if (tau_high - tau_low <= 1) + return (tau_low) + tau = round ( (tau_low + tau_high) / 2, 0) + nb_samples = sum ( unlist (lapply (ts_lengths, FUN=function (x) { + max (0, x - tau) } ) ) ) + if (nb_samples >= target) + tau_ret = recurs_eval (target, tau, tau_high, ts_lengths) + else + tau_ret = recurs_eval (target, tau_low, tau, ts_lengths) + return (tau_ret) + } + tau_red = recurs_eval (target, 1, tau_max, ts_lengths) + # + # Max time steps back in time found, try to reduce n_layers or delta_t + # + if (reduced_param == "n_layers") + n_layers[tau_per_var > tau_red] = max ( 2, + floor (tau_red / delta_t[tau_per_var > tau_red]) + 1) + else # reduce delta_t + delta_t[tau_per_var > tau_red] = max ( 1, + floor (tau_red / (n_layers[tau_per_var > tau_red] - 1)) ) + # + # Check the effect of reduction and feed back to user + # + if (reduced_param == "n_layers") + fixed_param = "delta_t" + else + fixed_param = "n_layers" + tau_max_red = max ( (n_layers - 1) * delta_t ) + nb_samples_red = sum ( unlist (lapply (ts_lengths, FUN=function (x) { + max (0, x - tau_max_red) } ) ) ) + if (nb_samples_red <= 0) + miic_error ("temporal parameters estimation", + "with the values supplied in 
", fixed_param, + ", no valid ", reduced_param, " can be estimated.") + else if ( (nb_samples_red < target) && (nb_samples_red == nb_samples) ) + miic_warning ("temporal parameters estimation", + "with the estimated or supplied temporal parameters", + ", the number of usable samples will be ", nb_samples, + ". Consider to specify manually n_layers and delta_t.") + else if (nb_samples_red < target) + miic_warning ("temporal parameters estimation", + "the ", reduced_param, " parameter has been reduced", + " to increase the number of samples. However,", + " the number of usable samples will still only be ", nb_samples_red, + ". Consider to specify manually n_layers and delta_t.") + else if (verbose) + miic_msg ("- The ", reduced_param, " parameter has been reduced ", + " to increase the number of samples.") + } + + if (reduced_param == "n_layers") + return (n_layers) + else + return (delta_t) + } + +#------------------------------------------------------------------------------- +# tmiic_estimate_dynamic +#------------------------------------------------------------------------------- +# Estimate tau (the number of total time steps back to cover the dynamic, +# the number of layers and delta t parameters from the data +# - list_ts: list of dataframe, each item representing a trajectory. +# Each dataframe must contain the time step information in the 1st column +# and the variables in the other columns. +# - state_order: the state_order dataframe. This state_order is expected +# having being checked by the temporal check functions of inputs. +# The rows in the state_order must be ordered as the columns in the data. +# It must contain the var_type, is_contextual, n_layers and delta_t columns. +# There can be NAs in n_layers and delta_t for continuous and non contextual +# variables. +# - max_nodes: maximum number of nodes in the inferred time unfolded graph, +# optional, 50 by default +# - verbose_level: integer in the range [0,2], 1 by default. 
The level of +# verbosity: 0 = no display, 1 = summary display, 2 = maximum display. +#----------------------------------------------------------------------------- +tmiic_estimate_dynamic <- function (list_ts, state_order, max_nodes=50, + verbose_level=1) + { + # + # If n_layers and delta_t all defined, nothing to do + # + if ( (! any (is.na (state_order$n_layers) ) ) + && (! any (is.na (state_order$delta_t ) ) ) ) + return (state_order) + # + # We are going to estimate to temporal dynamic because we need to fill out + # the missing values in n_layers and/or delta_t. + # + # After the checks done on the state order, we know that + # the missing values are only for continuous and non contextual. + # In addition, we know that all values for continuous and non contextual + # are NAs (otherwise the checks would have completed the NAs by + # generalizing the known values) + # + if (verbose_level == 2) + miic_msg ("Estimating the temporal dynamic...") + n_ts = length (list_ts) + n_vars_tot = ncol (list_ts[[1]]) - 1 + n_vars_ctx = sum (state_order$is_contextual) + n_vars_lag = n_vars_tot - n_vars_ctx + # + # Remove time step, contextual and discrete variables + # + for (ts_idx in 1:n_ts) + { + if (nrow (list_ts[[ts_idx]]) == 1) + miic_warning ("Dynamic estimation", "trajectory ", ts_idx, + " with only 1 time step is ignored for dynamic estimation") + list_ts[[ts_idx]] = list_ts[[ts_idx]][, c(F, ( (!state_order$is_contextual) + & (state_order$var_type) ) )] + } + # + # Compute mean alpha per variable + # + n_vars = ncol (list_ts[[1]]) + var_names = colnames(list_ts[[1]]) + length_to_test = min (unlist (lapply (list_ts, FUN=function (x) { + ifelse ( nrow(x) <= 1, NA, nrow(x) ) } ) ), na.rm=T) + alphas_per_var = rep (NA, n_vars) + taus_per_var = rep (NA, n_vars) + var_idx = 1 + for (var_idx in 1:n_vars) + { + alphas_per_ts = rep (NA, n_ts) + ts_idx = 1 + for (ts_idx in 1:n_ts) + { + if (nrow (list_ts[[ts_idx]]) == 1) + next + acf_res = stats::acf 
(list_ts[[ts_idx]][,(var_idx)], + na.action=stats::na.pass, lag.max=length_to_test-1, plot=F) + if ( all (is.na(acf_res$acf) ) ) + next + acf_vanish = which (acf_res$acf[,1,1] < 0.05) + if ( length (acf_vanish) == 0 ) + acf_vanish = length_to_test + lag_vanish = acf_res$lag[min (acf_vanish), 1, 1] + lag_4_alpha = max ( 1, round (lag_vanish / 2) ) + alphas_per_ts[[ts_idx]] = acf_res$acf[lag_4_alpha+1,1,1] ^ (1/lag_4_alpha) + } + alphas_per_var[[var_idx]] = mean (alphas_per_ts, na.rm=T) + taus_per_var[[var_idx]] = round ( (1+alphas_per_var[[var_idx]]) + / (1-alphas_per_var[[var_idx]]) ) + } + if (verbose_level == 2) + { + miic_msg ("Tau per variable:") + for (i in 1:length (var_names)) + miic_msg ("- ", var_names[[i]], ": ", taus_per_var[[i]]) + } + # + # Compute alphas range and deduce taus range + # + tau_min = max ( 1, min (taus_per_var, na.rm=T) ) + tau_mean = max ( 1, round (mean (taus_per_var, na.rm=T), 0) ) + tau_max = min ( length_to_test, max (taus_per_var, na.rm=T) ) + tau_max_kept = min (length_to_test, tau_max, tau_mean * 2) + tau = tau_max_kept + if (verbose_level >= 1) + miic_msg ("Automatic estimation of parameters:\n", + "- Relaxation times goes from ", tau_min, " to ", tau_max, + " with a mean of ", tau_mean, " => tau max considered = ", tau_max_kept) + # + # We know tau : the average maximum time steps back in time to use for the + # temporal discovery. 
Now estimate the number of layers 'n_layers' + # and/or number of time steps between two layers 'delta_t' + # + if ( all (!is.na (state_order$n_layers)) ) # n_layers known => NAs in delta_t + { + state_order$delta_t[is.na(state_order$delta_t)] = max ( 1, + ceiling (tau / (state_order$n_layers[is.na(state_order$delta_t)] - 1)) ) + + state_order$delta_t = tmiic_ajust_window_for_nb_samples (list_ts, + state_order$n_layers, state_order$delta_t, reduced_param="delta_t", + verbose=(verbose_level >= 1) ) + + uniq_n_layers = unique (state_order$n_layers[ (state_order$is_contextual == 0) + & (state_order$var_type == 1)] ) + uniq_delta_t = unique (state_order$delta_t[ (state_order$is_contextual == 0) + & (state_order$var_type == 1)] ) + if (verbose_level >= 1) + { + if (length (uniq_n_layers) == 1) + miic_msg ("- As the number of layers was defined to ", uniq_n_layers, + ", the only parameter tuned is the delta t set to ", uniq_delta_t, ".") + else + miic_msg ("- As multiple values of layers were present (", + list_to_str (uniq_n_layers), "), the delta t have been set,", + " respectively to ", list_to_str (uniq_delta_t), ".") + } + } + else if ( all (!is.na (state_order$delta_t)) ) # delta_t known => NAs in n_layers + { + # To determine the layers, we compute the max number of layers considering + # the maximum number of nodes in the final graph + # + n_layers_max = max (2, floor ( (max_nodes - n_vars_ctx) / n_vars_lag ) ) + # + # The final number of layers will be (tau / delta_t) + 1 unless greater + # than the max number of layers + # + state_order$n_layers[is.na(state_order$n_layers)] = min (n_layers_max, + ceiling (tau / state_order$delta_t[is.na(state_order$n_layers)]) + 1) + + state_order$n_layers = tmiic_ajust_window_for_nb_samples (list_ts, + state_order$n_layers, state_order$delta_t, reduced_param="n_layers", + verbose=(verbose_level >= 1) ) + + uniq_n_layers = unique (state_order$n_layers[ (state_order$is_contextual == 0) + & (state_order$var_type == 1)] ) + 
uniq_delta_t = unique (state_order$delta_t[ (state_order$is_contextual == 0) + & (state_order$var_type == 1)] ) + if (verbose_level >= 1) + { + if (length (uniq_delta_t) == 1) + miic_msg ("- As the value of delta t was defined to ", uniq_delta_t, + ", the only parameter tuned is the number of layers set to ", + uniq_n_layers, ".") + else + miic_msg ("- As multiple values of delta t were present (", + list_to_str (uniq_delta_t), "), the number of layers have been set,", + " respectively to ", list_to_str (uniq_n_layers), ".") + } + } + else + { + # Both n_layers and delta_t need to be estimated automatically + # + delta_t = 1 + if ( (tau + 1) * n_vars_lag + n_vars_ctx <= max_nodes) + { + # If, when using delta_t = 1, the n_layers (= tau + 1) does not lead to + # a graph with a total number of nodes > max => OK, nothing more to do + # + n_layers = tau + 1 + } + else + { + # We need to reduce the number of layers to respect the maximum nodes number + # and increase the delta t to still cover all the dynamic tau. 
+ # => Compute the max number of layers and deduce the delta t + # + n_layers = max (2, floor ( (max_nodes - n_vars_ctx) / n_vars_lag ) ) + if (n_layers > 2) + { + delta_t = max (1, ceiling ( tau / (n_layers-1) ) ) + tau = (n_layers - 1) * delta_t + } + else + delta_t = tau + } + + state_order$n_layers[is.na(state_order$n_layers)] = n_layers + state_order$delta_t[is.na(state_order$delta_t)] = delta_t + + state_order$delta_t = tmiic_ajust_window_for_nb_samples (list_ts, + state_order$n_layers, state_order$delta_t, reduced_param="delta_t", + verbose=(verbose_level >= 1) ) + delta_t = unique (state_order$delta_t[ (state_order$var_type == 1) + & (state_order$is_contextual == 0) ]) + + if (verbose_level >= 1) + miic_msg ("- For a final graph with a target of ", max_nodes, + " nodes having ", n_vars_lag, " lagged variables", + ifelse (n_vars_ctx > 0, paste0 ("\n and ", n_vars_ctx, " contextual variables"), ""), + ":\n ", n_layers, " layers spaced by ", delta_t, " time steps", + ", dynamic covered goes over t, t-", delta_t, + ifelse (n_layers > 3, ", ...", ""), + ifelse (n_layers > 2, paste0 (", t-", tau), "") ) + } + + return (state_order) + } + +#------------------------------------------------------------------------------- +# estimateTemporalDynamic +#------------------------------------------------------------------------------- +#' Estimation of the temporal causal discovery parameters +#' +#' @description This function estimates the number of layers and number of +#' time steps between each layer that are needed to cover the dynamic of a +#' temporal dataset when reconstructing a temporal causal graph. +#' Using autocorrelation decay, the function computes the average relaxation +#' time of the variables and, in regard of a maximum number of nodes, deduces +#' the number of layers and number of time steps between each layer to be used. 
+#' +#' @param input_data [a data frame] +#' A data frame containing the observational data.\cr +#' The expected data frame layout is variables as columns and +#' time series/time steps as rows. +#' The time step information must be supplied in the first column and, +#' for each time series, be consecutive and in ascending order (increment of 1). +#' Multiple trajectories can be provided, the function will consider that a +#' new trajectory starts each time a smaller time step than the one of the +#' previous row is encountered. +#' +#' @param state_order [a data frame] An optional data frame providing extra +#' information about variables. It must have d rows where d is the number of +#' input variables, excluding the time step one.\cr +#' For optional columns, if they are not provided or contain missing +#' values, default values suitable for \emph{input_data} will be used. +#' +#' The following structure (named columns) is expected:\cr +#' +#' "var_names" (required) contains the name of each variable as specified +#' by colnames(input_data), excluding the time steps column. +#' +#' "var_type" (optional) contains a binary value that specifies if each +#' variable is to be considered as discrete (0) or continuous (1). +#' Discrete variables will be excluded from the temporal dynamic estimation. +#' +#' "is_contextual" (optional) contains a binary value that specifies if a +#' variable is to be considered as a contextual variable (1) or not (0). +#' Contextual variables will be excluded from the temporal dynamic estimation. +#' +#' "movavg" (optional) contains an integer value that specifies the size of +#' the moving average window to be applied to the variable. +#' Note that if "movavg" column is present in the \emph{state_order}, +#' its values will overwrite the function parameter. 
+#' +#' @param movavg [an integer] Optional, NULL by default.\cr +#' When an integer>= 2 is supplied, a moving average operation is applied +#' to all the non discrete and not contextual variables. If no \emph{state_order} +#' is provided, the discrete/continuous variables are deduced from the input +#' data. If you want to apply a moving average only on specific columns, +#' consider to use a \emph{movavg} column in the \emph{state_order} parameter. +#' +#' @param max_nodes [a positive integer] The maximum number of nodes in the +#' final temporal causal graph. The more nodes allowed in the temporal +#' causal discovery, the more precise will be the discovery but at the cost +#' of longer execution time. The default is set to 50 for fast causal +#' discovery. On recent computers, values up to 200 or 300 nodes are usually +#' possible (depending on the number of trajectories and time steps in the +#' input data). +#' +#' @param verbose_level [an integer value in the range [0,2], 1 by default] +#' The level of verbosity: 0 = no display, 1 = summary display, 2 = full display. 
+#' +#' @return A named list with two items: +#' \itemize{ +#' \item{\emph{n_layers}: the number of layers} +#' \item{\emph{delta_t}: the number of time steps between the layers} +#' } +#' +#' @export +#------------------------------------------------------------------------------- +estimateTemporalDynamic <- function (input_data, state_order=NULL, movavg=NULL, + max_nodes=50, verbose_level=1) + { + input_data = check_input_data (input_data, "TS") + state_order = check_state_order (input_data, state_order, "TS") + state_order$n_layers = NULL + state_order$delta_t = NULL + state_order = tmiic_check_state_order_part1 (state_order) + list_ret = tmiic_check_parameters (state_order = state_order, + params = list(), + n_layers = NULL, + delta_t = NULL, + movavg = movavg, + keep_max_data = F, + max_nodes = max_nodes) + state_order = tmiic_check_state_order_part2 (list_ret$state_order) + + list_ts = tmiic_extract_trajectories (input_data) + list_ts = tmiic_movavg (list_ts, state_order$movavg, verbose_level=verbose_level) + + state_order = tmiic_estimate_dynamic (list_ts, state_order, max_nodes=max_nodes, + verbose_level=verbose_level) + n_layers = unique (state_order$n_layers[ (state_order$var_type == 1) + & (state_order$is_contextual == 0) ]) + delta_t = unique (state_order$delta_t[ (state_order$var_type == 1) + & (state_order$is_contextual == 0) ]) + return ( list ("n_layers"=n_layers, "delta_t"=delta_t)) + } + diff --git a/R/tmiic.wrapper.R b/R/tmiic.wrapper.R new file mode 100644 index 00000000..1ad90263 --- /dev/null +++ b/R/tmiic.wrapper.R @@ -0,0 +1,762 @@ +#******************************************************************************* +# Filename : tmiic.wrapper.R Creation date: 24 march 2020 +# +# Description: Data transformation of time series for miic +# +# Author : Franck SIMON +#******************************************************************************* + +#------------------------------------------------------------------------------- +# 
tmiic_lag_state_order +#------------------------------------------------------------------------------- +# Modify the state order into a lagged version: the lagged variables are +# completed and/or repeated with lagX to match the lagged temporal graph. +# inputs: +# - state_order: a dataframe, the state order returned by +# tmiic_check_state_order_part2 +# Returns: a dataframe: the lagged state_order +#------------------------------------------------------------------------------- +tmiic_lag_state_order <- function (state_order) + { + n_vars <- nrow (state_order) + state_order$lag <- -1 + state_order$initial_var <- -1 + # + # Put lag0 and not lagged variable first + # + state_lagged <- state_order + for (var_idx in 1:n_vars) + { + if (state_lagged [var_idx, "is_contextual"] == 0) + { + state_lagged [var_idx, "var_names"] = paste0 (state_order [var_idx, "var_names"], "_lag0") + state_lagged [var_idx, "lag"] = 0 + state_lagged [var_idx, "initial_var"] = var_idx + } + else + { + state_lagged [var_idx, "lag"] = 0 + state_lagged [var_idx, "initial_var"] = var_idx + } + } + # + # Duplicate rows for lagged variables + # + state_lagged_nrows <- nrow (state_lagged) + n_layers_back_max <- max ( (state_lagged$n_layers - 1) ) + n_layers_back_idx = 1 + for (n_layers_back_idx in 1:n_layers_back_max) + { + var_idx = 1 + for (var_idx in 1:n_vars) + { + n_layers_back_of_var <- state_lagged[var_idx, "n_layers"] - 1 + if (n_layers_back_idx <= n_layers_back_of_var) + { + state_lagged_nrows <- state_lagged_nrows + 1 + state_lagged [state_lagged_nrows,] <- state_order [var_idx,] + lag <- n_layers_back_idx * state_order[var_idx, "delta_t"] + state_lagged [state_lagged_nrows, "var_names"] <- paste0 ( + state_order [var_idx, "var_names"], "_lag", lag) + state_lagged [state_lagged_nrows, "lag"] <- lag + state_lagged [state_lagged_nrows, "initial_var"] <- var_idx + } + } + } + return (state_lagged) + } + +#------------------------------------------------------------------------------- +# 
tmiic_lag_other_df +#------------------------------------------------------------------------------- +# Modify the complementary df int a lagged version: the 3 column dataframes are +# transformed into a 2 columns one, in which variables are transformed into +# their lagged representation. i.e: +# - lagged_var1 - lagged_var2 - 1 becomes lagged_var1_lag1 - lagged_var2_lag0 +# - ctx_var1 - lagged_var2 - NA becomes ctx_var1 - lagged_var2_lag0 +# inputs: +# - df: the dataframe to transform in its lagged version +# - state_order: a dataframe, the state order returned by +# tmiic_check_state_order_part2 +# Returns: a dataframe: the lagged dataframe +#------------------------------------------------------------------------------- +tmiic_lag_other_df <- function (state_order, df) + { + if ( (is.null (df)) || (nrow (df) <= 0) ) + return (df) + + for (i in 1:nrow (df)) + { + orig_node_idx <- which (state_order$var_names == df[i, 1]) + if (state_order[orig_node_idx, "is_contextual"] == 0) + df[i, 1] = paste0 (df [i, 1], "_lag", df [i, 3]) + df[i, 2] = paste0 (df [i, 2], "_lag0") + } + df <- df[,-3] + return (df) + } + +#------------------------------------------------------------------------------- +# tmiic_lag_input_data +#------------------------------------------------------------------------------- +# Reorganizes the inputs in a format usable by miic: input data are lagged +# using the history to create lagged variables +# The function slices the input data according to the information supplied in +# the state_order n_layers and delta_t. +# +# The number of variables is increased and renamed on n_layers +# layers by delta_t. steps. +# i.e. with n_layers=3 and delta_t.=3 : var1, var2 => +# var1_lag0, var2_lag0, var1_lag3, var2_lag3, var1_lag6, var2_lag6. +# +# Every time step (until number of time steps - (n_layers - 1) * delta_t.) +# is converted into a sample in the lagged data. 
+# +# Example with n_layers=3 and delta_t=3: +# +# Time step Var & value Var & value => Sample Var & value Var & value +# t-6 Var1_val(t-6) Var2_val(t-6) => i Var1_lag6_val Var2_lag6_val +# t-3 Var1_val(t-3) Var2_val(t-3) => i Var1_lag3_val Var2_lag3_val +# t Var1_val(t) Var2_val(t) => i Var1_lag0_val Var2_lag0_val +# +# t-7 Var1_val(t-7) Var2_val(t-7) => i' Var1_lag6_val Var2_lag6_val +# t-4 Var1_val(t-4) Var2_val(t-4) => i' Var1_lag3_val Var2_lag3_val +# t-1 Var1_val(t-1) Var2_val(t-1) => i' Var1_lag0_val Var2_lag0_val +# +# t-8 Var1_val(t-8) Var2_val(t-8) => i" Var1_lag6_val Var2_lag6_val +# t-5 Var1_val(t-5) Var2_val(t-5) => i" Var1_lag3_val Var2_lag3_val +# t-2 Var1_val(t-2) Var2_val(t-2) => i" Var1_lag0_val Var2_lag0_val +# +# ... ............. ............. => ...... ............. ............ +# +# until number of time steps - (n_layers - 1) * delta_t is reached. +# The same process is applied to all input time series. +# +# Note that the lagging can be different for each input variable +# if different values of n_layers or delta_t are supplied and some +# variables may not be lagged at all, like contextual ones. 
+# +# inputs: +# - list_ts: the list of time series +# - state_order: a dataframe, the lagged state order returned by +# tmiic_lag_state_order +# - keep_max_data: boolean flag, optional, FALSE by default +# When FALSE, the rows containing NA introduced by the lagging process +# are deleted, otherwise when TRUE, the rows are kept +#------------------------------------------------------------------------------- +tmiic_lag_input_data <- function (list_ts, state_order, keep_max_data=FALSE) + { + tau_max = max(state_order$lag) + na_count = 0 + list_ret = list() + for ( ts_idx in 1:length(list_ts) ) + { + df = list_ts[[ts_idx]] + # + # Check if the df has enough rows = timsteps to be lagged + # + if (nrow (df) <= tau_max) + { + if (!keep_max_data) + { + miic_warning ("data lagging", "the trajectory ", ts_idx, " has only ", + nrow (df), " time steps and will be ignored.") + list_ret[[ts_idx]] = df[FALSE,] + next + } + miic_warning ("data lagging", "the trajectory ", ts_idx, " has only ", + nrow (df), " time steps and can not be lagged over ", tau_max, + " time steps back.") + } + # + # Lag the df + # + list_tmp = list() + for ( var_idx in 1:nrow (state_order) ) + { + if (state_order[var_idx, "lag"] == 0) + list_tmp[[var_idx]] = df[,(var_idx+1)] + else + { + max_row = nrow(df) - state_order[var_idx, "lag"] + if (max_row <= 0) + list_tmp[[var_idx]] = rep (NA, nrow(df) ) + else + list_tmp[[var_idx]] = c ( rep (NA, state_order[var_idx, "lag"]), + df [1:max_row, + state_order[var_idx, "initial_var"]+1] ) + } + } + names(list_tmp) = state_order$var_names + # df <- as.data.frame (do.call (cbind, list_tmp) ) + df <- data.frame (list_tmp) + if (!keep_max_data) + df = df [(tau_max+1):nrow(df),] + # + # Check rows with only NAs + # + rows_only_na <- ( rowSums (is.na (df)) == ncol (df) ) + df <- df [!rows_only_na, ] + na_count = na_count + sum (rows_only_na) + + list_ret[[ts_idx]] = df + } + if (na_count > 0) + miic_warning ("data lagging", "the lagged data contains ", sum(na_count), 
+ " row(s) with only NAs. These row(s) have been removed.") + return (list_ret) + } + +#----------------------------------------------------------------------------- +# tmiic_precompute_lags_layers_and_shifts +#----------------------------------------------------------------------------- +# Utility function to precompute lags, layers and shifts of nodes in the +# lagged network +# +# params: tmiic_res [a tmiic object] The object returned by miic's +# execution in temporal mode. +# +# returns: a dataframe with lagged nodes as row name and 3 columns: +# - lags: the lag of each lagged node +# - corresp_nodes: the corresponding non lagged node +# - shifts: the shift to apply to find the next lagged node +#----------------------------------------------------------------------------- +tmiic_precompute_lags_layers_and_shifts <- function (tmiic_res) + { + list_nodes_not_lagged = tmiic_res$state_order$var_names + is_contextual = tmiic_res$state_order$is_contextual + n_nodes_not_lagged = length (list_nodes_not_lagged) + list_n_layers_back <- tmiic_res$state_order$n_layers - 1 + list_delta_t <- tmiic_res$state_order$delta_t + # + # Identify lag and layer of each node + # + list_lags <- rep(0, n_nodes_not_lagged) + list_nodes_lagged <- c() + list_corresp_nodes <- c() + i = 1 + for (node_idx in 1:n_nodes_not_lagged) + { + node_name <- list_nodes_not_lagged[[node_idx]] + list_corresp_nodes[[i]] <- node_name + + if (is_contextual[[node_idx]] == 0) + node_name <- paste0 (node_name, "_lag0") + list_nodes_lagged [[i]] <- node_name + i <- i + 1 + } + + n_layers_back_max <- max (list_n_layers_back) + for (n_layers_back_idx in 1:n_layers_back_max) + { + for (node_idx in 1:n_nodes_not_lagged) + { + n_layers_back_of_var <- list_n_layers_back[[node_idx]] + if (n_layers_back_idx <= n_layers_back_of_var) + { + node_name <- list_nodes_not_lagged[[node_idx]] + list_corresp_nodes[[i]] <- node_name + + lag <- n_layers_back_idx * list_delta_t[[node_idx]]; + node_name <- paste0 (node_name, 
"_lag", lag) + list_nodes_lagged [[i]] <- node_name + list_lags[[i]] <- lag + i <- i + 1 + } + } + } + # + # Precompute the index shifts from a node to its first lagged counterpart + # + n_nodes_shifts = n_nodes_not_lagged + end_reached = rep (FALSE, n_nodes_not_lagged) + + list_shifts <- c() + for (n_layers_back_idx in 1:(n_layers_back_max+1) ) + { + for (node_idx in 1:n_nodes_not_lagged) + { + n_layers_back_of_var <- list_n_layers_back[[node_idx]]; + if (n_layers_back_idx <= n_layers_back_of_var) + list_shifts <- append (list_shifts, n_nodes_shifts) + else if (!end_reached[[node_idx]]) + { + end_reached[[node_idx]] = TRUE; + list_shifts <- append (list_shifts, 0) + n_nodes_shifts <- n_nodes_shifts - 1 + } + } + } + + df_ret <- data.frame (lags=as.integer(unlist(list_lags)), + corresp_nodes=unlist(list_corresp_nodes), + shifts=as.integer(unlist(list_shifts)), + stringsAsFactors=FALSE) + rownames (df_ret) <- list_nodes_lagged + return (df_ret) + } + +#------------------------------------------------------------------------------- +# tmiic_combine_lag +#------------------------------------------------------------------------------- +# Utility function to combine lags when flattening the network. 
+# +# param: df, a non empty dataframe with the edges to combine +#------------------------------------------------------------------------------- +tmiic_combine_lag <- function (df) + { + # Reverse inverted edges (orient == -2) and duplicate lags of bidrectional + # temporal edges (lag != 0) + # NB: for non lag 0 edges, such cases are more than likely errors + # and will generate negative lags however showing them will allow the user + # to identify possible issues + # + for (idx in 1:nrow(df) ) + { + if (df[idx,"infOrt"] == -2) + df[idx, c("x","y","lag")] <- c (df[idx,"y"], df[idx,"x"], + -as.integer (df[idx,"lag"]) ) + + if ( (df[idx,"infOrt"] == 6) & (as.integer (df[idx,"lag"]) != 0) ) + df[nrow(df)+1, c("x","y","lag")] <- c (df[idx,"y"], df[idx,"x"], + -as.integer (df[idx,"lag"]) ) + } + # + # Combine lags from node1 < node2, lag0 and node1 >= node2 + # + list_lag1 <- unique (df[ ( (df$x < df$y) & (df$lag != 0) ),]$lag) + list_lag2 <- unique (df[ (df$lag == 0),]$lag) + list_lag3 <- unique (df[ ( (df$x >= df$y) & (df$lag != 0) ),]$lag) + + list_lag1 <- paste (unlist (list_lag1[order (list_lag1, decreasing=TRUE)]), collapse=",") + list_lag2 <- as.character (list_lag2) + list_lag3 <- paste (unlist (list_lag3[order (list_lag3)]), collapse=",") + + list_lags <- c(list_lag1[[1]], list_lag2, list_lag3[[1]]) + list_lags <- lapply (list_lags, function(z) { z[!is.na(z) & z != ""]}) + if ( (list_lag1[[1]] != "") & (list_lag3[[1]] != "") ) + list_lags <- paste (unlist (list_lags), collapse="/") + else + list_lags <- paste (unlist (list_lags), collapse=",") + + return (list_lags) + } + +#------------------------------------------------------------------------------- +# tmiic_combine_orient +#------------------------------------------------------------------------------- +# Utility function to combine edges orientations when flattening the network. 
+# +# params: +# - df: a dataframe with the edges to combine +# - col_name: string, the orientation column +#------------------------------------------------------------------------------- +tmiic_combine_orient <- function (df, col_name) + { + df <- df[!is.na (df[[col_name]]),] + if (nrow (df) <= 0) + return (NA) + # + # We set orientations as if node X <= node Y + # NB: we do not care of x, y, lag and proba columns as they are not used + # later in the function + # + for (idx in 1:nrow(df) ) + if ( (df[idx,"x"] > df[idx,"y"]) & (!is.na (df[idx, col_name])) ) + if (abs(df[idx, col_name]) == 2) + df[idx, col_name] <- -(df[idx, col_name]) + + col_min <- min (df[, col_name]) + col_max <- max (df[, col_name]) + + if ( (col_max == 6) | ((col_min == -2) & (col_max == 2)) ) + return (6) + else if (col_max == 2) + return (2) + else if (col_min == -2) + return (-2) + else + return (1) + } + +#----------------------------------------------------------------------------- +# tmiic_combine_probas +#----------------------------------------------------------------------------- +# Utility function to combine edge probabilities when flattening the network. 
+# Depending on the combined edges orientation, chooses the appropriate max, min +# or mean probabilities to compute the combined edge probabilities +# +# params: +# - df: the data frame with the edges to combine +# - comb_orient: integer, the orientation of the combined edge +#----------------------------------------------------------------------------- +tmiic_combine_probas <- function (df, comb_orient) + { + valid_probas <- grepl (';', df$proba, fixed=TRUE) + df <- df[valid_probas,] + if (nrow (df) <= 0) + return (NA) + # + # We set probas like if we have node X <= node Y + # + for ( idx in 1:nrow(df) ) + if (df[idx,"x"] > df[idx,"y"]) + { + proba_split <- strsplit (df[idx, "proba"], ';' )[[1]] + df[idx, "proba"] <- paste (proba_split[[2]], proba_split[[1]], sep=";") + } + # + # Split proba column so we can do maths on it + # + probas_split <- strsplit (df$proba, ';' ) + df_probas <- do.call(rbind, probas_split) + df_probas <- data.frame (x=as.numeric ( as.character (df_probas[,1]) ), + y=as.numeric ( as.character (df_probas[,2]) ) ) + # + # Depending on the pre-computed combined orientation, keep max/min/avg + # + if (comb_orient == 6) + return (paste (max(df_probas[,1]), max(df_probas[,2]), sep=";") ) + if (comb_orient == 2) + return (paste (min(df_probas[,1]), max(df_probas[,2]), sep=";") ) + if (comb_orient == -2) + return (paste (max(df_probas[,1]), min(df_probas[,2]), sep=";") ) + return (paste (mean(df_probas[,1]), mean(df_probas[,2]), sep=";")) + } + +#----------------------------------------------------------------------------- +# tmiic_flatten_network +#----------------------------------------------------------------------------- +# Flattten the lagged network returned by tmiic for plotting +# +# In temporal mode, the network returned by miic contains lagged nodes +# (X_lag0, X_lag1, ...). This function flatten the network depending +# of the flatten_mode parameter. 
+# Note that only the summary data frame is flattened and the adjacency matrix +# is reduced to non lagged nodes and filled with NA during the process +# +# params: +# - tmiic_res: a tmiic object, returned by tmiic +# +# - flatten_mode: string, optional, default value "compact". +# Possible values are "compact", "combine", "unique", "drop": +# * "compact": the default. Nodes and edges are converted into a flattened +# version preserving all the initial information. +# i.e.: X_lag1->Y_lag0, X_lag0<-Y_lag2 become respectively X->Y lag=1, +# X<-Y lag=2. +# * "combine": one edge will be kept per couple of nodes. +# The info_shifted will be the highest of the summarized edges whilst +# the lag and orientation of the summarized edge will be an aggregation. +# i.e.: X_lag2->Y_lag0, X_lag0<-Y_lag1 will become X<->Y lag=1,2 with +# the info_shifted of X_lag2->Y_lag0 if info_shifted of +# X_lag2->Y_lag0 > X_lag0<-Y_lag1. +# * "unique": only the edges having the highest info_shifted for a couple +# of nodes are kept in the flattened network. If several edges between +# the sames nodes have the same info_shifted, then the edge kept is +# the one with the minimum lag. +# i.e.: X_lag1->Y_lag0, X_lag0<-Y_lag2 with info_shifted of +# X_lag1->Y_lag0 > X_lag0<-Y_lag2 become X->Y lag=1. +# * "drop"}, only the edges having the +# highest info_shifted for a couple of nodes are kept in the flattened +# network. If several edges between the sames nodes have the same +# info_shifted, then the edge kept is the one with the minimum lag.\cr +# i.e. : X_lag1->Y_lag0, X_lag0<-Y_lag2 with info_shifted of +# X_lag1->Y_lag0 > X_lag0<-Y_lag2 become X->Y. The lag information is +# "lost" after flattening +# +# Note that for all modes other than "drop", lag is a new column added +# in the dataframe. +# +# - keep_edges_on_same_node: boolean, optional, TRUE by default. +# When TRUE, the edges like X_lag0-X_lag1 are kept during flattening +# (it becomes an X-X edge). 
When FALSE, only edges having different nodes +# are kept in the flatten network. +# +# returns: a tmiic object. The returned tmiic object is the one received +# as input where the summary dataframe has been flattened and the adjacency +# matrix reduced to the non lagged nodes +#----------------------------------------------------------------------------- +tmiic_flatten_network <- function (tmiic_res, flatten_mode="compact", + keep_edges_on_same_node=TRUE) + { + # Reduce size of adj_matrix to non lagged nodes + # (we don't care about content as it is not used for plotting) + # + list_nodes <- tmiic_res$state_order$var_names + tmiic_res$adj_matrix <- matrix(NA, nrow=0, ncol=length (list_nodes)) + colnames(tmiic_res$adj_matrix) <- list_nodes + # + # Keep only edges found by miic + # + df_edges <- tmiic_res$all.edges.summary[tmiic_res$all.edges.summary$type %in% c('P', 'TP', 'FP'), ] + if (nrow(df_edges) <= 0) + { + if (flatten_mode != "drop") + df_edges$lag = numeric(0) + tmiic_res$all.edges.summary <- df_edges + return (tmiic_res) + } + # + # Precompute lag and layer of each node + # + df_precomputed <- tmiic_precompute_lags_layers_and_shifts (tmiic_res) + # + # First step, perform flatten_mode="compact": + # from summary, remove lag info from nodes names and put it into a lag column + # + df_edges$lag <- -1 + for ( edge_idx in 1:nrow(df_edges) ) + { + one_edge <- df_edges[edge_idx,] + lag_x <- df_precomputed [[one_edge$x, "lags"]] + node_x <- df_precomputed [[one_edge$x, "corresp_nodes"]] + lag_y <- df_precomputed [[one_edge$y, "lags"]] + node_y <- df_precomputed [[one_edge$y, "corresp_nodes"]] + # + # Ensure the lag is > 0 (=> we put nodes as x=oldest to y=newest) + # + lag = lag_x - lag_y + if (lag >= 0) + df_edges [edge_idx, c("x","y","lag")] <- c(node_x, node_y, lag) + else + { + df_edges [edge_idx, c("x","y","lag")] <- c(node_y, node_x, -lag) + if (abs (one_edge$infOrt) == 2) + df_edges [edge_idx,"infOrt"] <- -one_edge$infOrt + if ( !is.na (one_edge$trueOrt ) 
) + if (abs (one_edge$trueOrt ) == 2) + df_edges [edge_idx,"trueOrt"] <- -one_edge$trueOrt + if ( !is.na (one_edge$proba ) ) + { + df_edges [edge_idx, "proba"] = paste0 (rev ( + strsplit (df_edges [edge_idx, "proba"], ";")[[1]]), collapse=";") + } + } + } + df_edges <- transform ( df_edges, lag = as.integer (lag) ) + # + # Exclude self loops if requested + # + if (!keep_edges_on_same_node) + df_edges <- df_edges[df_edges$x != df_edges$y, ] + if (nrow(df_edges) <= 0) + { + if (flatten_mode == "drop") + df_edges$lag <- NULL + tmiic_res$all.edges.summary <- df_edges + return (tmiic_res) + } + # + # "compact" mode is done + # + if (flatten_mode != "compact") + { + # if mode != "compact", we want only one edge per couple of nodes: + # the edges kept per couple of nodes will be the one having the max + # info_shifted and if several edges have the same info_shifted, + # the one with the minimum lag. + # + # Identify the couples of same X-Y or Y-X whatever the lag or orientation + # + df_xy <- df_edges[,c("x", "y")] + list_rows_to_swap <- (df_xy$x > df_xy$y) + df_xy [list_rows_to_swap, c("x","y")] <- df_xy [list_rows_to_swap, c("y","x")] + df_xy <- unique (df_xy) + # + # Keep one edge per couple of nodes + # + df_group <- df_edges[FALSE,] + for ( xy_idx in 1:nrow(df_xy) ) + { + ref_x <- df_xy[xy_idx,"x"] + ref_y <- df_xy[xy_idx,"y"] + cond_same_edges = ( ( (df_edges[["x"]] == ref_x) & (df_edges[["y"]] == ref_y) ) + | ( (df_edges[["x"]] == ref_y) & (df_edges[["y"]] == ref_x) ) ) + df_same <- df_edges[cond_same_edges,] + + if (nrow (df_same) > 1) + { + if (flatten_mode == "combine") + { + # Combine lag, orient and proba + # + df_same$new_lag <- tmiic_combine_lag (df_same) + comb_infOrt <- tmiic_combine_orient (df_same, "infOrt") + df_same$proba <- tmiic_combine_probas (df_same, comb_infOrt) + df_same$trueOrt <- tmiic_combine_orient (df_same, "trueOrt") + df_same$infOrt <- comb_infOrt + # + # Orientations and probas have been computed for x <= y, + # so force x <= y on all 
rows + # + if (ref_x <= ref_y) + { + df_same[,"x"] <- ref_x + df_same[,"y"] <- ref_y + } + else + { + df_same[, "x"] <- ref_y + df_same[, "y"] <- ref_x + } + } + + max_info <- max (df_same[["info_shifted"]]) + df_same <- df_same[ (df_same[["info_shifted"]] == max_info),] + } + if (nrow(df_same) > 1) + { + min_lag <- min (df_same[["lag"]]) + df_same <- df_same[ (df_same[["lag"]] == min_lag),] + } + if ("new_lag" %in% colnames(df_same) ) + { + df_same$lag <- df_same$new_lag + df_same$new_lag <- NULL + } + df_group <- rbind (df_group, df_same) + } + df_edges <- df_group + } + # + # Remove lag info when not wanted + # + if (flatten_mode == "drop") + # + # We do not want to keep info about lag at all + # + df_edges$lag <- NULL + else + { + # For contextual variable, we clean the lag info + # + is_contextual <- tmiic_res$state_order$is_contextual + if (!is.null(is_contextual)) + { + list_nodes_not_lagged = tmiic_res$state_order$var_names + for ( edge_idx in 1:nrow(df_edges) ) + { + one_edge <- df_edges[edge_idx,] + x_idx <- which (list_nodes_not_lagged == one_edge$x) + y_idx <- which (list_nodes_not_lagged == one_edge$y) + if (is_contextual[[x_idx]] | is_contextual[[y_idx]]) + df_edges[edge_idx, "lag"] <- "" + } + } + } + # + # returns the tmiic structure where network summary has been flattened + # + tmiic_res$all.edges.summary <- df_edges + return (tmiic_res) + } + +#----------------------------------------------------------------------------- +# tmiic_repeat_edges_over_history +#----------------------------------------------------------------------------- +# Duplicates edges found by miic over the history assuming stationarity +# +# In temporal mode, the network returned by miic contains only edges +# with at least one contemporaneous node (lag0). This function duplicates +# the edges over the history. +# i.e: assuming that we used nlayers=4 and delta_t=1, the edge X_lag0-X_lag1 +# will be copied as X_lag1-X_lag2 and X_lag2-X_lag3. 
+# +# param: tmiic_res, the object returned by tmiic +# +# returns: a dataframe with edges completed by stationarity +#----------------------------------------------------------------------------- +tmiic_repeat_edges_over_history <- function (tmiic_res) + { + # Consider only edges found by miic type = "P", "TP", "FP" + # + df_edges <- tmiic_res$all.edges.summary[tmiic_res$all.edges.summary$type %in% c('P', 'TP', 'FP'), ] + if (nrow(df_edges) <= 0) + return (df_edges) + # + # Precompute lag, layer and shift of each node + # + df_precomp <- tmiic_precompute_lags_layers_and_shifts (tmiic_res) + list_n_layers_back <- tmiic_res$state_order$n_layers - 1 + list_nodes_not_lagged <- tmiic_res$state_order$var_names + # + # Duplicate the edges over all layers of history + # + n_edges <- nrow(df_edges) + for (edge_idx in 1:n_edges) + { + node_x <- df_edges[edge_idx,"x"] + node_y <- df_edges[edge_idx,"y"] + node_x_pos = which (rownames (df_precomp) == node_x) + node_y_pos = which (rownames (df_precomp) == node_y) + # + # If one of the variable is not lagged, the lag is not constant + # + sav_lag = df_precomp [node_x_pos, "lags"] - df_precomp [node_y_pos, "lags"] + node_x_base <- df_precomp [node_x_pos, "corresp_nodes"] + node_y_base <- df_precomp [node_y_pos, "corresp_nodes"] + n_layers_back_x <- list_n_layers_back [[which (list_nodes_not_lagged == node_x_base)]] + n_layers_back_y <- list_n_layers_back [[which (list_nodes_not_lagged == node_y_base)]] + same_lag_needed = TRUE + if (n_layers_back_x <= 0) + same_lag_needed = FALSE + if (n_layers_back_y <= 0) + same_lag_needed = FALSE + # + # Duplication of the edge + # + while (TRUE) + { + # We shift the nodes positions using pre-computed nodes shifts + # + node_x_shift = df_precomp [node_x_pos, "shifts"] + node_y_shift = df_precomp [node_y_pos, "shifts"] + if ( (node_x_shift <= 0) & (node_y_shift <= 0) ) + break + node_x_pos = node_x_pos + node_x_shift + node_y_pos = node_y_pos + node_y_shift + # + # Ensure if both variable are 
lagged than we keep the same lag when duplicating + # + same_lag_impossible = FALSE + if (same_lag_needed) + { + new_lag = df_precomp [node_x_pos, "lags"] - df_precomp [node_y_pos, "lags"] + while (sav_lag != new_lag) + { + if (sav_lag < new_lag) + { + node_y_shift = df_precomp [node_y_pos, "shifts"] + if (node_y_shift <= 0) + { + same_lag_impossible = TRUE + break + } + node_y_pos = node_y_pos + node_y_shift; + } + else # sav_lag > new_lag + { + node_x_shift = df_precomp [node_x_pos, "shifts"] + if (node_x_shift <= 0) + { + same_lag_impossible = TRUE + break + } + node_x_pos = node_x_pos + node_x_shift; + } + new_lag = df_precomp [node_x_pos, "lags"] - df_precomp [node_y_pos, "lags"] + } + } + if (same_lag_impossible) + break + # + # Add the duplicated edge + # + df_edges [ nrow(df_edges)+1, ] <- df_edges [ edge_idx, ] + df_edges [nrow(df_edges),"x"] <- rownames(df_precomp)[[node_x_pos]] + df_edges [nrow(df_edges),"y"] <- rownames(df_precomp)[[node_y_pos]] + } + } + return (df_edges) + } + diff --git a/README.md b/README.md index d739c2eb..8f644619 100644 --- a/README.md +++ b/README.md @@ -6,11 +6,15 @@ status](https://github.com/miicTeam/miic_R_package/workflows/R-CMD-check/badge.svg)](https://github.com/miicTeam/miic_R_package/actions) -This repository contains the source code for MIIC (**M**ultivariate **I**nformation based **I**nductive **C**ausation), a method based on constraint-based approaches that learns a large class of causal or non-causal graphical models from purely observational data while including the effects of unobserved latent variables. Starting from a complete graph, the method iteratively removes dispensable edges, by uncovering significant information contributions from indirect paths, and assesses edge-specific confidences from randomization of available data. The remaining edges are then oriented based on the signature of causality in observational data. 
This approach can be applied on a wide range of datasets and provide new biological insights on regulatory networks from single cell expression data, genomic alterations during tumor development and co-evolving residues in protein structures. For more information you can refer to Cabeli et al. PLoS Comp. Bio. 2020, Verny et al. PLoS Comp. Bio. 2017. +This repository contains the source code for MIIC (**M**ultivariate **I**nformation based **I**nductive **C**ausation), a method based on constraint-based approaches that learns a large class of causal or non-causal graphical models from purely observational data while including the effects of unobserved latent variables. Starting from a complete graph, the method iteratively removes dispensable edges, by uncovering significant information contributions from indirect paths, and assesses edge-specific confidences from randomization of available data. The remaining edges are then oriented based on the signature of causality in observational data. This approach can be applied on a wide range of datasets and provide new biological insights on regulatory networks from single cell expression data, genomic alterations during tumor development and co-evolving residues in protein structures. Since the version 2.0, MIIC can in addition process +stationary time series to unveil temporal causal graphs. ## References -Cabeli V., Li H., Ribeiro-Dantas M., Simon F., Isambert H.; [Reliable causal discovery based on mutual information supremum principle for finite dataset; -Why21 at NeurIPS, 2021](https://why21.causalai.net/papers/WHY21_24.pdf). +Simon F., Comes M. C., Tocci T., Dupuis L., Cabeli V., Lagrange N., Mencattini A., Parrini M. C., Martinelli E., Isambert H.; [CausalXtract: a flexible pipeline to extract causal effects from live-cell time-lapse imaging data; eLife, reviewed preprint](https://www.biorxiv.org/content/10.1101/2024.02.06.579177v1.abstract). + +Ribeiro-Dantas M. D. 
C., Li H., Cabeli V., Dupuis L., Simon F., Hettal L., Hamy A. S., Isambert H.; [Learning interpretable causal networks from very large datasets, application to 400,000 medical records of breast cancer patients; iScience, 2024](https://arxiv.org/abs/2303.06423). + +Cabeli V., Li H., Ribeiro-Dantas M., Simon F., Isambert H.; [Reliable causal discovery based on mutual information supremum principle for finite dataset; Why21 at NeurIPS, 2021](https://why21.causalai.net/papers/WHY21_24.pdf). Cabeli V., Verny L., Sella N., Uguzzoni G., Verny M., Isambert H.; Learning clinical networks from medical records based on information estimates in mixed-type data; PLoS computational biology., 2020. [doi:10.1371/journal.pcbi.1007866](https://doi.org/10.1371/journal.pcbi.1007866) | [code](https://github.com/vcabeli/miic_PLoS) @@ -60,6 +64,11 @@ if(require(igraph)) { You can find the documentation pages in the "man" folder, in the auto generated [PDF](https://cran.r-project.org/web/packages/miic/miic.pdf), or use R functions `help()` and `?`. 
## Authors +- Tiziana Tocci +- Nikita Lagrange +- Orianne Debeaupuis +- Louise Dupuis +- Franck Simon - Vincent Cabeli - Honghao Li - Marcel Ribeiro Dantas diff --git a/data/covidCases.rda b/data/covidCases.rda new file mode 100644 index 0000000000000000000000000000000000000000..7e5195121881011c92d5a9f7398f401146b94178 GIT binary patch literal 1977 zcmV;q2S)fGiwFP!000002F=)ckQ`+h$MJdJ?&;Z?*_qjy>}EF@uAq3w2;O+0K@d=k z0W^YQ)+A^&mq`HejJM*2w=7-x0Y9_sO7Dw`$tdwWfV*!}fDGZ(KdQYtyb; ztytoh3LU(4{;qh^g?J40(e(z#Fy8{&7#}^?MGq6`V}NLP5>r@;Y0O|54#Ewv90%hN z9E!tmI9A{Y+z>ayk+?B#f}7%IxH)ctTVf?{g;lsUj>6Hn4Q`9u;r6%#?ua|#7~C0m z!Ci4T9E-c-9$1aza8DeM6L2q_h?8(K?v4B46xsvlkj9b1y9A(@N_%_&&0EE0iKQL;JJ7no{tOh0=y6}!i(_|yc93P%W)B2 zfmh;Uyb7=R1cJ~T%bK0h%9xZo&BBb@*5`%p)e6Y3$*RTWeF-dO|=AhyEp7`i~%wCL67{|n4 zS%g@v#7N71Q?%F8XUn~bE?Ujun8u=QdTxmMie9VV!8b`?GDdBe+N;E0X*^lX6#OHJ z?R>1?*62rUWzS{0P4DvlU$g(L^H17eZ7)9Z(Zg>MF_3=ioOdNUOJYBTRk+Z-XzUk?xb&PoJ+{ zx0>B|3yl@~9Ov4<^_v&nw^)z4_TO#sT;!1O`^XvL!;u@p_ruS_XK6OYdBl5ow7CyY zNb*TR*ZDp8p=M~l{Bv9}iv7q<-x@xjE*Qo_=rKjkq|bEL zQ2&{iq|kNbqrc&!j^&6XSLMio|1Zt%yDj$L|J4??OXUBk6C(FUK8}2D9&5j^&HF-D zQzZF1eciFVU&LI{bNM54{7wEqF9j_&`B}?-%|H8OJfKYzy-OU^!G!!gDgLMB_GR=v z^?tgf?MeA|l4fT-J8iC~&DErF^z_v;cY}98ozge&gHLJa2Zk&*+WPMP(>aawv_0@# z&-plSZC^a})2{wHwA7*R_AhAktA01G4o_~mzs+Bp-n+@~D%u~TgE2AGSOYq3XuGby zDD_+N&$@hE=izm=R(Z@QK7Z+)RtlOgehm4k(0A5n`h;Uz%NGs3Ija}4ch6!usoj#= zAn7b>}HY`r~S zYJP@x_+iiW{r&9H<&3~1vJb^qaN&=un22)=w6UtDHRmeTT}Vqp1S)CX$0|Ly?RC8f4^SJG-XW`TbJVjuf4<&0a^$h6jIc@7 z)*Oi*lbR?+U$MUM&1&Bd&F>k`<@G_c`^)ud(Y{=Cejr=9=)4&98Fg}JbUoIVQPVEe zw^8e^9v#P;F4k7DCOKzR@0q9h8cS}=)(DwqvK(Kwhu%}rbDh5HZ=<2`NA+&=PoVu) zXi&{M@iV9!`uw@8ooV{%eh+g0l)7-@?=&oyR``ALvSDpIX?@h!_ked#@#85vTk4sC zoH@Z;`@F1AcT;l35;>&Lrzd!Nk3aWlefmPaZ#_DQ{2*ua&s_AMh5EMH zt?^HV`F{jdqaVjk+w8~Pn|6%OtK)ahZriYFbk;aw)6T84+sXYVwSMYlJ1Za;73!T>{Qd5=GHb91pm-y8f3 LMI?C)I}rc?s4h1J literal 0 HcmV?d00001 diff --git a/data/datalist b/data/datalist index 285e0e3a..8a94a1ec 100755 --- a/data/datalist +++ b/data/datalist @@ -1,3 +1,4 @@ 
hematoData cosmicCancer ohno +covidCases diff --git a/man/covidCases.Rd b/man/covidCases.Rd new file mode 100644 index 00000000..56200cfe --- /dev/null +++ b/man/covidCases.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{covidCases} +\alias{covidCases} +\title{Covid cases} +\format{ +A data.frame object. +} +\usage{ +data(covidCases) +} +\description{ +Demo dataset of chronological series to be used in temporal mode of miic. +Evolution of Covid cases on a subset of EU countries from 12/31/2019 to 06/18/2020. +Source of the data : European Centre for Disease Prevention and Control. +} +\references{ +ECDC (\href{https://www.ecdc.europa.eu/en/publications-data/download-todays-data-geographic-distribution-covid-19-cases-worldwide}{ECDC link}) +} +\keyword{data} +\keyword{datasets} diff --git a/man/discretizeMutual.Rd b/man/discretizeMutual.Rd index f9165ffa..871f5e92 100644 --- a/man/discretizeMutual.Rd +++ b/man/discretizeMutual.Rd @@ -55,16 +55,18 @@ ggplot2 and gridExtra).} \value{ A list that contains : \itemize{ -\item{two vectors containing the cutpoints for each variable : \emph{cutpoints1} corresponds to /emph{myDist1}, /emph{cutpoints2} corresponds to /emph{myDist2}.} +\item{two vectors containing the cutpoints for each variable : +\emph{cutpoints1} corresponds to \emph{X}, +\emph{cutpoints2} corresponds to \emph{Y}.} \item{\emph{niterations} is the number of iterations performed before convergence of the (C)MI estimation.} \item{\emph{iterationN}, lists contatining the cutpoint vectors for each iteration.} \item{\emph{info} and \emph{infok}, the estimated (C)MI value and (C)MI minus the complexity cost.} -\item{if $emph{plot} == TRUE, a plot object (requires ggplot2 and gridExtra).} +\item{if \emph{plot} == TRUE, a plot object (requires ggplot2 and gridExtra).} } } \description{ This function chooses cutpoints in the input distributions by maximizing the mutual -information 
minus a complexity cost (computed as BIC or with the Normalized Maximum Likelihood ). The +information minus a complexity cost (computed as BIC or with the Normalized Maximum Likelihood). The (conditional) mutual information computed on the optimized discretized distributions effectively approaches the mutual information computed on the original continuous variables. } diff --git a/man/estimateTemporalDynamic.Rd b/man/estimateTemporalDynamic.Rd new file mode 100644 index 00000000..e6d6b52b --- /dev/null +++ b/man/estimateTemporalDynamic.Rd @@ -0,0 +1,82 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/tmiic.utils.R +\name{estimateTemporalDynamic} +\alias{estimateTemporalDynamic} +\title{Estimation of the temporal causal discovery parameters} +\usage{ +estimateTemporalDynamic( + input_data, + state_order = NULL, + movavg = NULL, + max_nodes = 50, + verbose_level = 1 +) +} +\arguments{ +\item{input_data}{[a data frame] +A data frame containing the observational data.\cr +The expected data frame layout is variables as columns and +time series/time steps as rows. +The time step information must be supplied in the first column and, +for each time series, be consecutive and in ascending order (increment of 1). +Multiple trajectories can be provided, the function will consider that a +new trajectory starts each time a smaller time step than the one of the +previous row is encountered.} + +\item{state_order}{[a data frame] An optional data frame providing extra +information about variables. It must have d rows where d is the number of +input variables, excluding the time step one.\cr +For optional columns, if they are not provided or contain missing +values, default values suitable for \emph{input_data} will be used. + +The following structure (named columns) is expected:\cr + +"var_names" (required) contains the name of each variable as specified +by colnames(input_data), excluding the time steps column. 
+ +"var_type" (optional) contains a binary value that specifies if each +variable is to be considered as discrete (0) or continuous (1). +Discrete variables will be excluded from the temporal dynamic estimation. + +"is_contextual" (optional) contains a binary value that specifies if a +variable is to be considered as a contextual variable (1) or not (0). +Contextual variables will be excluded from the temporal dynamic estimation. + +"movavg" (optional) contains an integer value that specifies the size of +the moving average window to be applied to the variable. +Note that if "movavg" column is present in the \emph{state_order}, +its values will overwrite the function parameter.} + +\item{movavg}{[an integer] Optional, NULL by default.\cr +When an integer >= 2 is supplied, a moving average operation is applied +to all the non-discrete and non-contextual variables. If no \emph{state_order} +is provided, the discrete/continuous variables are deduced from the input +data. If you want to apply a moving average only on specific columns, +consider using a \emph{movavg} column in the \emph{state_order} parameter.} + +\item{max_nodes}{[a positive integer] The maximum number of nodes in the +final temporal causal graph. The more nodes allowed in the temporal +causal discovery, the more precise will be the discovery but at the cost +of longer execution time. The default is set to 50 for fast causal +discovery. 
On recent computers, values up to 200 or 300 nodes are usually +possible (depending on the number of trajectories and time steps in the +input data).} + +\item{verbose_level}{[an integer value in the range [0,2], 1 by default] +The level of verbosity: 0 = no display, 1 = summary display, 2 = full display.} +} +\value{ +A named list with two items: +\itemize{ + \item{\emph{n_layers}: the number of layers} + \item{\emph{delta_t}: the number of time steps between the layers} +} +} +\description{ +This function estimates the number of layers and number of +time steps between each layer that are needed to cover the dynamic of a +temporal dataset when reconstructing a temporal causal graph. +Using autocorrelation decay, the function computes the average relaxation +time of the variables and, in regard of a maximum number of nodes, deduces +the number of layers and number of time steps between each layer to be used. +} diff --git a/man/miic.Rd b/man/miic.Rd index 5e34307b..633bff3b 100644 --- a/man/miic.Rd +++ b/man/miic.Rd @@ -25,275 +25,462 @@ miic( max_iteration = 100, consensus_threshold = 0.8, negative_info = FALSE, + mode = "S", + n_layers = NULL, + delta_t = NULL, + movavg = NULL, + keep_max_data = FALSE, + max_nodes = 50, verbose = FALSE ) } \arguments{ -\item{input_data}{[a data frame] +\item{input_data}{[a data frame, required] + A n*d data frame (n samples, d variables) that contains the observational data. -Each column corresponds to one variable and each row is a sample that gives the -values for all the observed variables. The column names correspond to the -names of the observed variables. Numeric columns will be treated as continuous -values, factors and character as categorical.} -\item{state_order}{[a data frame] An optional data frame providing extra -information for variables. 
It must have d rows where d is the number of input -variables, and the following structure (named columns): +In standard mode, each column corresponds to one variable and each row is a +sample that gives the values for all the observed variables. +The column names correspond to the names of the observed variables. +Numeric columns with at least 5 distinct values will be treated as continuous +by default whilst numeric columns with less than 5 distinct values, factors +and characters will be considered as categorical. + +In temporal mode, the expected data frame layout is variables as columns +and time series/time steps as rows. +The time step information must be supplied in the first column and, +for each time series, be consecutive and in ascending order (increment of 1). +Multiple trajectories can be provided, miic will consider that a new trajectory +starts each time a smaller time step than the one of the previous row is encountered.} + +\item{state_order}{[a data frame, optional, NULL by default] + +A data frame providing extra information for variables. It must have d rows +where d is the number of input variables and possible columns are described +below. For optional columns, if they are not provided or contain missing +values, default values suitable for \emph{input_data} will be used. -"var_names" (required) contains the name of each variable as specified -by colnames(input_data). +\emph{"var_names"} (required) contains the name of each variable as specified +by colnames(input_data). In temporal mode, the time steps column should +not be mentioned in the variables list. -"var_type" (optional) contains a binary value that specifies if each +\emph{"var_type"} (optional) contains a binary value that specifies if each variable is to be considered as discrete (0) or continuous (1). 
-"levels_increasing_order" (optional) contains a single character string +\emph{"levels_increasing_order"} (optional) contains a single character string with all of the unique levels of the ordinal variable in increasing order, delimited by comma ','. It will be used during the post-processing to compute the sign of an edge using Spearman's rank correlation. If a variable is continuous or is categorical but not ordinal, this column should be NA. -"is_contextual" (optional) contains a binary value that specifies if a +\emph{"is_contextual"} (optional) contains a binary value that specifies if a variable is to be considered as a contextual variable (1) or not (0). Contextual variables cannot be the child node of any other variable (cannot have edge with arrowhead pointing to them). -"is_consequence" (optional) contains a binary value that specifies if a +\emph{"is_consequence"} (optional) contains a binary value that specifies if a variable is to be considered as a consequence variable (1) or not (0). -Consequence variables cannot be the parent node of any other variable -and cannot be used as contributors.} +Edges between consequence variables are ignored, consequence variables +cannot be the parent node of any other variable and cannot be used as +contributors. Edges between a non consequence and consequence variables +are pre-oriented toward the consequence. + +Several other columns are possible in temporal mode: + +\emph{"n_layers"} (optional) contains an integer value that specifies the +number of layers to be considered for the variable. +Note that if a \emph{"n_layers"} column is present in the \emph{state_order}, +its values will overwrite the function parameter. + +\emph{"delta_t"} (optional) contains an integer value that specifies the number +of time steps between each layer for the variable. +Note that if a \emph{"delta_t"} column is present in the \emph{state_order}, +its values will overwrite the function parameter. 
+ +\emph{"movavg"} (optional) contains an integer value that specifies the size of +the moving average window to be applied to the variable. +Note that if \emph{"movavg"} column is present in the \emph{state_order}, +its values will overwrite the function parameter.} + +\item{true_edges}{[a data frame, optional, NULL by default] + +A data frame containing the edges of the true graph for +computing performance after the run.\cr +In standard mode, the expected layout is a two columns data frame, each row +representing a true edge with in each column, the variable names. +Variables names must exist in the \emph{input_data} data frame.\cr +In temporal mode, the expected layout is a three columns data frame, +with the first two columns being variable names and the third the lag. +Variables names must exist in the \emph{input_data} data frame and the lag +must be valid in the time unfolded graph. e.g. a row var1, var2, 3 is valid +with \emph{n_layers} = 4 + \emph{delta_t} = 1 or +\emph{n_layers} = 2 + \emph{delta_t} = 3 +but not for \emph{n_layers} = 2 + \emph{delta_t} = 2 as there is no matching +edge in the time unfolded graph. Please note that the order is important: +var1, var2, 3 is interpreted as var1_lag3 - var2_lag0. Please note also that, +for contextual variables that are not lagged, the expected value in the +third column for the time lag is NA.} + +\item{black_box}{[a data frame, optional, NULL by default] + +A data frame containing pairs of variables that will be considered +as independent during the network reconstruction. In practice, these edges +will not be included in the skeleton initialization and cannot be part of +the final result.\cr +In standard mode, the expected layout is a two columns data frame, each row +representing a forbidden edge with in each column, the variable names. 
+Variables names must exist in the \emph{input_data} data frame.\cr +In temporal mode, the expected layout is a three columns data frame, +with the first two columns being variable names and the third the lag. +Variables names must exist in the \emph{input_data} data frame and the lag +must be valid in the time unfolded graph. e.g. a row var1, var2, 3 is valid +with \emph{n_layers} = 4 + \emph{delta_t} = 1 or +\emph{n_layers} = 2 + \emph{delta_t} = 3 +but not for \emph{n_layers} = 2 + \emph{delta_t} = 2 as there is no matching +edge in the time unfolded graph. Please note that the order is important: +var1, var2, 3 is interpreted as var1_lag3 - var2_lag0. Please note also that, +for contextual variables that are not lagged, the expected value in the +third column for the time lag is NA.} + +\item{n_threads}{[a positive integer, optional, 1 by default] -\item{true_edges}{[a data frame] -An optional E*2 data frame containing the E edges of the true graph for -computing performance after the run.} - -\item{black_box}{[a data frame] -An optional E*2 data frame containing E pairs of variables that will be considered -as independent during the network reconstruction. In practice, these edges will not -be included in the skeleton initialization and cannot be part of the final result. -Variable names must correspond to the \emph{input_data} data frame.} - -\item{n_threads}{[a positive integer] When set greater than 1, n_threads parallel threads will be used for computation. Make sure your compiler is compatible with openmp if you wish to use multithreading.} -\item{cplx}{[a string; \emph{c("nml", "mdl")}] -In practice, the finite size of the input -dataset requires that the 2-point and 3-point information measures should be -\emph{shifted} by a \emph{complexity} term. The finite size corrections can be -based on the Minimal Description Length (MDL) criterion (set the option with "mdl"). 
-In practice, the MDL complexity criterion tends to underestimate the relevance of -edges connecting variables with many different categories, leading to the removal of -false negative edges. To avoid such biases with finite datasets, the (universal) -Normalized Maximum Likelihood (NML) criterion can be used (set the option with "nml"). -The default is "nml" (see Affeldt \emph{et al.}, UAI 2015).} - -\item{orientation}{[a boolean value] -The miic network skeleton can be partially directed -by orienting and propagating edge directions, based on the sign and magnitude -of the conditional 3-point information of unshielded triples. The propagation -procedure relyes on probabilities; for more details, see Verny \emph{et al.}, PLoS Comp. Bio. 2017). -If set to FALSE the orientation step is not performed.} - -\item{ori_proba_ratio}{[a floating point between 0 and 1] The threshold when -deducing the type of an edge tip (head/tail) from the probability of -orientation. For a given edge tip, denote by p the probability of it being a -head, the orientation is accepted if (1 - p) / p < ori_proba_ratio. 0 means -reject all orientations, 1 means accept all orientations.} - -\item{ori_consensus_ratio}{[a floating point between 0 and 1] The threshold -when deducing the type of an consensus edge tip (head/tail) from the average -probability of orientation. For a given edge tip, denote by p the probability -of it being a head, the orientation is accepted if (1 - p) / p < -ori_consensus_ratio. 0 means reject all orientations, 1 means accept all -orientations.} - -\item{propagation}{[a boolean value] +\item{cplx}{[a string, optional, "nml" by default, possible values: +"nml", "mdl"] + +In practice, the finite size of the input dataset requires that +the 2-point and 3-point information measures should be \emph{shifted} +by a \emph{complexity} term. The finite size corrections can be based on +the Minimal Description Length (MDL) criterion. 
+However, the MDL complexity criterion tends to underestimate the +relevance of edges connecting variables with many different categories, +leading to the removal of false negative edges. To avoid such biases +with finite datasets, the (universal) Normalized Maximum Likelihood (NML) +criterion can be used (see Affeldt \emph{et al.}, UAI 2015).} + +\item{orientation}{[a boolean value, optional, TRUE by default] + +The miic network skeleton can be partially directed by orienting +edge directions, based on the sign and magnitude of the conditional +3-point information of unshielded triples and, in temporal mode, using time. +If set to FALSE, the orientation step is not performed.} + +\item{ori_proba_ratio}{[a floating point between 0 and 1, optional, +1 by default] + +The threshold when deducing the type of an edge tip (head/tail) +from the probability of orientation. +For a given edge tip, denote by p the probability of it being a head, +the orientation is accepted if (1 - p) / p < \emph{ori_proba_ratio}. +0 means reject all orientations, 1 means accept all orientations.} + +\item{ori_consensus_ratio}{[a floating point between 0 and 1, optional, +NULL by default] + +The threshold when deducing the type of a consensus edge tip (head/tail) +from the average probability of orientation. +For a given edge tip, denote by p the probability of it being a head, +the orientation is accepted if (1 - p) / p < \emph{ori_consensus_ratio}. +0 means reject all orientations, 1 means accept all orientations. +If not supplied, the \emph{ori_consensus_ratio} will be initialized with +the \emph{ori_proba_ratio} value.} + +\item{propagation}{[a boolean value, optional, FALSE by default] + If set to FALSE, the skeleton is partially oriented with only the v-structure orientations. 
Otherwise, the v-structure orientations are propagated to downstream undirected edges in unshielded triples following -the orientation method} - -\item{latent}{[a string; \emph{c("orientation", "no", "yes")}] -When set to "yes", the network reconstruction is taking into account hidden (latent) -variables. When set to "orientation", latent variables are not considered during the skeleton -reconstruction but allows bi-directed edges during the orientation. Dependence -between two observed variables due to a latent variable is indicated with a '6' in -the adjacency matrix and in the network edges.summary and by a bi-directed edge -in the (partially) oriented graph.} - -\item{n_eff}{[a positive integer] -The n samples given in the \emph{input_data} data frame are expected -to be independent. In case of correlated samples such as in time series or +the propagation procedure, relying on probabilities (for more details, +see Verny \emph{et al.}, PLoS Comp. Bio. 2017).} + +\item{latent}{[a string, optional, "orientation" by default, possible +values: "orientation", "no", "yes"] + +When set to "yes", the network reconstruction is taking into account hidden +(latent) variables. When set to "orientation", latent variables are not +considered during the skeleton reconstruction but allows bi-directed edges +during the orientation. +Dependence between two observed variables due to a latent variable is +indicated with a '6' in the adjacency matrix and in the network +edges.summary and by a bi-directed edge in the (partially) oriented graph.} + +\item{n_eff}{[a positive integer, optional, -1 by default] + +In standard mode, the n samples given in the \emph{input_data} data frame are +expected to be independent. In case of correlated samples such as in Monte Carlo sampling approaches, the effective number of independent samples \emph{n_eff} can be estimated using the decay of the autocorrelation function -(Verny \emph{et al.}, PLoS Comp. Bio. 2017). 
This \emph{effective} number \emph{n_eff} -of \emph{independent} samples can be provided using this parameter.} - -\item{n_shuffles}{[a positive integer] The number of shufflings of -the original dataset in order to evaluate the edge specific confidence -ratio of all inferred edges. Default is 0: no confidence cut. If the -number of shufflings is set to an integer > 0, the confidence threshold -must also be > 0 (i.e: n_shuffles=100 and conf_threshold=0.01).} - -\item{conf_threshold}{[a positive floating point] The threshold used -to filter the less probable edges following the skeleton step. See Verny -\emph{et al.}, PLoS Comp. Bio. 2017. Default is 0: no confidence cut. If the -the confidence threshold is set > 0, the number of shufflings must also -be defined > 0 (i.e: n_shuffles=100 and conf_threshold=0.01).} - -\item{sample_weights}{[a numeric vector] -An optional vector containing the weight of each observation. NULL by default. -If defined, it must be a vector of floats in the range [0,1] of size equal -to the number of samples.} - -\item{test_mar}{[a boolean value] -If set to TRUE, distributions with missing values will be tested with Kullback-Leibler -divergence : conditioning variables for the given link \eqn{X\rightarrow Y}\eqn{Z} will be -considered only if the divergence between the full distribution and the non-missing -distribution \eqn{KL(P(X,Y) | P(X,Y)_{!NA})} is low enough (with \eqn{P(X,Y)_{!NA}} as -the joint distribution of \eqn{X} and \eqn{Y} on samples which are not missing on Z. +(Verny et al., PLoS Comp. Bio. 2017). This effective number \emph{n_eff} +of independent samples can be provided using this parameter.} + +\item{n_shuffles}{[a positive integer, optional, 0 by default] + +The number of shufflings of the original dataset in order to evaluate +the edge specific confidence ratio of all inferred edges. +Default is 0: no confidence cut is applied. 
If the number of shufflings +is set to an integer > 0, the confidence threshold must also be > 0 +(e.g. \emph{n_shuffles} = 100 and \emph{conf_threshold} = 0.01).} + +\item{conf_threshold}{[a positive floating point, optional, 0 by default] + +The threshold used to filter the less probable edges following the skeleton +step. See Verny \emph{et al.}, PLoS Comp. Bio. 2017. +Default is 0: no confidence cut is applied. If the confidence threshold +is set > 0, the number of shufflings must also be > 0 +(e.g. \emph{n_shuffles} = 100 and \emph{conf_threshold} = 0.01).} + +\item{sample_weights}{[a numeric vector, optional, NULL by default] + +A vector containing the weight of each observation. If defined, it must be +a vector of floats in the range [0,1] of size equal to the number of samples.} + +\item{test_mar}{[a boolean value, optional, TRUE by default] + +If set to TRUE, distributions with missing values will be tested with +Kullback-Leibler divergence: conditioning variables for the given link +\eqn{X - Y}, \eqn{Z} will be considered only if the divergence +between the full distribution and the non-missing distribution +\eqn{KL(P(X,Y) | P(X,Y)_{!NA})} is low enough (with \eqn{P(X,Y)_{!NA}} as +the joint distribution of \eqn{X} and \eqn{Y} on samples which are +not missing on Z. +This is a way to ensure that data are missing at random for the considered -interaction and to avoid selection bias. Set to TRUE by default} - -\item{consistent}{[a string; \emph{c("no", "orientation", "skeleton")}] -if "orientation": iterate over skeleton and orientation steps to ensure -consistency of the network; -if "skeleton": iterate over skeleton step to get a consistent skeleton, then -orient edges and discard inconsistent orientations to ensure consistency of -the network. 
See (Li \emph{et al.}, NeurIPS 2019) for details.} - -\item{max_iteration}{[a positive integer] When the \emph{consistent} parameter -is set to "skeleton" or "orientation", the maximum number of iterations -allowed when trying to find a consistent graph. Set to 100 by default.} - -\item{consensus_threshold}{[a floating point between 0.5 and 1.0] When the -\emph{consistent} parameter is set to "skeleton" or "orientation", and when -the result graph is inconsistent, or is a union of more than one inconsistent -graphs, a consensus graph will be produced based on a pool of graphs. If the -result graph is inconsistent, then the pool is made of [max_iteration] graphs -from the iterations, otherwise it is made of those graphs in the union. In -the consensus graph, an edge is present when the proportion of non-zero +interaction and to avoid selection bias.} + +\item{consistent}{[a string, optional, "no" by default, possible values: +"no", "orientation", "skeleton"] + +If set to "orientation": iterate over skeleton and orientation steps to +ensure consistency of the network. +If set to "skeleton": iterate over skeleton step to get a consistent skeleton, +then orient edges and discard inconsistent orientations to ensure consistency +of the network (see Li \emph{et al.}, NeurIPS 2019 for details).} + +\item{max_iteration}{[a positive integer, optional, 100 by default] + +When the \emph{consistent} parameter is set to "skeleton" or "orientation", +the maximum number of iterations allowed when trying to find a consistent +graph.} + +\item{consensus_threshold}{[a floating point between 0.5 and 1.0, optional, +0.8 by default] + +When the \emph{consistent} parameter is set to "skeleton" or "orientation" +and when the result graph is inconsistent or is a union of more than +one inconsistent graphs, a consensus graph will be produced based on +a pool of graphs. 
+If the result graph is inconsistent, then the pool is made of +\emph{max_iteration} graphs from the iterations, otherwise it is made of +those graphs in the union. +In the consensus graph, an edge is present when the proportion of non-zero status in the pool is above the threshold. For example, if the pool contains [A, B, B, 0, 0], where "A", "B" are different status of the edge and "0" -indicates the absence of the edge. Then the edge is set to connected ("1") if -the proportion of non-zero status (0.6 in the example) is equal to or higher -than [consensus_threshold]. (When set to connected, the orientation of the -edge will be further determined by the average probability of orientation.) -Set to 0.8 by default.} - -\item{negative_info}{[a boolean value] For test purpose only. FALSE by -default. If TRUE, negative shifted mutual information is allowed during the -computation when mutual information is inferior to the complexity term. For -small dateset with complicated structures, e.g., discrete variables with many -levels, allowing for negative shifted mutual information may help identifying -weak v-structures related to those discrete variables, as the negative -three-point information in those cases will come from the difference between -two negative shifted mutual information terms (expected to be negative due to -the small sample size). However, under this setting, a v-structure (X -> Z <- -Y) in the final graph does not necessarily imply that X is dependent on Y -conditioning on Z, As a consequence, the interpretability of the final graph +indicates the absence of the edge. Then the edge is set to connected ("1") +if the proportion of non-zero status (0.6 in the example) is equal to +or higher than \emph{consensus_threshold}. (When set to connected, +the orientation of the edge will be further determined by the average +probability of orientation.)} + +\item{negative_info}{[a boolean value, optional, FALSE by default] + +For test purpose only. 
If TRUE, negative shifted mutual information is +allowed during the computation when mutual information is inferior to the +complexity term. +For small datasets with complicated structures, e.g. discrete variables with +many levels, allowing for negative shifted mutual information may help +identifying weak v-structures related to those discrete variables, +as the negative three-point information in those cases will come from +the difference between two negative shifted mutual information terms +(expected to be negative due to the small sample size). +However, under this setting, a v-structure (X -> Z <- Y) in the final graph +does not necessarily imply that X is dependent on Y conditioning on Z. +As a consequence, the interpretability of the final graph is hindered. In practice, it's advised to keep this parameter as FALSE.} -\item{verbose}{[a boolean value] If TRUE, debugging output is printed.} +\item{mode}{[a string, optional, "S" by default, possible values are +"S": Standard (IID samples) or "TS": Temporal Stationary] + +When temporal mode is activated, the time information must be provided +in the first column of \emph{input_data}. For more details about temporal +stationary mode, see Simon \emph{et al.}, eLife reviewed preprint 2024.} + +\item{n_layers}{[an integer, optional, NULL by default, must be >= 2 +if supplied] + +Used only in temporal mode, \emph{n_layers} defines the number of layers +that will be considered for the variables in the time unfolded graph. +The layers will be distant of \emph{delta_t} time steps. +If not supplied, the number of layers is estimated from the dynamic of the +dataset and the maximum number of nodes \emph{max_nodes} allowed in the +final lagged graph.} + +\item{delta_t}{[an integer, optional, NULL by default, must be >= 1 +if supplied] + +Used only in temporal mode, \emph{delta_t} defines the number of time steps +between each layer. +i.e. 
on 1000 time steps with \emph{n_layers} = 3 and \emph{delta_t} = 7, +the time steps kept for the samples conversion will be 1, 8, 15 +for the first sample, the next sample will use 2, 9, 16 and so on. +If not supplied, the number of time steps between layers is estimated +from the dynamic of the dataset and the number of layers.} + +\item{movavg}{[an integer, optional, NULL by default, must be >= 2 +if supplied] + +Used only in temporal mode. When supplied, a moving average operation is +applied to all integer and numeric variables that are not contextual +variables.} + +\item{keep_max_data}{[a boolean value, optional, FALSE by default] + +Used only in temporal mode. If TRUE, rows where some NAs have been +introduced during the moving averages and lagging will be kept +whilst they will be dropped if FALSE.} + +\item{max_nodes}{[an integer, optional, 50 by default] + +Used only in temporal mode and if the \emph{n_layers} or \emph{delta_t} +parameters are not supplied. \emph{max_nodes} is used as the maximum number +of nodes in the final graph to compute \emph{n_layers} and/or \emph{delta_t}. +The default is 50 to produce quick runs and can be increased up to 200 +or 300 on recent computers to produce more precise results.} + +\item{verbose}{[a boolean value, optional, FALSE by default] + +If TRUE, debugging output is printed.} } \value{ A \emph{miic-like} object that contains: \itemize{ - \item{all.edges.summary:}{ a data frame with information about the relationship between + \item{\emph{all.edges.summary:} a data frame with information about the relationship between each pair of variables \itemize{ - \item \emph{x:} X node - \item \emph{y:} Y node - \item \emph{type:} contains 'N' if the edge has - been removed or 'P' for retained edges. If a true edges file is given, - 'P' becomes 'TP' (True Positive) or 'FP' (False Positive), while - 'N' becomes 'TN' (True Negative) or 'FN' (False Negative). 
- \item \emph{ai:} the contributing nodes found by the method which participate in - the mutual information between \emph{x} and \emph{y}, and possibly separate them. - \item \emph{info:} provides the pairwise mutual information times \emph{Nxyi} for - the pair (\emph{x}, \emph{y}). - \item \emph{info_cond:} provides the conditional mutual information times \emph{Nxy_ai} for - the pair (\emph{x}, \emph{y}) when conditioned on the collected nodes \emph{ai}. It is - equal to the \emph{info} column when \emph{ai} is an empty set. - \item \emph{cplx:} gives the computed complexity between the (\emph{x}, \emph{y}) - variables taking into account the contributing nodes \emph{ai}. Edges that have - have more conditional information \emph{info_cond} than \emph{cplx} are retained in the - final graph. - \item \emph{Nxy_ai:} gives the number of complete samples on which the information and - the complexity have been computed. If the input dataset has no missing value, the - number of samples is the same for all pairs and corresponds to the total - number of samples. - \item \emph{info_shifted:} represents the \emph{info} - \emph{cplx} value. - It is a way to quantify the strength of the edge (\emph{x}, \emph{y}). - \item \emph{confidenceRatio:} this column is present if the confidence cut - is > 0 and it represents the ratio between the probability to reject + \item{ \emph{x:} X node} + \item{ \emph{y:} Y node} + \item{ \emph{type:} contains 'N' if the edge has been removed or 'P' for + retained edges. If the true graph is supplied in the \emph{true_edges} + parameter, 'P' becomes 'TP' (True Positive) or 'FP' (False Positive), + while 'N' becomes 'TN' (True Negative) or 'FN' (False Negative). 
+ Note that, as the \emph{all.edges.summary} does not contain all the + negative edges, edges not present are 'TN'.} + \item{ \emph{ai:} the contributing nodes found by the method which + participate in the mutual information between \emph{x} and \emph{y}, + and possibly separate them.} + \item{ \emph{raw_contributions:} describes the share of total mutual + information between \emph{x} and \emph{y} explained by each contributor.} + \item{ \emph{contributions:} describes the share of remaining mutual + information between \emph{x} and \emph{y} explained by each successive + contributor.} + \item{ \emph{info:} provides the pairwise mutual information times + \emph{Nxyi} for the pair (\emph{x}, \emph{y}).} + \item{ \emph{info_cond:} provides the conditional mutual information + times \emph{Nxy_ai} for the pair (\emph{x}, \emph{y}) when conditioned + on the collected nodes \emph{ai}. It is + equal to the \emph{info} column when \emph{ai} is an empty set.} + \item{ \emph{cplx:} gives the computed complexity between the (\emph{x}, + \emph{y}) variables taking into account the contributing nodes \emph{ai}. + Edges that have more conditional information \emph{info_cond} + than \emph{cplx} are retained in the final graph.} + \item{ \emph{Nxy_ai:} gives the number of complete samples on which the + information and the complexity have been computed. If the input dataset + has no missing value, the number of samples is the same for all pairs + and corresponds to the total number of samples.} + \item{ \emph{info_shifted:} represents the \emph{info} - \emph{cplx} value. + It is a way to quantify the strength of the edge (\emph{x}, \emph{y}).} + \item{ \emph{infOrt:} the orientation of the edge (\emph{x}, \emph{y}). 
+ It is the same value as in the adjacency matrix at row \emph{x} and + column \emph{y} : 1 for unoriented, 2 for an edge from X to Y, + -2 from Y to X and 6 for bidirectional.} + \item{ \emph{trueOrt:} the orientation of the edge (\emph{x}, \emph{y}) + present in the true edges are provided.} + \item{ \emph{isOrtOk:} information about the consistency of the inferred + graph’s orientations with a reference graph is given (if true edges + are provided). + 'Y': the orientation is consistent; 'N': the orientation is not consistent + with the PAG (Partial Ancestor Graph) derived from the given true graph.} + \item{ \emph{sign:} the sign of the partial correlation between variables + \emph{x} and \emph{y}, conditioned on the contributing nodes \emph{ai}.} + \item{ \emph{partial_correlation:} value of the partial correlation for the + edge (\emph{x}, \emph{y}) conditioned on the contributing nodes \emph{ai}.} + \item{ \emph{is_causal:} details about the nature of the arrow tip for a + directed edge. A directed edge in a causal graph does not necessarily imply + causation but it does imply that the cause-effect relationship is not the + other way around. An arrow-tip which is itself downstream of another + directed edge suggests stronger causal sense and is marked by a 'Y', + or 'N' otherwise.} + \item{ \emph{proba:} probabilities for the inferred orientation, derived + from the three-point mutual information (cf Affeldt & Isambert, UAI 2015 + proceedings) and noted as p(x->y);p(x<-y).} + \item{ \emph{confidence:} this column is computed when the confidence cut + is activated. It represents the ratio between the probability to reject the edge (\emph{x}, \emph{y}) in the dataset versus the mean probability - to do the same in multiple (user defined) number of randomized datasets. - \item \emph{infOrt:} the orientation of the edge (\emph{x}, \emph{y}). 
It is - the same value as in the adjacency matrix at row \emph{x} and column \emph{y} : 1 for - unoriented, 2 for an edge from X to Y, -2 from Y to X and 6 for bidirectional. - \item \emph{trueOrt:} the orientation of the edge (\emph{x}, \emph{y}) present - in the true edges file if provided. - \item \emph{isOrtOk:} information about the consistency of the inferred graph’s - orientations with a reference graph is given (i.e. if true edges file is provided). - Y: the orientation is consistent; N: the orientation is not consistent with - the PAG (Partial Ancestor Graph) derived from the given true graph. - \item \emph{sign:} the sign of the partial correlation between variables - \emph{x} and \emph{y}, conditioned on the contributing nodes \emph{ai}. - \item \emph{partial_correlation:} value of the partial correlation for the - edge (\emph{x}, \emph{y}) conditioned on the contributing nodes \emph{ai}. - \item \emph{isCausal:} details about the nature of the arrow tip for a directed - edge. A directed edge in a causal graph does not necessarily imply causation but it - does imply that the cause-effect relationship is not the other way around. An arrow-tip - which is itself downstream of another directed edge suggests stronger causal sense and is - marked by a 'Y', or 'N' otherwise. - \item \emph{proba:} probabilities for the inferred orientation, derived from the three-point - mutual information (cf Affeldt & Isambert, UAI 2015 proceedings) and noted as p(x->y);p(x<-y). 
+ to do the same in multiple (user defined) number of randomized datasets.} } } - \item{orientations.prob:} {this data frame lists the orientation probabilities of the two edges of all unshielded triples - of the reconstructed network with the structure: node1 -- mid-node -- node2:} + \item{\emph{orientations.prob:} this data frame lists the orientation + probabilities of the two edges of all unshielded triples of the + reconstructed network with the structure: node1 -- mid-node -- node2: \itemize{ - \item node1: node at the end of the unshielded triplet - \item p1: probability of the arrowhead node1 <- mid-node - \item p2: probability of the arrowhead node1 -> mid-node - \item mid-node: node at the center of the unshielded triplet - \item p3: probability of the arrowhead mid-node <- node2 - \item p4: probability of the arrowhead mid-node -> node2 - \item node2: node at the end of the unshielded triplet - \item NI3: 3 point (conditional) mutual information * N + \item{ \emph{node1:} node at the end of the unshielded triplet} + \item{ \emph{p1:} probability of the arrowhead node1 <- mid-node} + \item{ \emph{p2:} probability of the arrowhead node1 -> mid-node} + \item{ \emph{mid-node:} node at the center of the unshielded triplet} + \item{ \emph{p3:} probability of the arrowhead mid-node <- node2} + \item{ \emph{p4:} probability of the arrowhead mid-node -> node2} + \item{ \emph{node2:} node at the end of the unshielded triplet} + \item{ \emph{NI3:} 3 point (conditional) mutual information * N} + \item{ \emph{Conflict:} indicates if there a conflict between the + computed probabilities and the NI3 value} + } } - \item {adj_matrix:} the adjacency matrix is a square matrix used to represent - the inferred graph. The entries of the matrix indicate whether pairs of - vertices are adjacent or not in the graph. The matrix can be read as a - (row, column) set of couples where the row represents the source node and - the column the target node. 
Since miic can reconstruct mixed networks + \item {\emph{adj_matrix:} the adjacency matrix is a square matrix used to + represent the inferred graph. The entries of the matrix indicate whether + pairs of vertices are adjacent or not in the graph. The matrix can be read + as a (row, column) set of couples where the row represents the source node + and the column the target node. Since miic can reconstruct mixed networks (including directed, undirected and bidirected edges), we will have a different digit for each case: \itemize{ - \item 1: (\emph{x}, \emph{y}) edge is undirected - \item 2: (\emph{x}, \emph{y}) edge is directed as \emph{x} -> \emph{y} - \item -2: (\emph{x}, \emph{y}) edge is directed as \emph{x} <- \emph{y} - \item 6: (\emph{x}, \emph{y}) edge is bidirected + \item{ 1: (\emph{x}, \emph{y}) edge is undirected} + \item{ 2: (\emph{x}, \emph{y}) edge is directed as \emph{x} -> \emph{y} } + \item{ -2: (\emph{x}, \emph{y}) edge is directed as \emph{x} <- \emph{y} } + \item{ 6: (\emph{x}, \emph{y}) edge is bidirected} + } + } + + \item {\emph{proba_adj_matrix:} the probability adjacency matrix is + a square matrix used to represent the orientation probabilities associated + to the edges tips. The value at ("row", "column") is the probability, + for the edge between "row" and "column" nodes, of the edge tip on the "row" + side. A probability less than 0.5 is an indication of a possible tail + (cause) and a probability greater than 0.5 a possible head (effect). } - \item {params:} the list of parameters used for the network reconstruction. - The parameters not supplied are initialized to their default values. - Otherwise, the parameters are checked and corrected if necessary. + \item {\emph{params:} the list of parameters used for the network + reconstruction. The parameters not supplied are initialized to their default + values. 
Otherwise, the parameters are checked and corrected if necessary.} - \item {state_order:} the state order used for the network reconstruction. - If no state order is supplied, it is generated by using default values. - Otherwise, it is the state order checked and corrected if necessary. + \item {\emph{state_order:} the state order used for the network + reconstruction. If no state order is supplied, it is generated by using + default values. Otherwise, it is the state order checked and corrected + if necessary.} - \item {black_box:} present only if a black box has been supplied: + \item {\emph{black_box:} present only if a black box has been supplied, the black box, checked and corrected if necessary, used for the network -reconstruction. +reconstruction.} - \item {true_edges:} present only if the true edges have been supplied, + \item {\emph{true_edges:} present only if the true edges have been supplied, the true edges, checked and corrected if necessary, used for the network -evaluation. +evaluation.} + + \item {\emph{tmiic:} present only in temporal mode. + Named list containing the full list of edges completed by stationarity, + the lagged state order and, if a black box or true edges have been supplied, + the lagged versions of these inputs.} } } \description{ @@ -303,12 +490,21 @@ from indirect effects amongst correlated variables, including cause-effect relationships and the effect of unobserved latent causes. } \details{ -Starting from a complete graph, the method iteratively removes +In standard mode, starting from a complete graph, the method iteratively removes dispensable edges, by uncovering significant information contributions from indirect paths, and assesses edge-specific confidences from randomization of available data. The remaining edges are then oriented based on the signature of causality in observational data. 
+In temporal mode, miic reorganizes the dataset using the \emph{n_layers} and +\emph{delta_t} parameters to transform the time steps into lagged samples. +As starting point, a lagged graph is created with only edges having at +least one node laying on the last time step. +Then, miic standard algorithm is applied to remove dispensable edges. +The remaining edges are then oriented by using the temporality and the +signature of causality in observational data. The use of temporal mode +is exposed in Simon \emph{et al.}, eLife reviewed preprint 2024. + The method relies on an information theoretic based (conditional) independence test which is described in (Verny \emph{et al.}, PLoS Comp. Bio. 2017), (Cabeli \emph{et al.}, PLoS Comp. Bio. 2020). It deals with both categorical @@ -379,14 +575,37 @@ if(require(igraph)) { # write graph to graphml format. Note that to correctly visualize # the network we created the miic style for Cytoscape (http://www.cytoscape.org/). miic.write.network.cytoscape(g = miic.res, file = file.path(tempdir(), "temp")) + +# EXAMPLE COVID CASES (time series demo) +data(covidCases) +# execute MIIC (reconstruct graph in temporal mode) +tmiic.res <- miic(input_data = covidCases, mode = "TS", n_layers = 3, delta_t = 1, movavg = 14) + +# to plot the default graph (compact) +if(require(igraph)) { + plot(tmiic.res) +} + +# to plot the raw temporal network Using igraph +if(require(igraph)) { + plot(tmiic.res, display="raw") +} + +# to plot the full temporal network Using igraph +if(require(igraph)) { + plot(tmiic.res, display="lagged") +} + } } \references{ \itemize{ -\item Verny et al., \emph{PLoS Comp. Bio. 2017.} https://doi.org/10.1371/journal.pcbi.1005662 -\item Cabeli et al., \emph{PLoS Comp. Bio. 
2020.} https://doi.org/10.1371/journal.pcbi.1007866 -\item Li et al., \emph{NeurIPS 2019} http://papers.nips.cc/paper/9573-constraint-based-causal-structure-learning-with-consistent-separating-sets.pdf +\item{Simon et al., \emph{eLife reviewed preprint} https://www.biorxiv.org/content/10.1101/2024.02.06.579177v1.abstract } +\item{Ribeiro-Dantas et al., \emph{iScience 2024} https://arxiv.org/abs/2303.06423 } +\item{Cabeli et al., \emph{PLoS Comp. Bio. 2020.} https://doi.org/10.1371/journal.pcbi.1007866 } +\item{Li et al., \emph{NeurIPS 2019} http://papers.nips.cc/paper/9573-constraint-based-causal-structure-learning-with-consistent-separating-sets.pdf } +\item{Verny et al., \emph{PLoS Comp. Bio. 2017.} https://doi.org/10.1371/journal.pcbi.1005662 } } } \seealso{ diff --git a/man/plot.tmiic.Rd b/man/plot.tmiic.Rd new file mode 100644 index 00000000..3943a875 --- /dev/null +++ b/man/plot.tmiic.Rd @@ -0,0 +1,145 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/tmiic.plot.R +\name{plot.tmiic} +\alias{plot.tmiic} +\title{Basic plot function of a temporal miic (tmiic) network inference result} +\usage{ +\method{plot}{tmiic}( + x, + display = "compact", + show_self_loops = TRUE, + positioning_for_grid = "greedy", + orientation_for_grid = "L", + method = "igraph", + pcor_palette = NULL, + ... +) +} +\arguments{ +\item{x}{[a tmiic graph object] +The graph object returned by \code{\link{miic}} in temporal mode} + +\item{display}{[a string]. Optional, default value "compact". +Possible values are \emph{"raw"}, \emph{"lagged"}, \emph{"compact"}, +\emph{"combine"}, \emph{"unique"}, \emph{"drop"}: +\itemize{ +\item When \emph{display} = \emph{"raw"}, the plot function will + use the tmiic graph object as it, leading to the display of a lagged + graph. Unless a specific layout is specified, nodes will be positioned + on a grid. 
+\item When \emph{display} = \emph{"lagged"}, the function will + repeat the edges over history assuming stationarity and plot a lagged + graph. Unless a specific layout is specified, nodes will be positioned + on a grid. +\item When \emph{display} = \emph{"compact"}, the default, nodes + and edges are converted into a flattened version to produce a compact + view of the temporal network whilst still presenting all the information + in the plotting.\cr + i.e.: X_lag1->Y_lag0, X_lag2<-Y_lag0 become respectively X->Y lag=1, + X<-Y lag=2. +\item When \emph{display} = \emph{"combine"}, prior to the plotting, + a preprocessing will be applied to keep only one edge + per couple of nodes. The info_shifted will be the highest one + of the summarized edges whilst the lag and orientation of the + summarized edge will be an aggregation.\cr + i.e.: X_lag1->Y_lag0, X_lag2<-Y_lag0 will become X<->Y lag=1,2 with + the info_shifted of X_lag1->Y_lag0 if info_shifted of + X_lag1->Y_lag0 > X_lag2<-Y_lag0. +\item When \emph{display} = \emph{"unique"}, prior to the plotting, + a preprocessing will be applied to keep only the edges having the + highest info_shifted for a couple of nodes. + If several edges between the same nodes have the same + info_shifted, then the edge kept is the one with the minimum lag.\cr + i.e.: X_lag1->Y_lag0, X_lag2<-Y_lag0 with info_shifted of + X_lag1->Y_lag0 > X_lag2<-Y_lag0 become X->Y lag=1. +\item When \emph{display} = \emph{"drop"}, prior to the plotting, + a preprocessing will be applied to keep only the edges having the + highest info_shifted for a couple of nodes. + If several edges between the same nodes have the same + info_shifted, then the edge kept is the one with the minimum lag.\cr + i.e. : X_lag1->Y_lag0, X_lag2<-Y_lag0 with info_shifted of + X_lag1->Y_lag0 > X_lag2<-Y_lag0 become X->Y. + The lag information is dropped during the preprocessing and + will not be displayed on the final plotting. 
+}} + +\item{show_self_loops}{[a boolean] Optional, TRUE by default. +When TRUE, the edges like X_lag0-X_lag1 are included in the iGraph object. +When FALSE, only edges having different nodes are present in the iGraph +object.} + +\item{positioning_for_grid}{[a string] Optional, "greedy" by default. +Used only when the display is "raw" or "lagged" and no layout is supplied. +Possible values are \emph{"none"}, \emph{"alphabetical"}, \emph{"layers"} +\emph{"greedy"} and \emph{"sugiyama"} +\itemize{ +\item When \emph{positioning_for_grid} = \emph{"none"} + The nodes are positioned as they appear in the miic result +\item When \emph{positioning_for_grid} = \emph{"alphabetical"} + The nodes are positioned alphabetically in ascending order +\item When \emph{positioning_for_grid} = \emph{"layers"} + The nodes with the fewest lags will be placed on the exteriors + while the nodes having the most lags are in the center +\item When \emph{positioning_for_grid} = \emph{"greedy"} + A greedy algorithm will be used to place the nodes in a way minimizing + the crossing edges +\item When \emph{positioning_for_grid} = \emph{"sugiyama"} + The sugiyama algorithm will be used to place the nodes in a way + minimizing the crossing edges +}} + +\item{orientation_for_grid}{[a string] Optional, "L" by default. +Used only when the display is "raw" or "lagged" and no layout is supplied. +Indicates the orientation of the draw, possible values are landscape: "L" +or portrait: "P".} + +\item{method}{A string representing the plotting method. Defaults to "igraph". +Currently only "igraph" is supported.} + +\item{pcor_palette}{Optional. The color palette used to represent the partial +correlations (the color of the edges). See \code{\link{getIgraph}} for details.} + +\item{\dots}{Additional plotting parameters. See the corresponding plot function +for the complete list. 
+ +For igraph, see \code{\link[igraph]{igraph.plotting}}.} +} +\description{ +This function calls \code{\link{tmiic.export}} to build a +plottable object from the result returned by \code{\link{miic}} in +temporal mode and plot it. +} +\details{ +See the documentation of \code{\link{tmiic.export}} for further +details. +} +\examples{ +\donttest{ +library(miic) + +#' # EXAMPLE COVID CASES (time series demo) +data(covidCases) +# execute MIIC (reconstruct graph in temporal mode) +tmiic_res <- miic(input_data = covidCases, mode = "TS", n_layers = 3, delta_t = 1, movavg = 14) + +# to plot the default compact graph +if(require(igraph)) { + plot(tmiic_res) +} + +# to plot the raw temporal network Using igraph +if(require(igraph)) { + plot(tmiic_res, display="raw") +} + +# to plot the full temporal network Using igraph +if(require(igraph)) { + plot(tmiic_res, display="lagged") +} + +} +} +\seealso{ +\code{\link{tmiic.export}} for generic exports, +\code{\link[igraph]{igraph.plotting}} +} diff --git a/man/tmiic.export.Rd b/man/tmiic.export.Rd new file mode 100644 index 00000000..5872415d --- /dev/null +++ b/man/tmiic.export.Rd @@ -0,0 +1,115 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/tmiic.plot.R +\name{tmiic.export} +\alias{tmiic.export} +\title{Export temporal miic (tmiic) result to different plotting methods} +\usage{ +tmiic.export( + tmiic_res, + display = "compact", + show_self_loops = TRUE, + method = "igraph", + pcor_palette = NULL +) +} +\arguments{ +\item{tmiic_res}{[a tmiic object] +The object returned by the \code{\link{miic}} execution in temporal mode.} + +\item{display}{[a string]. Optional, default value "compact". +Possible values are \emph{"raw"}, \emph{"lagged"}, \emph{"compact"}, +\emph{"combine"}, \emph{"unique"}, \emph{"drop"}: +\itemize{ +\item When \emph{display} = \emph{"raw"}, the export function will + use the tmiic graph object as it, leading to the return of a lagged + graph. 
+\item When \emph{display} = \emph{"lagged"}, the export function will + repeat the edges over history assuming stationarity and return a lagged + graph. +\item When \emph{display} = \emph{"compact"}, the default, nodes + and edges are converted into a flattened version to produce a compact + view of the temporal network whilst still presenting all the information + in the export.\cr + i.e.: X_lag1->Y_lag0, X_lag2<-Y_lag0 become respectively X->Y lag=1, + X<-Y lag=2. +\item When \emph{display} = \emph{"combine"}, prior to the export, + a pre-processing will be applied to kept only one edge + per couple of nodes. The info_shifted will be the highest one + of the summarized edges whilst the lag and orientation of the + summarized edge will be an aggregation.\cr + i.e.: X_lag2->Y_lag0, X_lag0<-Y_lag1 will become X<->Y lag=1-2 with + the info_shifted of X_lag2->Y_lag0 if info_shifted of + X_lag2->Y_lag0 > X_lag0<-Y_lag1. +\item When \emph{display} = \emph{"unique"}, prior to the export, + a pre-processing will be applied to kept only the edges having the + highest info_shifted for a couple of nodes. + If several edges between the sames nodes have the same + info_shifted, then the edge kept is the one with the minimum lag.\cr + i.e.: X_lag1->Y_lag0, X_lag0<-Y_lag2 with info_shifted of + X_lag1->Y_lag0 > X_lag0<-Y_lag2 become X->Y lag=1. +\item When \emph{display} = \emph{"drop"}, prior to the export, + a pre-processing will be applied to kept only the edges having the + highest info_shifted for a couple of nodes. + If several edges between the sames nodes have the same + info_shifted, then the edge kept is the one with the minimum lag.\cr + i.e. : X_lag1->Y_lag0, X_lag0<-Y_lag2 with info_shifted of + X_lag1->Y_lag0 > X_lag0<-Y_lag2 become X->Y. + The lag information is dropped during the preprocessing and + will not be exported. +}} + +\item{show_self_loops}{[a boolean] Optional, TRUE by default. +When TRUE, the edges like X_lag0-X_lag1 are exported. 
+When FALSE, only edges having different nodes are exported.} + +\item{method}{A string representing the plotting method. +Currently only "igraph" is supported.} + +\item{pcor_palette}{Optional. The color palette used to represent the partial +correlations (the color of the edges). See \code{\link{getIgraph}} for details.} +} +\value{ +A graph object adapted to the method. +} +\description{ +This function creates an object built from the result returned +by \code{\link{miic}} executed in temporal mode that is ready to be fed to +different plotting methods. +} +\examples{ +\donttest{ +library(miic) +data(covidCases) +# execute MIIC (reconstruct graph in temporal mode) +tmiic_res <- miic(input_data = covidCases, mode = "TS", n_layers = 3, delta_t = 1, movavg = 14) + +# Plot default compact temporal network Using igraph +if(require(igraph)) { +g = tmiic.export(tmiic_res, method="igraph") +plot(g) # Default visualisation, calls igraph::plot.igraph() + +# Plot raw temporal network Using igraph +g = tmiic.export(tmiic_res, display="raw", method="igraph") +plot(g) # Default visualisation, calls igraph::plot.igraph() + +# Plot full temporal network Using igraph +g = tmiic.export(tmiic_res, display="lagged", method="igraph") +plot(g) # Default visualisation, calls igraph::plot.igraph() + +# Specifying layout (see ?igraph::layout_) +l <- layout_on_grid(g, width = 5, height = 3, dim = 2) +plot(g, layout=l) + +# Override some graphical parameters +plot(g, edge.arrow.size = 0.75) +plot(g, vertex.shape="none", edge.color="gray85", vertex.label.color="gray10") + +# For compact graphs, please be aware that the rendering of +# igraph::plot.igraph() is not optimal when the graph contains +# multiple edges between the same nodes. 
+# So the recommend way to plot a compact graph is to use tmiic plotting: +plot(tmiic_res) +} + +} +} diff --git a/src/computation_continuous.cpp b/src/computation_continuous.cpp index 0d011588..010fc9a9 100755 --- a/src/computation_continuous.cpp +++ b/src/computation_continuous.cpp @@ -770,11 +770,13 @@ InfoBlock computeIxyui(const TempGrid2d& data, } int n_test_max = min(min(initbins, 20), n_levels_min); - if (std::pow (n_test_max-1, n_ui) >= INT_MAX) { - n_test_max = std::pow (INT_MAX, 1.0 / n_ui) + 1; - Rcpp::Rcout << "Note: Initial number of bins has been limited to " - << n_test_max-1 << " for " << n_ui << " contributors to avoid overflow\n"; - } + // FRS 4 jan 2024: remove fix to limit number of joint factors + // if (std::pow (n_test_max-1, n_ui) >= INT_MAX) + // { + // n_test_max = std::pow (INT_MAX, 1.0 / n_ui) + 1; + // Rcpp::Rcout << "Note: Initial number of bins has been limited to " + // << n_test_max-1 << " for " << n_ui << " contributors to avoid overflow\n"; + // } TempVector r_temp(3); InfoBlock res_temp; for (int test_n_bins = 2; test_n_bins < n_test_max; ++test_n_bins) { diff --git a/src/environment.h b/src/environment.h index dfe96ed4..f9696311 100644 --- a/src/environment.h +++ b/src/environment.h @@ -17,11 +17,23 @@ using computation::CompCache; struct Environment { int n_samples; int n_nodes; + // Used only in temporal mode, number of nodes at lag0 or not lagged + int n_nodes_not_lagged = -1; + Grid2d data_numeric; Grid2d data_double; // data_numeric_idx(j, i) = index of i'th smallest value in data_double(j, ) Grid2d data_numeric_idx; - + // Store the miic's mode: 0=Standard (IID), 1=Temporal (stationary) + // (use an int to be ready for addition of future modes) + int mode = 0; + // As we foresee to have a stationary and a non stationary temporal version, + // this flag is true if we are in a temporal mode, whatever stationarity + bool temporal = false; + + // Identify if any node is marked as contextual + bool any_contextual = false; + // 
For each node, contains 0 = not a contextual node or 1 = contextual node vector is_contextual; // Identify if any node is marked as consequence bool any_consequence = false; @@ -81,6 +93,26 @@ struct Environment { ExecutionTime exec_time; int n_threads = 1; CompCache cache; + + // Max number of layers (temporal mode only) + // Even if not recommended, the number of layers can be different for each + // variables, layer_max is the maximum number of layers for all variables + int layer_max = -1; + // Number of layers for each variable (temporal mode only) + vector list_n_layers; + // Class for each node (temporal mode only) + vector nodes_class; + // Lag for each node (temporal mode only) + // Note that we consider contextual variables as very old (= INT_MAX) + // so from the time point of view, they are never the consequence + // of another variable + vector nodes_lags; + // Store nodes index shift, giving for each node the same lagged node + // (i.e.: variables: x_lag0, ctr_var, y_lag0, x_lag1, y_lag1 + // => nodes_shifts: 3 , 0 , 2 , 0 , 0) + // (temporal mode only) + vector nodes_shifts; + bool verbose = false; Environment(int n_samples, int n_nodes, vector vec_numeric, diff --git a/src/get_information.cpp b/src/get_information.cpp index 64949bec..d1514878 100644 --- a/src/get_information.cpp +++ b/src/get_information.cpp @@ -301,6 +301,17 @@ void searchForBestContributingNode( zi_list.erase( remove_if(begin(zi_list), end(zi_list), is_isolated), end(zi_list)); } + if (environment.temporal) { + // remove Zi that are the same class as X if Y is contextual, and vice versa + auto contextually_illegal_Z = [&environment, X, Y](int Z) { + return (environment.is_contextual[X] && + (environment.nodes_class[Y] == environment.nodes_class[Z])) || + (environment.is_contextual[Y] && + (environment.nodes_class[X] == environment.nodes_class[Z])); + }; + zi_list.erase( + remove_if(begin(zi_list), end(zi_list), contextually_illegal_Z), end(zi_list)); + } int n_zi = zi_list.size(); 
// zero corresponds to 1/2 as the full score as defined in Verny et al., 2017 diff --git a/src/mutual_information.cpp b/src/mutual_information.cpp index 16e445ca..edaf9de8 100644 --- a/src/mutual_information.cpp +++ b/src/mutual_information.cpp @@ -126,8 +126,9 @@ int fillHashList(const structure::TempGrid2d& data, for (const auto u : ui_list) { r_joint_list[u] = n_levels_product; n_levels_product *= r_list[u]; - if (n_levels_product < 0) - Rcpp::stop ("Maximum number of levels for joint factors exceeded.\nPlease raise an issue on the MIIC github.\n"); + // FRS 4 jan 2024: remove fix to limit number of joint factors + // if (n_levels_product < 0) + // Rcpp::stop ("Maximum number of levels for joint factors exceeded.\nPlease raise an issue on the MIIC github.\n"); } for (int i = 0; i < n_samples; ++i) { diff --git a/src/orientation.cpp b/src/orientation.cpp index 448dcc69..44475c7d 100644 --- a/src/orientation.cpp +++ b/src/orientation.cpp @@ -1,3 +1,4 @@ +#include #include "orientation.h" #ifdef _OPENMP @@ -12,6 +13,7 @@ #include "get_information.h" #include "linear_allocator.h" #include "proba_orientation.h" +#include "tmiic.h" namespace miic { namespace reconstruction { @@ -31,6 +33,8 @@ bool acceptProba(double proba, double ori_proba_ratio) { return (1 - proba) / proba < ori_proba_ratio; } +} // anonymous namespace + // y2x: probability that there is an arrow from node y to x // x2y: probability that there is an arrow from node x to y void updateAdj(Environment& env, int x, int y, double y2x, double x2y) { @@ -43,31 +47,50 @@ void updateAdj(Environment& env, int x, int y, double y2x, double x2y) { } //----------------------------------------------------------------------------- -// completeOrientationUsingConsequence +// completeOrientationUsingPrior //----------------------------------------------------------------------------- -// Description: Complete the orientations using the consequence information -// for edges that were not previously oriented +// 
Description: complete the orientations using the prior knowledge of +// contextual and consequence for edges that were not previously oriented // -// Detail: completeOrientationUsingConsequence will look in the list of connected -// edges the ones that have not been oriented using the unshielded triples and -// have at least one variable tagged as consequence. -// Edges matching these criteria will be oriented using consequence information: -// from the non consequence variable to the consequence variable +// Detail: completeOrientationUsingPrior will look in the list of connected +// edges the ones that have not been oriented using the unshielded triples +// and have at least one variable tagged as contextual or consequence. +// Edges matching these criteria will be updated using prior knowledge: +// - tails on contextual variables will be enforced (edge tip proba = 0) +// - head toward the consequence variable (edge tip proba = 1) // // Params: // - Environment&: the environment structure // - std::vector& : list of unshielded triples //-------------------------------------------------------------------------------- -void completeOrientationUsingConsequence (Environment& environment, - const std::vector& triples) { +void completeOrientationUsingPrior (Environment& environment, + const std::vector& triples) + { + // Base tail probability to use for edges with one node as consequence differs + // if latent variables are authorized or not: + // - 0 if no latent var, we are sure that the not consequence node is the cause + // - 0.5 with latent var as we can not be sure that the not consequence node + // is the cause + // + double tail_proba = 0; + if (environment.latent_orientation) + tail_proba = 0.5; + // + // Loop over edges to find edges that were not considered when orienting with + // open triples but can be oriented using consequence nodes + // const auto& edge_list = environment.connected_list; - for (auto iter0 = begin(edge_list); iter0 != end(edge_list); 
++iter0) { + for (auto iter0 = begin(edge_list); iter0 != end(edge_list); ++iter0) + { int posX = iter0->X, posY = iter0->Y; // - // If the edge has no variable tagged as consequence, nothing to do + // If the edge has no variable tagged as contextual or consequence, + // nothing to do // - if ( (!environment.is_consequence[posX]) && - (!environment.is_consequence[posY]) ) + if ( (!environment.is_contextual[posX]) + && (!environment.is_contextual[posY]) + && (!environment.is_consequence[posX]) + && (!environment.is_consequence[posY]) ) continue; // // If edge is in triple, head/tail probas have already been computed @@ -78,48 +101,91 @@ void completeOrientationUsingConsequence (Environment& environment, if ( ( (triples[i][0] == posX) && (triples[i][1] == posY) ) || ( (triples[i][0] == posY) && (triples[i][1] == posX) ) || ( (triples[i][1] == posX) && (triples[i][2] == posY) ) - || ( (triples[i][1] == posY) && (triples[i][2] == posX) ) ) { + || ( (triples[i][1] == posY) && (triples[i][2] == posX) ) ) + { is_in_triple = true; break; - } + } if (is_in_triple) continue; // - // The edge is not in open triples, has a consequence variable - // => we need to orient it using the consequence information - // edge orientation is other var -> consequence only var + // The edge is not in open triples, has a contextual or consequence variable + // => we need to update the probabilities // if (environment.is_consequence[posY]) - updateAdj(environment, posX, posY, 0, 1); - else - updateAdj(environment, posX, posY, 1, 0); + { + if (environment.is_contextual[posX]) + updateAdj(environment, posX, posY, 0, 1); + else + updateAdj(environment, posX, posY, tail_proba, 1); + continue; + } + if (environment.is_consequence[posX]) + { + if (environment.is_contextual[posY]) + updateAdj(environment, posX, posY, 1, 0); + else + updateAdj(environment, posX, posY, 1, tail_proba); + continue; + } + // + // Case with consequence variable done => at least one of the 2 vars is + // contextual and 
the other one is not consequence + // + if (environment.is_contextual[posX]) + { + if (environment.is_contextual[posY]) + updateAdj(environment, posX, posY, 0, 0); + else + updateAdj(environment, posX, posY, 0, 0.5); + continue; + } + if (environment.is_contextual[posY]) + updateAdj(environment, posX, posY, 0.5, 0); + } } -} - -} // anonymous namespace vector> orientationProbability(Environment& environment) { - // Get all unshielded triples X -- Z -- Y vector triples; + // + // In regular mode or non stationary temporal node, + // get all unshielded triples X -- Z -- Y + // + // In temporal stationary mode, get only unshielded triples X -- Z -- Y having at least + // X or Y on the layer 0 (not lagged or lag0). We apply this filter on triples + // because we do not want to orient past only triples (they are just a consequence + // of repeating the edges over history when latent discovery is activated) + // and triples with only Z on the layer 0 are excluded as they are problematic + // because we have no information on the separating set of the pair X_lagA-Y_lagB, + // leading to a wrong computation of the NI3. 
+ // + int n_nodes_nl = environment.n_nodes_not_lagged; const auto& edge_list = environment.connected_list; for (auto iter0 = begin(edge_list); iter0 != end(edge_list); ++iter0) { int posX = iter0->X, posY = iter0->Y; - for (auto iter1 = iter0 + 1; iter1 != end(edge_list); ++iter1) { int posX1 = iter1->X, posY1 = iter1->Y; - if (posY1 == posX && !environment.edges(posY, posX1).status) + + if ( posY1 == posX && !environment.edges(posY, posX1).status + && (environment.mode != 1 || posY < n_nodes_nl || posX1 < n_nodes_nl) ) triples.emplace_back(Triple{posY, posX, posX1}); - else if (posY1 == posY && !environment.edges(posX, posX1).status) + else if ( posY1 == posY && !environment.edges(posX, posX1).status + && (environment.mode != 1 || posX < n_nodes_nl || posX1 < n_nodes_nl) ) triples.emplace_back(Triple{posX, posY, posX1}); - if (posX1 == posX && !environment.edges(posY, posY1).status) + if ( posX1 == posX && !environment.edges(posY, posY1).status + && (environment.mode != 1 || posY < n_nodes_nl || posY1 < n_nodes_nl) ) triples.emplace_back(Triple{posY, posX, posY1}); - else if (posX1 == posY && !environment.edges(posX, posY1).status) + else if ( posX1 == posY && !environment.edges(posX, posY1).status + && (environment.mode != 1 || posX < n_nodes_nl || posY1 < n_nodes_nl) ) triples.emplace_back(Triple{posX, posY, posY1}); } } + if (triples.empty()) { - if (environment.any_consequence) - completeOrientationUsingConsequence (environment, triples); + if ( (environment.any_contextual) || (environment.any_consequence) ) + completeOrientationUsingPrior (environment, triples); + if (environment.temporal) + tmiic::completeOrientationUsingTime (environment, triples); return vector>(); } @@ -142,7 +208,9 @@ vector> orientationProbability(Environment& environment) { vector probas_list = getOriProbasList(triples, I3_list, environment.is_contextual, environment.is_consequence, environment.latent_orientation, environment.degenerate, - environment.propagation, 
environment.half_v_structure); + environment.propagation, environment.half_v_structure, + environment.temporal, environment.nodes_lags); + // update probas_list for possible inconsistencies class ProbaArrayMap : public std::map, double> { public: @@ -167,6 +235,55 @@ vector> orientationProbability(Environment& environment) { proba_map.insert_or_update({triple[2], triple[1]}, probas[2]); // 2 -> 1 proba_map.insert_or_update({triple[1], triple[2]}, probas[3]); // 1 -> 2 } + // + // In temporal stationary mode, when edges have been duplicated over history + // keep the max proba per group of duplicated edges + // + if ( (environment.mode == 1) && (environment.latent_orientation) ) + for (auto it = proba_map.begin(); it != proba_map.end(); it++) + { + int node1_pos = (it->first).first; + int node2_pos = (it->first).second; + std::vector< std::pair > list_lagged = tmiic::getListLaggedEdges + (environment, node1_pos, node2_pos); + for (auto const& it_lagged : list_lagged) + { + auto proba_lagged_it = proba_map.find (it_lagged); + if ( proba_lagged_it != proba_map.end() ) + { + double proba_lag = proba_lagged_it->second; + string str_warn = ""; + if ( ((it->second - 0.5 < 0) && (proba_lag - 0.5 > 0)) + || ((it->second - 0.5 > 0) && (proba_lag - 0.5 < 0)) ) + { + str_warn = "Warning: Discrepancy when computing orientation of " + + environment.nodes[node1_pos].name + + " - " + environment.nodes[node2_pos].name + + ": proba=" + std::to_string (it->second) + "\n"; + int node1_lagged = (proba_lagged_it->first).first; + int node2_lagged = (proba_lagged_it->first).second; + str_warn += " -> Found conflict with lagged edge " + + environment.nodes[node1_lagged].name + + " - " + environment.nodes[node2_lagged].name + + ": proba=" + std::to_string (proba_lag) + "\n"; + } + if (fabs(it->second - 0.5) < fabs(proba_lag - 0.5)) + { + it->second = proba_lag; + if (str_warn.length() > 0) + str_warn += " -> Probability updated to=" + std::to_string (it->second) + "\n"; + } + else + { + if 
(str_warn.length() > 0) + str_warn += " -> Initial probability kept (no update)\n"; + } + if ( (environment.verbose) && (str_warn.length() > 0) ) + Rcpp::warning (str_warn); + } + } + } + // Update probabilities std::transform(begin(triples), end(triples), begin(probas_list), [&proba_map](const auto& triple) { @@ -186,8 +303,14 @@ vector> orientationProbability(Environment& environment) { // If we have consequence variables, we look if we can orient some extra // edges not part of an open triple by using the consequence information // - if (environment.any_consequence) - completeOrientationUsingConsequence (environment, triples); + if ( (environment.any_contextual) || (environment.any_consequence) ) + completeOrientationUsingPrior (environment, triples); + // + // In temporal mode, we add in adj matrix the orientation of temporal edges + // that were not already oriented (edges was not part of an open triple) + // + if (environment.temporal) + tmiic::completeOrientationUsingTime (environment, triples); // Write output vector> orientations{{"source1", "p1", "p2", "target", "p3", @@ -214,6 +337,7 @@ vector> orientationProbability(Environment& environment) { environment.nodes[triple[2]].name, to_string(I3_list[i]), conflict}); } + return orientations; } diff --git a/src/orientation.h b/src/orientation.h index cf96f035..f4116773 100644 --- a/src/orientation.h +++ b/src/orientation.h @@ -9,6 +9,7 @@ namespace miic { namespace reconstruction { +void updateAdj(structure::Environment&, int, int, double, double); std::vector> orientationProbability( structure::Environment&); diff --git a/src/proba_orientation.cpp b/src/proba_orientation.cpp index d2fc883b..22d68fb0 100644 --- a/src/proba_orientation.cpp +++ b/src/proba_orientation.cpp @@ -114,7 +114,8 @@ void induceScore( vector getOriProbasList(const vector& triples, const vector& I3_list, const vector& is_contextual, const vector& is_consequence, bool latent, bool degenerate, - bool propagation, bool half_v_structure) { + 
bool propagation, bool half_v_structure, + bool temporal, const vector& nodes_lags) { // A score is a quantity almost proportional to abs(I3). All probabilities can // be expressed in the form of 1 / (1 + exp(-score)), and they suffer from // loss of numerical precision for high score due to the exponential term. @@ -129,6 +130,7 @@ vector getOriProbasList(const vector& triples, vector scores(n_triples); // Rank is the highest absolute value of unsettled score for each triple. vector rank(n_triples, 0); + for (int i = 0; i < n_triples; ++i) { // Initialize scores of v-structure if (I3_list[i] < 0) { @@ -148,6 +150,47 @@ vector getOriProbasList(const vector& triples, scores[i][3].value = -scores[i][2].value; } } + // + // In temporal mode, use time for orientation + // + if (temporal) { + // + // Orient edge using time when the edge is lagged. + // The head side is always sure because the time flows from oldest + // to newest node, so the newest node is associated to kScoreMax. + // The tail side will be set only if latent variable is not activated, + // then the oldest node is associated to kScoreLowest. 
+ // (when latent variable is activated, we do nothing because + // we can not assume that the oldest node is a tail: there can be + // an edge going from the latent variable to the oldest node) + // + int nodeX_lag = nodes_lags[ triples[i][0] ]; + int nodeZ_lag = nodes_lags[ triples[i][1] ]; + int nodeY_lag = nodes_lags[ triples[i][2] ]; + + if (nodeX_lag < nodeZ_lag) { + scores[i][0] = ProbaScore{kScoreMax, true}; + if (!latent) + scores[i][1] = ProbaScore{kScoreLowest, true}; + } + else if (nodeX_lag > nodeZ_lag) { + scores[i][1] = ProbaScore{kScoreMax, true}; + if (!latent) + scores[i][0] = ProbaScore{kScoreLowest, true}; + } + + if (nodeZ_lag < nodeY_lag) { + scores[i][2] = ProbaScore{kScoreMax, true}; + if (!latent) + scores[i][3] = ProbaScore{kScoreLowest, true}; + } + else if (nodeZ_lag > nodeY_lag) { + scores[i][3] = ProbaScore{kScoreMax, true}; + if (!latent) + scores[i][2] = ProbaScore{kScoreLowest, true}; + } + } + // Initialize scores of triples involving contextual variables int X = triples[i][0], Z = triples[i][1], Y = triples[i][2]; if (is_contextual[X]) { // X --* Z, X cannot be the child of Z diff --git a/src/proba_orientation.h b/src/proba_orientation.h index 896cbe25..7e544e91 100644 --- a/src/proba_orientation.h +++ b/src/proba_orientation.h @@ -31,7 +31,8 @@ using ScoreArray = std::array; std::vector getOriProbasList(const std::vector&, const std::vector& I3_list, const std::vector& is_contextual, const std::vector& is_consequence, bool latent, bool degenerate, - bool propagation, bool half_v_structure); + bool propagation, bool half_v_structure, + bool temporal, const std::vector& nodes_lags); } // namespace reconstruction } // namespace miic diff --git a/src/r_cpp_interface.cpp b/src/r_cpp_interface.cpp index 710d7c8f..59d3e87d 100644 --- a/src/r_cpp_interface.cpp +++ b/src/r_cpp_interface.cpp @@ -24,6 +24,16 @@ void setEnvironmentFromR(const Rcpp::List& input_data, environment.data_double = Grid2d( n_nodes, n_samples, 
as>(input_data["double"])); + if (arg_list.containsElementNamed("mode")) + { + auto mode_flag = as(arg_list["mode"]); + if (mode_flag.compare("TS") == 0) + { + environment.mode = 1; + environment.temporal = true; + } + } + if (arg_list.containsElementNamed("n_eff")) environment.n_eff = as(arg_list["n_eff"]); if (environment.n_eff < 0 || environment.n_eff > n_samples) @@ -84,6 +94,8 @@ void setEnvironmentFromR(const Rcpp::List& input_data, } else { environment.is_contextual.resize(n_nodes, 0); } + environment.any_contextual = std::any_of (environment.is_contextual.begin(), + environment.is_contextual.end(), [](bool v) { return v; }); // // Variables considered as consequence only // @@ -141,6 +153,82 @@ void setEnvironmentFromR(const Rcpp::List& input_data, omp_set_num_threads(environment.n_threads); #endif + if (environment.temporal) + { + environment.list_n_layers = as> (arg_list["n_layers"]); + environment.layer_max = *std::max_element ( environment.list_n_layers.begin(), + environment.list_n_layers.end() ); + environment.n_nodes_not_lagged = environment.list_n_layers.size(); + // + // Precompute the class of each variable (unique ID for the part before "_lagX") + // + for (int node_idx = 0; node_idx < environment.n_nodes_not_lagged; ++node_idx) + environment.nodes_class.push_back (node_idx); + for (int layer_idx = 2; layer_idx <= environment.layer_max; ++layer_idx) + for (int node_idx = 0; node_idx < environment.n_nodes_not_lagged; ++node_idx) + if (layer_idx <= environment.list_n_layers[node_idx]) + environment.nodes_class.push_back (node_idx); + // + // Precompute the lag of each variable: (layer - 1) * delta_t + // + vector list_delta_t = as> (arg_list["delta_t"]); + environment.nodes_lags.assign (environment.n_nodes_not_lagged, 0); + for (int layer_idx = 2; layer_idx <= environment.layer_max; ++layer_idx) + for (int node_idx = 0; node_idx < environment.n_nodes_not_lagged; ++node_idx) + if (layer_idx <= environment.list_n_layers[node_idx]) + 
environment.nodes_lags.push_back ( (layer_idx - 1) * list_delta_t[node_idx]); + // + // For contextual variables, we consider them as very old, + // so they are never the consequence of another variable + // + for (size_t ctx_idx = 0; ctx_idx < environment.is_contextual.size(); ++ctx_idx) + if (environment.is_contextual[ctx_idx]) + environment.nodes_lags[ctx_idx] = INT_MAX; + // + // Pre-compute the index shifts from a var to its next lagged counterpart + // (i.e.: variables: x_lag0, ctr_var, y_lag0, x_lag1, y_lag1 + // => nodes_shifts: 3 , 0 , 2 , 0 , 0) + // + int n_nodes_shifts = environment.n_nodes_not_lagged; + vector end_reached (environment.n_nodes_not_lagged, false); + for (int layer_idx = 2; layer_idx <= environment.layer_max+1; ++layer_idx) + for (int node_idx = 0; node_idx < environment.n_nodes_not_lagged; ++node_idx) + if (layer_idx <= environment.list_n_layers[node_idx]) + environment.nodes_shifts.push_back (n_nodes_shifts); + else if (!end_reached[node_idx]) { + end_reached[node_idx] = true; + environment.nodes_shifts.push_back (0); + --n_nodes_shifts; + } + // + // In temporal mode, we do not start from a complete graph + // => Remove all edges not having a node on the layer 0 + // + for (int i = environment.n_nodes_not_lagged; i < n_nodes; i++) + for (int j = environment.n_nodes_not_lagged; j < n_nodes; j++) { + environment.edges(i, j).status = 0; + environment.edges(i, j).status_init = 0; + environment.edges(i, j).status_prev = 0; + environment.edges(i, j).proba_head = -1; + } + // + // In addition, non lagged variable (i.e.:contextual) can only have edges + // with nodes of the layer 0 (others can be found by stationarity) + // + for (int i = 0; i < environment.n_nodes_not_lagged; i++) + if (environment.is_contextual[i]) + for (int j = environment.n_nodes_not_lagged; j < n_nodes; j++) { + environment.edges(i, j).status = 0; + environment.edges(i, j).status_init = 0; + environment.edges(i, j).status_prev = 0; + environment.edges(i, j).proba_head = 
-1; + environment.edges(j, i).status = 0; + environment.edges(j, i).status_init = 0; + environment.edges(j, i).status_prev = 0; + environment.edges(j, i).proba_head = -1; + } + } + if (arg_list.containsElementNamed("negative_info")) environment.negative_info = as(arg_list["negative_info"]); diff --git a/src/reconstruct.cpp b/src/reconstruct.cpp index b74ba960..fd1f9a71 100644 --- a/src/reconstruct.cpp +++ b/src/reconstruct.cpp @@ -14,6 +14,7 @@ #include "r_cpp_interface.h" #include "skeleton.h" #include "utilities.h" +#include "tmiic.h" using Rcpp::_; using Rcpp::as; @@ -61,7 +62,15 @@ List reconstruct(List input_data, List arg_list) { int iter_count{0}; bool is_consistent{false}; do { - if (environment.consistent != 0) bcc.analyse(); + if (environment.consistent != 0) { + // In temporal stationary mode, duplicate temporarily edges over history + // for the consistency assessment + if (environment.mode == 1) + tmiic::repeatEdgesOverHistory (environment); + bcc.analyse(); + if (environment.mode == 1) + tmiic::dropPastEdges (environment); + } // Store current status in status_prev and revert to the structure at the // moment of initialization for (int i = 0; i < environment.n_nodes; i++) { @@ -98,7 +107,16 @@ List reconstruct(List input_data, List arg_list) { if (environment.orientation && !environment.connected_list.empty()) { lap_start = getLapStartTime(); Rcout << "Search for edge directions...\n"; + // + // In temporal stationary mode, when latent variable discovery is activated, + // we temporarily duplicate edges over history assuming stationarity to + // correctly identify the possible unshielded triples for orientation + // + if ( (environment.mode == 1) && (environment.latent_orientation) ) + tmiic::repeatEdgesOverHistory (environment); orientations = orientationProbability(environment); + if ( (environment.mode == 1) && (environment.latent_orientation) ) + tmiic::dropPastEdges (environment); environment.exec_time.ori += getLapInterval(lap_start); } if 
(environment.consistent != 0) diff --git a/src/tmiic.cpp b/src/tmiic.cpp new file mode 100644 index 00000000..538ad2b8 --- /dev/null +++ b/src/tmiic.cpp @@ -0,0 +1,314 @@ +//***************************************************************************** +// Filename : tmiic.cpp Creation date: 07 may 2020 +// +// Author : Franck SIMON +// +// Description: Store functions for temporal mode of miic (tmiic) +//***************************************************************************** + +//============================================================================= +// INCLUDE SECTION +//============================================================================= +#include "orientation.h" +#include "tmiic.h" + +//============================================================================= +// CONSTANTS +//============================================================================= + +//============================================================================= +// NAMESPACES +//============================================================================= +namespace tmiic { + +using std::string; +using namespace miic::structure; +using namespace miic::utility; + +//----------------------------------------------------------------------------- +// getListLaggedEdges +//----------------------------------------------------------------------------- +// Description: in temporal mode, find the past lagged counterparts of an edge +// assuming stationarity +// +// Params: +// - Environment&: the environment structure +// - int: edge's first node index +// - int: edge's second node index +// Return: +// - std::vector< std::pair, > : list of lagged edges +//-------------------------------------------------------------------------------- +std::vector< std::pair > getListLaggedEdges + (Environment& environment, int node1_pos, int node2_pos) { + std::vector< std::pair > list_ret; + if ( (node1_pos >= environment.n_nodes_not_lagged) + && (node2_pos >= 
environment.n_nodes_not_lagged) ) + // + // The edge is duplicated, this function deals only with the original ones + // + return (list_ret); + // + // If one of the variables is not lagged, the lag is not constant + // + int sav_lag = environment.nodes_lags[node1_pos] - environment.nodes_lags[node2_pos]; + bool same_lag_needed = true; + if ( (node1_pos < environment.n_nodes_not_lagged) + && (environment.list_n_layers[node1_pos] <= 1) ) + same_lag_needed = false; + else if ( (node2_pos < environment.n_nodes_not_lagged) + && (environment.list_n_layers[node2_pos] <= 1) ) + same_lag_needed = false; + // + // Look for the same edge lagged over all layers of history + // + while (true) { + // + // We shift the nodes positions using pre-computed vector nodes_shifts + // + int node1_shift = environment.nodes_shifts[node1_pos]; + int node2_shift = environment.nodes_shifts[node2_pos]; + if ( (node1_shift <= 0) && (node2_shift <= 0) ) + break; + node1_pos += node1_shift; + node2_pos += node2_shift; + // + // Ensure if both variable are lagged than we keep the same lag when duplicating + // + bool same_lag_impossible = false; + if (same_lag_needed) { + int new_lag = environment.nodes_lags[node1_pos] - environment.nodes_lags[node2_pos]; + while (sav_lag != new_lag) { + if (sav_lag < new_lag) { + int node2_shift = environment.nodes_shifts[node2_pos]; + if (node2_shift <= 0) { + same_lag_impossible = true; + break; + } + node2_pos += node2_shift; + } + else { // sav_lag > new_lag + int node1_shift = environment.nodes_shifts[node1_pos]; + if (node1_shift <= 0) { + same_lag_impossible = true; + break; + } + node1_pos += node1_shift; + } + new_lag = environment.nodes_lags[node1_pos] - environment.nodes_lags[node2_pos]; + } + } + if (same_lag_impossible) + break; + // + // A lagged edge has been found + // + list_ret.push_back (std::make_pair (node1_pos, node2_pos) ); + } + return (list_ret); +} + +//----------------------------------------------------------------------------- +// 
repeatEdgesOverHistory +//----------------------------------------------------------------------------- +// Description: Duplicates edges over the temporal graph +// +// Detail: For consistency and the orientation step when latent variable +// discovery is enabled, we duplicate the edges over the history, assuming +// stationarity, to improve the consistency assessment and the orientations +// +// Params: +// - Environment&: the environment structure +//----------------------------------------------------------------------------- +void repeatEdgesOverHistory (Environment& environment) { + // + // We iterate over computed edges to duplicate (if needed) + // the edges over the history + // + auto& edge_list = environment.connected_list; + std::vector::size_type size = edge_list.size(); + for (std::vector::size_type i = 0; i < size; ++i) { + const Edge& edge_orig = environment.edges (edge_list[i].X, edge_list[i].Y); + std::vector< std::pair > list_lagged = getListLaggedEdges + (environment, edge_list[i].X, edge_list[i].Y); + for (auto const& it_lagged : list_lagged) { + // + // Duplicate the edge info into environment.edges array + // + Edge& edge_to_modif = environment.edges (it_lagged.first, it_lagged.second); + edge_to_modif.status = edge_orig.status; + edge_to_modif.status_init = edge_orig.status_init; + edge_to_modif.status_prev = edge_orig.status_prev; + edge_to_modif.proba_head = edge_orig.proba_head; + + Edge& edge_to_modif_inverse = environment.edges (it_lagged.second, it_lagged.first); + edge_to_modif_inverse.status = edge_orig.status; + edge_to_modif_inverse.status_init = edge_orig.status_init; + edge_to_modif_inverse.status_prev = edge_orig.status_prev; + edge_to_modif_inverse.proba_head = edge_orig.proba_head; + // + // Add the nodes into connected_list + // + environment.connected_list.emplace_back (it_lagged.first, it_lagged.second, + environment.edges (it_lagged.first, it_lagged.second) ); + } + } +} + 
+//----------------------------------------------------------------------------- +// completeOrientationUsingTime +//----------------------------------------------------------------------------- +// Description: Complete the orientations with the orientation of temporal +// edges that were not previously oriented +// +// Detail: completeOrientationUsingTime will look in the list of connected +// edges for the ones that have not been oriented using the unshielded triples, +// that are lagged and have a node on the layer 0. +// Edges matching these criteria will be oriented using time from the oldest +// node to the newest. +// +// N.B.: Edges not having a node on the layer 0 are edges "past only" and are a +// consequence of duplicating edges over history to maximize the number of +// unshielded triples, There is no interest to orient "past only" edges +// using time as they will be removed at the end of the orientation step. +// +// Params: +// - Environment&: the environment structure +// - std::vector& : list of unshielded triples +//-------------------------------------------------------------------------------- +void completeOrientationUsingTime (Environment& environment, + const std::vector& triples) + { + // Tail probability to use for lagged edges differs if latent variables are + // authorized or not: + // - 0 if no latent var, we are sure that the oldest node is the cause + // - 0.5 with latent var as we can not be sure that the oldest node is the cause + // + double tail_proba = 0; + if (environment.latent_orientation) + tail_proba = 0.5; + // + // Loop over edges to find edges that were not considered when orienting with + // open triples but can be oriented using time + // + const auto& edge_list = environment.connected_list; + for (auto iter0 = begin(edge_list); iter0 != end(edge_list); ++iter0) + { + int posX = iter0->X, posY = iter0->Y; + // + // If edges has no node on the layer 0, it is a duplicated one => skip it + // + if ( ! 
( (posX < environment.n_nodes_not_lagged) + || (posY < environment.n_nodes_not_lagged) ) ) + continue; + // + // If the edge is contemporaneous, no information from time => skip it + // + if (environment.nodes_lags[posX] == environment.nodes_lags[posY]) + continue; + // + // If edge is in triple, head/tail probas have already been computed + // and applied in adjacency matrix + // + bool is_in_triple = false; + for (unsigned int i = 0; i < triples.size() ; i++) + if ( ( (triples[i][0] == posX) && (triples[i][1] == posY) ) + || ( (triples[i][0] == posY) && (triples[i][1] == posX) ) + || ( (triples[i][1] == posX) && (triples[i][2] == posY) ) + || ( (triples[i][1] == posY) && (triples[i][2] == posX) ) ) + { + is_in_triple = true; + break; + } + if (is_in_triple) + continue; + // + // The edge was not in open triples, has a node on layer 0, and is not + // contemporaneous => we can and need to orient it using time + // As time goes from past to present: edge orientation is max lag -> min lag + // + if (environment.nodes_lags[posX] > environment.nodes_lags[posY]) + { + if (environment.is_contextual[posX]) + miic::reconstruction::updateAdj(environment, posX, posY, 0, 1); + else + miic::reconstruction::updateAdj(environment, posX, posY, tail_proba, 1); + } + else + { + if (environment.is_contextual[posY]) + miic::reconstruction::updateAdj(environment, posX, posY, 1, 0); + else + miic::reconstruction::updateAdj(environment, posX, posY, 1, tail_proba); + } + } + } + +//----------------------------------------------------------------------------- +// dropPastEdges +//----------------------------------------------------------------------------- +// Description: +// Drop past edges (the edges having no node of the final time step) +// +// Detail: for consistency assessment or orientation step with latent variable +// discovery is enabled, we duplicated edges over history. Here, we ensure that +// the edges are restored to their state prior before duplication. 
+// +// Params: +// - Environment&: the environment structure +//-------------------------------------------------------------------------------- +void dropPastEdges (Environment& environment) { + // + // We iterate over computed edges to find edges previously duplicated using stationary. + // All the edges duplicated are disconnected and removed from environment.connected_list + // + auto it = begin(environment.connected_list); + while ( it != end(environment.connected_list) ) { + // + // When the two nodes are not on the layer 0, the edge is removed + // + if ( (it->X >= environment.n_nodes_not_lagged) + && (it->Y >= environment.n_nodes_not_lagged) ) + it = environment.connected_list.erase (it); + // + // When one of the nodes is not lagged (i.e. contextual) + // and the other not on the layer 0, the edge is removed + // + else if ( (it->X < environment.n_nodes_not_lagged) + && (environment.list_n_layers[it->X] <= 1) + && (it->Y >= environment.n_nodes_not_lagged) ) + it = environment.connected_list.erase (it); + else if ( (it->Y < environment.n_nodes_not_lagged) + && (environment.list_n_layers[it->Y] <= 1) + && (it->X >= environment.n_nodes_not_lagged) ) + it = environment.connected_list.erase (it); + else + it++; + } + // + // We remove from the edges all those having pos > n_nodes_not_lagged + // + for (int node1_pos = environment.n_nodes_not_lagged; node1_pos < environment.n_nodes; ++node1_pos) + for (int node2_pos = environment.n_nodes_not_lagged; node2_pos < environment.n_nodes; ++node2_pos) { + environment.edges(node1_pos,node2_pos).status = 0; + environment.edges(node1_pos,node2_pos).status_init = 0; + environment.edges(node1_pos,node2_pos).status_prev = 0; + environment.edges(node1_pos,node2_pos).proba_head = -1; + } + // + // In addition, remove lagged edges added on non lagged variable (i.e.:contextual) + // + for (int i = 0; i < environment.n_nodes_not_lagged; i++) + if (environment.is_contextual[i]) + for (int j = environment.n_nodes_not_lagged; j < 
environment.n_nodes; j++) { + environment.edges(i, j).status = 0; + environment.edges(i, j).status_init = 0; + environment.edges(i, j).status_prev = 0; + environment.edges(i, j).proba_head = -1; + environment.edges(j, i).status = 0; + environment.edges(j, i).status_init = 0; + environment.edges(j, i).status_prev = 0; + environment.edges(j, i).proba_head = -1; + } +} + +} // namespace tmiic diff --git a/src/tmiic.h b/src/tmiic.h new file mode 100644 index 00000000..7b4886d5 --- /dev/null +++ b/src/tmiic.h @@ -0,0 +1,29 @@ +//****************************************************************************** +// Filename : tmiic.h Creation date: 07 may 2020 +// +// Description: header file of tmiic (temporal miic) +// +// Author : Franck SIMON +//****************************************************************************** +#ifndef TMIIC_ +#define TMIIC_ + +#include + +#include "environment.h" + + +namespace tmiic { +// An unshielded Triple (X, Z, Y): +using Triple = std::array; + +std::vector< std::pair > getListLaggedEdges + (miic::structure::Environment&, int, int); +void repeatEdgesOverHistory (miic::structure::Environment&); +void completeOrientationUsingTime (miic::structure::Environment&, + const std::vector&); +void dropPastEdges (miic::structure::Environment&); + +} // namespace tmiic + +#endif // TMIIC_