-
Notifications
You must be signed in to change notification settings - Fork 0
/
helper_match_evaluate_multiple.R
119 lines (93 loc) · 4.99 KB
/
helper_match_evaluate_multiple.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
#########################################################################################
# Function to match cluster labels with manual gating (reference standard) population
# labels and calculate precision, recall, and F1 score
#
# Matching criterion: Hungarian algorithm
#
# Use this function for data sets with multiple populations of interest
#
# Lukas Weber, August 2016
#########################################################################################
library(clue)
# arguments:
# - clus_algorithm: cluster labels from algorithm
# - clus_truth: true cluster labels
# (for both arguments: length = number of cells; names = cluster labels (integers))
helper_match_evaluate_multiple <- function(clus_algorithm, clus_truth) {
# number of detected clusters
n_clus <- length(table(clus_algorithm))
# remove unassigned cells (NA's in clus_truth)
unassigned <- is.na(clus_truth)
clus_algorithm <- clus_algorithm[!unassigned]
clus_truth <- clus_truth[!unassigned]
if (length(clus_algorithm) != length(clus_truth)) warning("vector lengths are not equal")
tbl_algorithm <- table(clus_algorithm)
tbl_truth <- table(clus_truth)
# detected clusters in rows, true populations in columns
pr_mat <- re_mat <- F1_mat <- matrix(NA, nrow = length(tbl_algorithm), ncol = length(tbl_truth))
for (i in 1:length(tbl_algorithm)) {
for (j in 1:length(tbl_truth)) {
i_int <- as.integer(names(tbl_algorithm))[i] # cluster number from algorithm
j_int <- as.integer(names(tbl_truth))[j] # cluster number from true labels
true_positives <- sum(clus_algorithm == i_int & clus_truth == j_int, na.rm = TRUE)
detected <- sum(clus_algorithm == i_int, na.rm = TRUE)
truth <- sum(clus_truth == j_int, na.rm = TRUE)
# calculate precision, recall, and F1 score
precision_ij <- true_positives / detected
recall_ij <- true_positives / truth
F1_ij <- 2 * (precision_ij * recall_ij) / (precision_ij + recall_ij)
if (F1_ij == "NaN") F1_ij <- 0
pr_mat[i, j] <- precision_ij
re_mat[i, j] <- recall_ij
F1_mat[i, j] <- F1_ij
}
}
# put back cluster labels (note some row names may be missing due to removal of unassigned cells)
rownames(pr_mat) <- rownames(re_mat) <- rownames(F1_mat) <- names(tbl_algorithm)
colnames(pr_mat) <- colnames(re_mat) <- colnames(F1_mat) <- names(tbl_truth)
# match labels using Hungarian algorithm applied to matrix of F1 scores (Hungarian
# algorithm calculates an optimal one-to-one assignment)
# use transpose matrix (Hungarian algorithm assumes n_rows <= n_cols)
F1_mat_trans <- t(F1_mat)
if (nrow(F1_mat_trans) <= ncol(F1_mat_trans)) {
# if fewer (or equal no.) true populations than detected clusters, can match all true populations
labels_matched <- clue::solve_LSAP(F1_mat_trans, maximum = TRUE)
# use row and column names since some labels may have been removed due to unassigned cells
labels_matched <- as.numeric(colnames(F1_mat_trans)[as.numeric(labels_matched)])
names(labels_matched) <- rownames(F1_mat_trans)
} else {
# if fewer detected clusters than true populations, use transpose matrix and assign
# NAs for true populations without any matching clusters
labels_matched_flipped <- clue::solve_LSAP(F1_mat, maximum = TRUE)
# use row and column names since some labels may have been removed due to unassigned cells
labels_matched_flipped <- as.numeric(rownames(F1_mat_trans)[as.numeric(labels_matched_flipped)])
names(labels_matched_flipped) <- rownames(F1_mat)
labels_matched <- rep(NA, ncol(F1_mat))
names(labels_matched) <- rownames(F1_mat_trans)
labels_matched[as.character(labels_matched_flipped)] <- as.numeric(names(labels_matched_flipped))
}
# precision, recall, F1 score, and number of cells for each matched cluster
pr <- re <- F1 <- n_cells_matched <- rep(NA, ncol(F1_mat))
names(pr) <- names(re) <- names(F1) <- names(n_cells_matched) <- names(labels_matched)
for (i in 1:ncol(F1_mat)) {
# set to 0 if no matching cluster (too few detected clusters); use character names
# for row and column indices in case subsampling completely removes some clusters
pr[i] <- ifelse(is.na(labels_matched[i]), 0, pr_mat[as.character(labels_matched[i]), names(labels_matched)[i]])
re[i] <- ifelse(is.na(labels_matched[i]), 0, re_mat[as.character(labels_matched[i]), names(labels_matched)[i]])
F1[i] <- ifelse(is.na(labels_matched[i]), 0, F1_mat[as.character(labels_matched[i]), names(labels_matched)[i]])
n_cells_matched[i] <- sum(clus_algorithm == labels_matched[i], na.rm = TRUE)
}
# means across populations
mean_pr <- mean(pr)
mean_re <- mean(re)
mean_F1 <- mean(F1)
return(list(n_clus = n_clus,
pr = pr,
re = re,
F1 = F1,
labels_matched = labels_matched,
n_cells_matched = n_cells_matched,
mean_pr = mean_pr,
mean_re = mean_re,
mean_F1 = mean_F1))
}