-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathL2_functions.R
210 lines (177 loc) · 7.37 KB
/
L2_functions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
create_dist_table <- function(data){
distributions <- c("norm", "exp", "gamma", "lnorm", "weibull", "beta",
"cauchy", "logis", "t", "unif", "pareto", "chisq", "f")
fit_distributions <- function(data, distributions) {
fits <- list()
# Check for degenerate distribution
if (length(unique(data)) == 1) {
message("Data is degenerate (all values are the same). Returning degenerate fits.")
for (dist in distributions) {
fits[[dist]] <- list(
estimate = unique(data),
loglik = -Inf,
warning = "Degenerate distribution, cannot fit."
)
}
return(fits)
}
# Check for near-degenerate data (e.g., most values are the same)
most_frequent_value <- names(sort(table(data), decreasing = TRUE))[1]
proportion_most_frequent <- max(table(data)) / length(data)
if (proportion_most_frequent > 0.95) { # Threshold: 95% of data has the same value
message(sprintf("Data is near-degenerate (%.2f%% of values are '%s'). No distributions will be fitted.",
proportion_most_frequent * 100, most_frequent_value))
for (dist in distributions) {
fits[[dist]] <- list(
estimate = unique(data),
loglik = -Inf
)
return(NULL)
}
}
browser()
for (dist in distributions) {
message(sprintf("Fitting %s distribution...", dist))
capture.output(
tryCatch({
if (dist == "beta") {
# Check if data needs rescaling for beta distribution
if (min(data) < 0 || max(data) > 1) {
message("Rescaling data to [0, 1] for beta distribution.")
data_rescaled <- (data - min(data)) / (max(data) - min(data))
fits[[dist]] <- fitdist(data_rescaled, dist,
start = list(shape1 = 1, shape2 = 1),
control = list(maxit = 1000))
} else {
fits[[dist]] <- fitdist(data, dist)
}
} else if (dist == "t" | dist == "f" | dist == "chisq") {
# Provide starting value for degrees of freedom (df) for t-distribution
fits[[dist]] <- fitdist(data, dist, start = list(df = 10))
} else {
# Fit other distributions with original data
fits[[dist]] <- fitdist(data, dist)
}
}, error = function(e) {
message(sprintf("Failed to fit %s: %s", dist, e$message))
})
)
}
return(fits)
}
#
fits <- fit_distributions(data, distributions)
if (!is.null(fits)) {
# Initialize an empty list to collect results
gof_results <- list()
for (name in names(fits)) {
tryCatch({
# Calculate gofstats for each fit
message(sprintf("Calculating goodness-of-fit for %s...", name))
gof_results[[name]] <- gofstat(fits[[name]])
}, error = function(e) {
message(sprintf("Failed to calculate goodness-of-fit for %s: %s", name, e$message))
# Assign a placeholder (e.g., NULL or NA) for failed fits
gof_results[[name]] <- list(
chisq = NA,
ad = NA,
bic = NA,
message = "Goodness-of-fit failed"
)
})
}
# Combine results into a consistent structure (if needed)
gof <- do.call(rbind, lapply(names(gof_results), function(name) {
cbind(distribution = name, gof_results[[name]])
}))
} else {
gof <- NULL
}
get_top_distributions <- function(gof_results, fits, metric = "chisqpvalue", significance_levels = c(0.10, 0.05, 0.01)) {
# Check if the chosen metric exists in the gof_results
# if (!metric %in% names(gof_results)) {
# stop(paste("Metric", metric, "is not found in the goodness-of-fit results."))
# }
#
dist_names <- names(gof_results)
fit_stats <- list()
for (name in names(gof_results)){
# Extract the selected metric
fit_stats[name] <- gof_results[[name]][[metric]]
# fit_stats$Internal_Name <- n # Add internal names as a column
}
# Map back to the original names using the order of `fits`
original_names <- names(fit_stats)
# fit_stats$Name <- sapply(seq_along(original_names), function(i) {
# if (i <= nrow(fit_stats)) {
# return(original_names[i])
# } else {
# return(NA)
# }
# })
fit_stats <- stack(fit_stats)
# Rename columns for clarity
colnames(fit_stats) <- c("P_value", "Distribution")
# Sort by the chosen metric (lower is better)
fit_stats <- fit_stats[order(fit_stats$P_value, decreasing = TRUE), ]
# Select the top 3 distributions
top_distributions <- head(fit_stats, 3)
# Add asterisks for significance levels
top_distributions$Significance <- sapply(top_distributions$P_value, function(value) {
if (value <= significance_levels[3]) {
return("***") # 1% significance
} else if (value <= significance_levels[2]) {
return("**") # 5% significance
} else if (value <= significance_levels[1]) {
return("*") # 10% significance
} else {
return("") # No asterisk
}
})
# Reorganize columns for better readability
top_distributions <- top_distributions[, c("Distribution", "P_value", "Significance")]
rownames(top_distributions) <- NULL
colnames(top_distributions)[2] <- "P_value"
colnames(top_distributions)[3] <- "Test.Against.Dist"
return(top_distributions)
}
results_table <- get_top_distributions(gof_results, fits)
return(results_table)
}
generate_vertical_mini_histogram <- function(data, bins = 10, max_height = 7, symbol = "■") {
# Create histogram breaks
hist_res <- hist(data, breaks = bins, plot = FALSE)
# Scale frequencies to fit the max_height
scaled_freqs <- round(hist_res$counts / max(hist_res$counts) * max_height)
# Generate the vertical histogram as a character vector
lines <- lapply(max_height:1, function(level) {
sapply(scaled_freqs, function(x) if (x >= level) symbol else " ")
})
# Combine the histogram into a single structure
histogram <- sapply(lines, paste, collapse = " ")
# Return the histogram as a structured character vector
return(histogram)
}
combine_table_and_histogram <- function(table, histogram, colname) {
# Convert the table to printable character rows with proper spacing
table_rows <- apply(table, 1, function(row) sprintf("%-13s | %-15s | %-15s", row[1], row[2], row[3]))
# Ensure the histogram has the same length as the table rows
if (length(histogram) < nrow(table)) {
histogram <- c(histogram, rep("", nrow(table) - length(histogram)))
}
# Combine the table with its corresponding histogram row by row
combined_output <- mapply(function(table_row, hist_row) {
paste(table_row, "|", hist_row)
}, c(table_rows,
rep(sprintf("%-10s | %-20s | %-15s", "" , "", ""),length(histogram) - length(table_rows))), histogram)
# Prepare the header for the table
header <- sprintf("%-13s | %-15s | %-15s | Histogram", "Distribution", "p_value", "Test.Against.Dist")
separator <- paste(rep("-", nchar(header)), collapse = "")
# cli::cli_h1(cli::blue("{.Column: {self$data_name}}"))
# Print the header and separator
cat(colname, "\n")
cat(header, "\n")
cat(separator, "\n")
# Print each combined row
cat(combined_output, sep = "\n")
}