-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathanalyze-overlap.r
executable file
·99 lines (83 loc) · 3.79 KB
/
analyze-overlap.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
#!/usr/bin/Rscript --no-save --no-restore --no-init-file --no-environ
install_and_load <- function(pkg) {
if (!pkg %in% installed.packages()[,"Package"]) {
install.packages(pkg, repos = "https://cloud.r-project.org/")
}
if (!library(pkg, character.only = TRUE, warn.conflicts = FALSE, logical.return = TRUE)) {
cat("\nFailed to load a required package:", pkg, "\n")
cat("Perhaps consider installing it manually or update your R installation to the most recent version.\n\n")
q(status = 1)
}
}
install_and_load("circlize")
install_and_load("dplyr")
misp_data <- read.table("anonymized-overlap-data-2015-07--2016-06.csv",
sep="\t",
na.strings = "\\N",
header = F,
col.names = c("source1", "source2", "ip"),
colClasses = c("factor", "factor", "character")
)
misp_linkage <- misp_data %>%
filter(!is.na(source2)) %>%
group_by(source1, source2) %>%
summarize(count=n())
SMALL_SOURCE_THRESHOLD = 1000 # Adjust this value, set to zero to display all
small_sources <- (misp_linkage %>%
group_by(source2) %>%
summarize(total_count=sum(count)) %>%
filter(total_count < SMALL_SOURCE_THRESHOLD)
)$source2
misp_linkage_simplified <- misp_linkage %>%
mutate(source2_simplified = ifelse(as.character(source2) %in% small_sources, "OTHER", as.character(source2))) %>%
group_by(source1, source2_simplified) %>%
summarize(count=sum(count))
misp_colors_labels <- as.vector(unique(misp_linkage_simplified$source2_simplified))
misp_colors <- rep("#e78ac3", length(misp_colors_labels))
names(misp_colors) <- misp_colors_labels
misp_colors["circl-lu.misp"] <- "#66c2a5"
misp_colors["a.misp"] <- "#8da0cb"
misp_colors["b.misp"] <- "#fc8d62"
OUTPUT_PLOT_FILE <- "output-diagram.pdf"
cat("Saving output diagram to:", OUTPUT_PLOT_FILE, "\n")
cairo_pdf(filename = OUTPUT_PLOT_FILE, width = 8, height = 8)
circos.clear()
circos.par(start.degree = 280, gap.degree = 3, track.margin = c(-0.1, 0.1))
chordDiagram(misp_linkage_simplified,
grid.col = misp_colors,
annotationTrack = "grid",
preAllocateTracks=list(track.height=0.5),
directional = 1,
direction.type = c("arrows", "diffHeight"),
diffHeight = -0.02,
link.arr.type = "big.arrow",
link.sort = TRUE,
link.largest.ontop = TRUE,
transparency = 0.3
)
circos.trackPlotRegion(track.index=1, panel.fun=function(x,y){
xlim=get.cell.meta.data("xlim")
ylim=get.cell.meta.data("ylim")
sector.name=get.cell.meta.data("sector.index")
circos.text(mean(xlim), 0, sector.name, facing="clockwise", niceFacing=T, adj=c(0,0.5))
}, bg.border=NA)
misp_stats_counts <- misp_data %>% group_by(source1) %>% summarize(n_ip = n_distinct(ip), uniq_to_source = sum(is.na(source2))) %>% mutate(overlap_ratio = 1- uniq_to_source / n_ip)
misp_stats_overlap_with_other <- misp_data %>%
filter(!source2 %in% c("a.misp", "b.misp", "circl-lu.misp") & !is.na(source2)) %>%
group_by(source1) %>%
summarize(n_in_other = n_distinct(ip))
misp_stats_overlap_with_misps <- misp_data %>%
filter(source2 %in% c("a.misp", "b.misp", "circl-lu.misp")) %>%
group_by(source1) %>%
summarize(n_in_misps = n_distinct(ip))
misp_stats_joined <-
full_join(misp_stats_counts, misp_stats_overlap_with_other, by = "source1") %>%
mutate(ratio_other = n_in_other / n_ip) %>%
full_join(misp_stats_overlap_with_misps, by = "source1") %>%
mutate(ratio_misps = n_in_misps / n_ip) %>%
select(source1, n_ip, overlap_ratio, ratio_misps, ratio_other)
OUTPUT_CSV_FILE <- "output-stats.csv"
cat("Saving overlap statistics to:", OUTPUT_CSV_FILE, "\n")
write.table(misp_stats_joined, file = OUTPUT_CSV_FILE, quote = FALSE,
sep = "\t", row.names = FALSE
)