-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy path3_combine_ROH_and_fitness_data.R
174 lines (150 loc) · 6.2 KB
/
3_combine_ROH_and_fitness_data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
# another data processing script:
# combine annual fitness measures and ROH based inbreeding coefficients FROH
# Output: 1) data.frame for modeling
# 2) ordered pedigree
library(tidyverse)
library(MasterBayes)
library(data.table)
library(rlang)
library(snpStats)
# annual measures of traits and fitness
fitness_path <- "data/1_Annual_Fitness_Measures_April_20190501.txt"
annual_fitness <- read_delim(fitness_path, delim = "\t")
names(annual_fitness)
# prepare, order pedigree
sheep_ped <- read_delim("../sheep/data/SNP_chip/20190711_Soay_Pedigree.txt",
delim = "\t",
col_types = "ccc") %>%
as.data.frame() %>%
MasterBayes::orderPed()
#save(sheep_ped, file = "model_in/sheep_ped.RData")
# check call rate
# no individual has less than 95 %
sheep_plink_name <- "data/sheep_geno_imputed_oar_filt"
# read merged plink data
sheep_bed <- paste0(sheep_plink_name, ".bed")
sheep_bim <- paste0(sheep_plink_name, ".bim")
sheep_fam <- paste0(sheep_plink_name, ".fam")
full_sample <- read.plink(sheep_bed, sheep_bim, sheep_fam)
summary(full_sample$genotypes)
summary(col.summary(full_sample$genotypes))
# IDs which weren't well imputed and should be removed
#IDs_lots_missing <- read_delim("data/ids_more_than_5perc_missing.txt", delim = " ")
##### ROH data #####
#file_path <- "output/ROH/roh_nofilt_ram_pruned.hom"
# use unpruned data
file_path <- "output/ROH/roh.hom"
#file <- "roh_nofilt_pruned"
roh_lengths <- fread(file_path)
hist(roh_lengths$KB, breaks = 1000, xlim = c(0,10000))
# Chr lengths
chr_data <- read_delim("../sheep/data/sheep_genome/chromosome_info_oar31.txt", delim = "\t") %>%
rename(size_BP = Length,
CHR = Part) %>%
mutate(size_KB = size_BP / 1000)
autosomal_genome_size <- chr_data %>%
.[2:27, ] %>%
summarise(sum_KB = sum(size_KB)) %>%
as.numeric()
# define ROH length classes
calc_froh_classes <- function(roh_crit, roh_lengths) {
roh_filt <- dplyr::case_when(
roh_crit == "short" ~ expr(KB < 1221),
roh_crit == "medium" ~ expr((KB > 1221)&(KB < 4885)),
roh_crit == "long" ~ expr(KB > 4885),
roh_crit == "all" ~ expr(KB > 0)
)
roh_lengths %>%
dplyr::group_by(IID) %>%
#filter({{ roh_filt }}) %>%
filter(!!roh_filt) %>%
dplyr::summarise(KBSUM = sum(KB)) %>%
mutate(FROH = KBSUM / autosomal_genome_size) %>%
dplyr::select(IID, FROH) %>%
rename(ID = IID, !! paste0("FROH_", roh_crit) := FROH)
}
# proportion of ROH length classes in each genome. Individuals which
# do not have long ROH have 0 for this class.
ROH_classes <- c("short", "medium", "long", "all")
froh <- purrr::map(ROH_classes, calc_froh_classes, roh_lengths) %>%
purrr::reduce(left_join, by = "ID") %>%
replace_na(list(FROH_long = 0))
# add FROH minus each of the chromosomes as new variabels for gwas
calc_froh_minus_chr <- function(chr, roh_lengths) {
# get chromosome length and substract from autosomal genome length
chr_size <- chr_data %>%
filter(CHR == paste0("Chromosome ", chr)) %>%
.[["size_KB"]]
genome_size_minus_chr <- autosomal_genome_size - chr_size
roh_lengths %>%
as_tibble() %>%
dplyr::group_by(IID) %>%
#filter({{ roh_filt }}) %>%
dplyr::filter(CHR != !!chr) %>%
dplyr::summarise(KBSUM = sum(KB)) %>%
mutate(FROH = KBSUM / genome_size_minus_chr ) %>%
dplyr::select(IID, FROH) %>%
rename(ID = IID, !! paste0("froh_no_chr", chr) := FROH)
}
froh_no_chr <- map(1:26, calc_froh_minus_chr, roh_lengths) %>%
reduce(left_join, by = "ID")
# add to froh
froh <- froh %>%
left_join(froh_no_chr, by = "ID")
# add froh to fitness
fitness_data <- annual_fitness %>%
left_join(froh, by = "ID")
# add homozygosity not in roh
#homs <- read_delim("output/ROH/roh_nofilt_hom_not_in_roh.txt", delim = " ")
#fitness_data <- fitness_data %>%
# left_join(homs, by = "ID")
# prepare for analysis
# might have to be modified if not all variables present in
# fitness data file
fitness_data <- fitness_data %>%
dplyr::rename(birth_year = BIRTHYEAR,
sheep_year = SheepYear,
age = Age,
id = ID,
twin = TWIN,
sex = SEX,
mum_id = MOTHER,
froh_short = FROH_short,
froh_medium = FROH_medium,
froh_long = FROH_long,
froh_all = FROH_all,
# froh_not_roh = hom,
survival = Survival,
comment = Comment,
seen_in_rut = SeenInRut,
dad_id = FATHER,
weight = Weight,
birth_weight = BIRTHWT,
cap_month = CapMonth,
hindleg = Hindleg,
foreleg = Foreleg,
rut_measure = RutMeasure,
horn_len = HornLen,
horn_circ = HornCirc,
bol_circ = BolCirc,
bol_len = BolLen,
horn = Horn,
freq = Freq,
offspr_born = OffspringBorn,
offspr_surv = OffspringSurvived,
offspr_surv_aug = AugSurvOffspring,
offspr_surv_oct = OctSurvOffspring,
offspr_surv_wint = OverWinterOffspring) %>%
dplyr::select(id, survival, comment, sheep_year, age, birth_year,
sex, mum_id, twin, froh_all, froh_long, froh_medium,
froh_all, everything()) %>%
# for sex, check what Cas is
filter(sex %in% c("F", "M")) %>%
# some individuals arent imputed well and should be discarded
#filter(!(id %in% IDs_lots_missing$id)) %>%
# na rows should be discarded
mutate_at(c("id", "sheep_year", "birth_year", "sex",
"mum_id", "twin"), as.factor)
# save data
save(fitness_data, file = "data/survival_mods_data.RData") # formerly fitness_roh_df
save(sheep_ped, file = "data/sheep_ped.RData") # ordered ped