diff --git a/R/gen_tibble.R b/R/gen_tibble.R
index 70060614..08695e22 100644
--- a/R/gen_tibble.R
+++ b/R/gen_tibble.R
@@ -3,13 +3,15 @@
 #' A `gen_tibble` stores genotypes for individuals in a tidy format. DESCRIBE
 #' here the format
 #'
-#' - *VCF* files: the fast `cpp` parser is used by default. It attempts to establish
-#' ploidy from the first variant; if that variant is found in a sex chromosome (or mtDNA),
-#' it will fail. To use the fast parser, change the order of variants so that the first chromosome is an
-#' autosome using a tool such as `vcftools`. Alternatively, the slower but more robust
-#' `vcfR` parser should be used instead. Currently, only biallelic SNPs are supported. If haploid
-#' variants (e.g. sex chromosomes) are included in the vcf, they are not transformed into homozygous calls.
-#' Instad, reference alleles will be counted as 0 and alternative alleles will be counted as 1.
+#'- *VCF* files: the fast `cpp` parser is used by default. Both `cpp` and `vcfR` parsers
+#' attempt to establish ploidy from the first variant; if that variant is found in a
+#' sex chromosome (or mtDNA), the parser will fail with 'Error: a genotype has more
+#' than max_ploidy alleles...'. To successful import such a *VCF*, change the order of variants
+#' so that the first chromosome is an autosome using a tool such as `vcftools`.
+#' Currently, only biallelic SNPs are supported. If haploid variants (e.g. sex
+#' chromosomes) are included in the *VCF*, they are not transformed into homozygous
+#' calls. Instead, reference alleles will be counted as 0 and alternative alleles
+#' will be counted as 1.
 #'
 #' - *packedancestry* files: When loading *packedancestry* files, missing alleles will be converted from
 #' 'X' to NA
@@ -21,9 +23,9 @@
 #' - a string giving the path to a RDS file storing a `bigSNP` object from
 #' the `bigsnpr` package (usually created with [bigsnpr::snp_readBed()])
 #' - a string giving the path to a vcf file. Note that we currently read the whole
-#' vcf in memory with `vcfR`, so only smallish vcf can be imported. Only biallelic
+#' vcf in memory with `vcfR`, so only smallish *VCF* can be imported. Only biallelic
 #' SNPs will be considered.
-#' - a string giving the path to a packedancestry .geno file. The associated
+#' - a string giving the path to a *packedancestry* .geno file. The associated
 #' .ind and .snp files are expected to be in the same directory and share the
 #' same file name prefix.
 #' - a genotype matrix of dosages (0, 1, 2, NA) giving the dosage of the alternate
@@ -41,9 +43,9 @@
 #' if `x` is a vcf or packedancestry file)
 #' @param ... if `x` is the name of a vcf file, additional arguments
 #' passed to [vcfR::read.vcfR()]. Otherwise, unused.
-#' @param parser the name of the parser used for VCF, either "cpp" to use
+#' @param parser the name of the parser used for *VCF*, either "cpp" to use
 #' a fast C++ parser, or "vcfR" to use the R package `vcfR`. The latter is slower
-#' but more robust; if "cpp" gives error, try using "vcfR" in case your VCF has
+#' but more robust; if "cpp" gives error, try using "vcfR" in case your *VCF* has
 #' an unusual structure.
 #' @param n_cores the number of cores to use for parallel processing
 #' @param valid_alleles a vector of valid allele values; it defaults to 'A','T',
@@ -54,10 +56,10 @@
 #' @param backingfile the path, including the file name without extension,
 #' for backing files used to store the data (they will be given a .bk
 #' and .RDS automatically). This is not needed if `x` is already an .RDS file.
-#' If `x` is a .BED file and `backingfile` is left NULL, the backing file will
+#' If `x` is a .BED or a *VCF* file and `backingfile` is left NULL, the backing file will
 #' be saved in the same directory as the
 #' bed file, using the same file name but with a different file type (.bk rather
-#' than .bed). The same logic applies to .vcf files. If `x` is a genotype matrix and `backingfile` is NULL, then a
+#' than .bed). If `x` is a genotype matrix and `backingfile` is NULL, then a
 #' temporary file will be created (but note that R will delete it at the end of
 #' the session!)
 #' @param quiet provide information on the files used to store the data
@@ -137,9 +139,8 @@ gen_tibble.character <-
     stop("file_path should be pointing to a either a PLINK .bed or .ped file, a bigSNP .rds file or a VCF .vcf or .vcf.gz file")
   }
 
-  loci <- show_loci(x_gt)
-  new_loci <- add_chromosome_as_int(loci)
-  show_loci(x_gt) <- new_loci
+  # create a chr_int column
+  show_loci(x_gt)$chr_int <- cast_chromosome_to_int(show_loci(x_gt)$chromosome)
 
   file_in_use <- gt_save_light(x_gt, quiet = quiet)
   return(x_gt)
@@ -313,9 +314,8 @@ gen_tibble.matrix <- function(x, indiv_meta, loci, ...,
                         remove_on_fail = TRUE)
   show_loci(new_gen_tbl) <- harmonise_missing_values(show_loci(new_gen_tbl), missing_alleles = missing_alleles)
 
-  loci <- show_loci(new_gen_tbl)
-  new_loci <- add_chromosome_as_int(loci)
-  show_loci(new_gen_tbl) <- new_loci
+  # create a chr_int column
+  show_loci(new_gen_tbl)$chr_int <- cast_chromosome_to_int(show_loci(new_gen_tbl)$chromosome)
 
   files_in_use <- gt_save(new_gen_tbl, quiet = quiet)
   return(new_gen_tbl)
@@ -572,39 +572,23 @@ change_duplicated_file_name <- function(file){
 }
 
 # adding a chr_int column
-add_chromosome_as_int <- function(loci){
-
-
-  if(is.integer(loci$chromosome)){
-    loci$chr_int <- loci$chromosome
-
-  } else if(is.numeric(loci$chromosome)){
-    non_na_id <- !is.na(loci$chromosome)
-    chr_int <- as.integer(loci$chromosome[non_na_id])
-
-    loci$chr_int <- rep(NA_integer_, length(loci$chromosome))
-    loci$chr_int[non_na_id] <- chr_int
-
-  } else if(is.character(loci$chromosome)){
-    non_na_id <- !is.na(loci$chromosome)
-    chr_factor <- as.factor(loci$chromosome[non_na_id])
-    chr_int <- as.integer(chr_factor)
-
-    loci$chr_int <- rep(NA_integer_, length(loci$chromosome))
-    loci$chr_int[non_na_id] <- chr_int
-
-
-  } else if(is.factor(loci$chromosome)){
-    non_na_id <- !is.na(loci$chromosome)
-    chr_int <- as.integer(loci$chromosome[non_na_id])
-
-    loci$chr_int <- rep(NA_integer_, length(loci$chromosome))
-    loci$chr_int[non_na_id] <- chr_int
-
+cast_chromosome_to_int <- function(chromosome){
+  # if chromosome is an factor, then cast it to character
+  if (is.factor(chromosome)){
+    chromosome <- as.character(chromosome)
+  }
+  # if chromosome is a character, then cast it to integer
+  if (is.character(chromosome)){
+    chromosome <- tryCatch(as.integer(chromosome), warning = function(e) {as.integer(as.factor(chromosome))})
+  }
+  if (is.numeric(chromosome)){
+    chromosome <- as.integer(chromosome)
+  }
+  if (is.integer(chromosome)){
+    return(chromosome)
   } else {
-
     stop("Chromosome column should be integer, character, or factor")
   }
-  return(loci)
+
 }
 
diff --git a/R/gt_save.R b/R/gt_save.R
index feec44ca..46127ca1 100644
--- a/R/gt_save.R
+++ b/R/gt_save.R
@@ -44,7 +44,7 @@ gt_save <- function(x, file_name = NULL, quiet = FALSE) {
   return(c(file_name,gt_get_file_names(x)))
 }
 
-#' a light version of gt_save that does not resave the RDS, to be used internally
+#' a light version of gt_save that does not resave the bigSNP RDS or backing file, to be used internally
 #' when creating a gen_tibble (if we have just created it, there is not need
 #' to resave it)
 #' @param x a [`gen_tibble`]
diff --git a/R/loci_ld_clump.R b/R/loci_ld_clump.R
index 2b10280a..d6bee928 100644
--- a/R/loci_ld_clump.R
+++ b/R/loci_ld_clump.R
@@ -80,10 +80,16 @@ loci_ld_clump.vctrs_bigSNP <- function(.x,
   # rows (individuals) that we want to use
   rows_to_keep <- vctrs::vec_data(.x)
   if (use_positions){
-    .positions <- show_loci(.x)$position
+    #.positions <- show_loci(.x)$position
+    #.positions <- attr(.x,"bigsnp")$map$physical.pos
+    .positions <- rep(NA, nrow(attr(.x,"bigsnp")$map))
+    .positions[show_loci(.x)$big_index] <- show_loci(.x)$position
   } else {
     .positions <- NULL
   }
+  # create a chromosome vector (fill gaps between bigsnpr and show_loci)
+  .chromosome <- rep(2147483647L, nrow(attr(.x,"bigsnp")$map))
+  .chromosome[show_loci(.x)$big_index] <- show_loci(.x)$chr_int
   # now figure out if we have any snp which have already been removed
   # those will go into `exclude`
   loci_not_in_tibble <- seq_len(nrow(attr(.x,"bigsnp")$map))[!seq_len(nrow(attr(.x,"bigsnp")$map)) %in%
@@ -95,7 +101,10 @@ loci_ld_clump.vctrs_bigSNP <- function(.x,
 
   # as long as we have more than one individual
   snp_clump_ids <- bigsnpr::snp_clumping(G = attr(.x,"bigsnp")$genotypes,
-                        infos.chr = show_loci(.x)$chr_int,
+                        #infos.chr = show_loci(.x)$chr_int,
+                        # TEMP HACK using the info from the bigsnpr object
+                        #infos.chr = cast_chromosome_to_int(attr(.x,"bigsnp")$map$chromosome),
+                        infos.chr = .chromosome,
                         ind.row = vctrs::vec_data(.x),
                         S = S,
                         thr.r2 = thr_r2,
diff --git a/R/snp_king.R b/R/snp_king.R
index 8df9603f..0e056025 100644
--- a/R/snp_king.R
+++ b/R/snp_king.R
@@ -2,9 +2,6 @@
 #'
 #' This function computes the KING-robust estimator of kinship.
 #'
-#' The last step is not optimised yet, as it does the division of the num by the
-#' den all in memory (on my TODO list...).
-#'
 #' @param X a [bigstatsr::FBM.code256] matrix (as found in the `genotypes`
 #' slot of a [bigsnpr::bigSNP] object).
 #' @param ind.row An optional vector of the row indices that are used.
diff --git a/R/vcf_to_fbm_cpp.R b/R/vcf_to_fbm_cpp.R
index 3a64ccc0..9288d41d 100644
--- a/R/vcf_to_fbm_cpp.R
+++ b/R/vcf_to_fbm_cpp.R
@@ -86,11 +86,6 @@ vcf_to_fbm_cpp <- function(
   # add the new columns
   file_backed_matrix$add_columns(chunk_info$num_loci)
   # fill them in
-  # file_backed_matrix[
-  #   ,
-  #   index_start:(index_start+chunk_info$num_loci-1)
-  # ] <- genotypes_matrix[,1:chunk_info$num_loci]
-
   write_to_FBM(file_backed_matrix,
                allele_counts = genotypes_matrix,
                col_start = index_start-1,
diff --git a/R/vcf_to_fbm_vcfR.R b/R/vcf_to_fbm_vcfR.R
index b83ed3f8..73d38afb 100644
--- a/R/vcf_to_fbm_vcfR.R
+++ b/R/vcf_to_fbm_vcfR.R
@@ -153,7 +153,12 @@ poly_indiv_dosage <- function (x, max_ploidy){
 # get dosages for a genotype x and return them as raw for inclusion in the filebacked matrix
 poly_genotype_dosage <- function (x, max_ploidy){
   if (!is.na(x[1]) && x[1]!="."){
-    as.raw(sum(as.numeric(x)))
+    dosage <- sum(as.numeric(x))
+    if (dosage<(max_ploidy+1)){
+      return(as.raw(dosage))
+    } else{
+      stop("a genotype has more than max_ploidy alleles. We estimate max_ploidy from the first variant in the vcf file, make sure that variant is representative of ploidy (e.g. it is not on a sex chromosome).")
+    }
   } else {
     return(as.raw(max_ploidy+1))
   }
diff --git a/data-raw/ideas/allele_sharing.R b/data-raw/ideas/allele_sharing.R
deleted file mode 100644
index 9a145efa..00000000
--- a/data-raw/ideas/allele_sharing.R
+++ /dev/null
@@ -1,40 +0,0 @@
-library(hierfstat)
-set.seed(123)
-dos<-matrix(sample(0:2,size=10000,replace=TRUE),ncol=100)
-pop=rep(1:5,each=20)
-fs.dosage(dos,pop=pop)
-
-dos_match <- matching(dos)
-
-####
-test_genotypes <- rbind(c(1,1,0,1,1,0),
-                        c(2,1,0,NA,0,0),
-                        c(2,NA,0,0,1,1))
-test_indiv_meta <- data.frame (id=c("a","b","c"),
-                               population = c("pop1","pop1","pop2"))
-test_loci <- data.frame(name=paste0("rs",1:6),
-                        chromosome=paste0("chr",c(1,1,1,1,2,2)),
-                        position=as.integer(c(3,5,65,343,23,456)),
-                        genetic_dist = as.double(rep(0,6)),
-                        allele_ref = c("A","T","C","G","C","T"),
-                        allele_alt = c("T","C", NA,"C","G","A"))
-
-test_gt <- gen_tibble(x = test_genotypes, loci = test_loci, indiv_meta = test_indiv_meta, quiet = TRUE)
-
-
-
-dos_hier_match <- hierfstat::matching(test_genotypes)
-
-test_fbm <- tidypopgen:::.gt_get_bigsnp(test_gt)$genotypes
-test_as <- snp_allele_sharing(test_fbm)
-
-
-
-dos <- test_genotypes
-
-na <- matrix(rep(1,prod(dim(dos))),ncol=ncol(dos))
-ina<-which(is.na(dos))
-na[ina]<-0
-dos[ina]<-1
-Mij <- 1/2 * (1+1/tcrossprod(na) * tcrossprod(dos-1))
-# Mij is the allele sharing matrix
diff --git a/data-raw/ideas/fst_gt_compare_to_hier.R b/data-raw/ideas/fst_gt_compare_to_hier.R
deleted file mode 100644
index 73d4ce09..00000000
--- a/data-raw/ideas/fst_gt_compare_to_hier.R
+++ /dev/null
@@ -1,111 +0,0 @@
-test_genotypes <- rbind(c(1,1,0,1,1,0),
-                        c(2,1,0,NA,0,0),
-                        c(2,NA,0,0,1,1),
-                        c(1,0,0,1,0,0),
-                        c(1,2,0,1,2,1),
-                        c(0,0,0,0,NA,1),
-                        c(0,1,1,0,1,NA))
-test_indiv_meta <- data.frame (id=c("a","b","c","d","e","f","g"),
-                               population = c("pop1","pop1","pop2","pop2","pop1","pop3","pop3"))
-test_loci <- data.frame(name=paste0("rs",1:6),
-                        chromosome=paste0("chr",c(1,1,1,1,2,2)),
-                        position=as.integer(c(3,5,65,343,23,456)),
-                        genetic_dist = as.double(rep(0,6)),
-                        allele_ref = c("A","T","C","G","C","T"),
-                        allele_alt = c("T","C", NA,"C","G","A"))
-
-test_gt <- gen_tibble(x = test_genotypes, loci = test_loci, indiv_meta = test_indiv_meta, quiet = TRUE)
-test_gt <- test_gt %>% group_by(population)
-
-# a developer function to create various count summaries of a population, used to
-# compute more complex statistics (e.g. pairwise fst, etc.).
-# Specifically, we compute:
-# sums_alt sum alleles for alt
-# sums_ref sums alleles for ref
-# n sums of all alleles
-# het_obs observed heterozygosity
-# het_exp expected heterozygosity
-.gt_pop_freqs <- function(.x){
-  counts <- bigstatsr::big_counts( .gt_get_bigsnp(.x)$genotypes,
-                                   ind.row =.gt_bigsnp_rows(.x),
-                                   ind.col = .gt_bigsnp_cols(.x))
-  sums_alt <- apply(counts,2,function(x) x[2]+2*x[3])
-  n <- apply(counts,2,function(x) sum(x[1:3])*2)
-  sums_ref <- n - sums_alt
-  freq_alt <- sums_alt/n
-  freq_ref <- 1- freq_alt
-#  het_count <- apply(counts,2,function(x) x[2]) # This should be simply a row from the counts matrix
-  het_obs <- apply(counts,2,function(x) x[2]/sum(x[1:3]))
-  #het_exp <- 2 * sums_alt/n * sums_ref/n
-  return (list(#sums_alt = sums_alt,
-               #sums_ref = sums_ref,
-               freq_alt = freq_alt,
-               freq_ref = freq_ref,
-               #het_count = het_count,
-               n = n,
-               het_obs = het_obs))
-}
-
-
-pairwise_pop_fst_nei83 <- function(.x, by_locus = FALSE){
-  # get the populations
-  .group_levels = .x %>% group_keys()
-  # create all combinations
-  pairwise_combn <- utils::combn(nrow(.group_levels),2)
-  # vector and matrix to store Fst for total and by locus
-  Fst_tot <- rep(NA_real_, ncol(pairwise_combn))
-  if (by_locus){
-    Fst_locus <- matrix(NA_real_, nrow = count_loci(.x), ncol = ncol(pairwise_combn))
-  }
-  # summarise population frequencies
-  pop_freqs_df <- group_map(.x, .f=~.gt_pop_freqs(.x))
-  for (i_col in seq_len(ncol(pairwise_combn))){
-    pop1 <- pairwise_combn[1,i_col]
-    pop2 <- pairwise_combn[2,i_col]
-
-    n <-cbind(pop_freqs_df[[pop1]]$n,pop_freqs_df[[pop2]]$n)/2
-    sHo <-cbind(pop_freqs_df[[pop1]]$het_obs,pop_freqs_df[[pop2]]$het_obs)
-    mHo <- apply(sHo, 1, mean, na.rm = TRUE)
-    freq_alt <- cbind(pop_freqs_df[[pop1]]$freq_alt, pop_freqs_df[[pop2]]$freq_alt)
-    freq_ref <- cbind(pop_freqs_df[[pop1]]$freq_ref, pop_freqs_df[[pop2]]$freq_ref)
-
-    # sum of squared frequencies
-    sp2 <- freq_alt^2+freq_ref^2
-    Hs <- (1 - sp2 - sHo/2/n)
-    Hs <- n/(n - 1) * Hs
-    np <- apply(n, 1, fun <- function(x) sum(!is.na(x)))
-    # mean sample size over the populations
-    mn <- apply(n, 1, fun <- function(x) {
-      sum(!is.na(x))/sum(1/x[!is.na(x)])
-    })
-    # mean sum of square frequencies
-    msp2 <- apply(sp2, 1, mean, na.rm = TRUE)
-    mp2 <- rowMeans(freq_alt)^2+rowMeans(freq_ref)^2
-    mHs <- mn/(mn - 1) * (1 - msp2 - mHo/2/mn)
-    Ht <- 1 - mp2 + mHs/mn/np - mHo/2/mn/np
-
-    Dst <- Ht - mHs
-    if (by_locus){
-      Fst_locus[,i_col] = Dst/Ht
-    }
-    Fst_tot[i_col]<-mean(Dst)/mean(Ht)
-  }
-  # format nicely the objects
-  group_combinations <- cbind(.group_levels[pairwise_combn[1,],],.group_levels[pairwise_combn[2,],])
-  names(group_combinations) <- c(paste0(dplyr::group_vars(.x),"_1"),paste0(dplyr::group_vars(.x),"_2"))
-  Fst_tot <- data.frame(group_combinations,value=Fst_tot)
-  if (by_locus){
-    rownames(Fst_locus)<-loci_names(.x)
-    colnames(Fst_locus)<- apply(group_combinations,1,function(x)paste(x,collapse = "."))
-  }
-  if (by_locus){
-    return(list(Fst_by_locus = Fst_locus, Fst_total = Fst_tot))
-  } else{
-    return(Fst_tot)
-  }
-}
-
-pairwise_pop_fst_nei83(test_gt, by_locus = TRUE)
-
-
-
diff --git a/data-raw/ideas/fst_hier.R b/data-raw/ideas/fst_hier.R
deleted file mode 100644
index 56ddf1b7..00000000
--- a/data-raw/ideas/fst_hier.R
+++ /dev/null
@@ -1,121 +0,0 @@
-## a full break down of the fst function, which includes Nei Fst
-
-test_genotypes <- rbind(c(1,1,0,1,1,0),
-                        c(2,1,0,NA,0,0),
-                        c(2,NA,0,0,1,1),
-                        c(1,0,0,1,0,0),
-                        c(1,2,0,1,2,1),
-                        c(0,0,0,0,NA,1),
-                        c(0,1,1,0,1,NA))
-test_indiv_meta <- data.frame (id=c("a","b","c","d","e","f","g"),
-                               population = c("pop1","pop1","pop2","pop2","pop1","pop3","pop3"))
-test_loci <- data.frame(name=paste0("rs",1:6),
-                        chromosome=paste0("chr",c(1,1,1,1,2,2)),
-                        position=as.integer(c(3,5,65,343,23,456)),
-                        genetic_dist = as.double(rep(0,6)),
-                        allele_ref = c("A","T","C","G","C","T"),
-                        allele_alt = c("T","C", NA,"C","G","A"))
-
-test_gt <- gen_tibble(x = test_genotypes, loci = test_loci, indiv_meta = test_indiv_meta, quiet = TRUE)
-test_gt <- test_gt %>% group_by(population)
-test_sub_gt <- test_gt %>% filter(population%in%c("pop1","pop2"))
-
-# convert to hierfstat
-test_hier <- gt_as_hierfstat(test_sub_gt)
-
-#hier_basic <- hierfstat::basic.stats(test_hier)
-# note that Fstp, FIS and Dest are not simply averages
-
-#hier_fst_wc <- hierfstat::pairwise.WCfst(test_hier)
-#hier_fst_nei <- hierfstat::pairwise.neifst(test_hier)
-
-########################################################
-data<-test_hier
-diploid = TRUE
-digits=10
-dum.pop<-FALSE
-if (length(table(data[, 1])) < 2){
-  data[dim(data)[1] + 1, 1] <- "DumPop"
-  dum.pop<-TRUE
-}
-if (dim(data)[2] == 2)
-  data <- data.frame(data, dummy.loc = data[, 2])
-loc.names <- names(data)[-1]
-p <- hierfstat:::pop.freq(data, diploid)
-n <- t(hierfstat:::ind.count(data))
-if (diploid) {
-  dum <- hierfstat::getal.b(data[, -1])
-  Ho <- dum[, , 1] == dum[, , 2]
-  sHo <- (1 - t(apply(Ho, 2, fun <- function(x) tapply(x,
-                                                       data[, 1], mean, na.rm = TRUE))))
-  mHo <- apply(sHo, 1, mean, na.rm = TRUE)
-} else {
-  sHo <- NA
-  mHo <- NA
-}
-sp2 <- lapply(p, fun <- function(x) apply(x, 2, fun2 <- function(x) sum(x^2))) # cols sums of squares
-sp2 <- matrix(unlist(sp2), nrow = dim(data[, -1])[2], byrow = TRUE)
-if (diploid) {
-  Hs <- (1 - sp2 - sHo/2/n)
-  Hs <- n/(n - 1) * Hs
-  Fis = 1 - sHo/Hs
-} else {
-  Hs <- n/(n - 1) * (1 - sp2)
-  Fis <- NA
-}
-np <- apply(n, 1, fun <- function(x) sum(!is.na(x)))
-mn <- apply(n, 1, fun <- function(x) {
-  np <- sum(!is.na(x))
-  np/sum(1/x[!is.na(x)])
-})
-msp2 <- apply(sp2, 1, mean, na.rm = TRUE)
-mp <- lapply(p, fun <- function(x) apply(x, 1, mean, na.rm = TRUE))
-mp2 <- unlist(lapply(mp, fun1 <- function(x) sum(x^2)))
-if (diploid) {
-  mHs <- mn/(mn - 1) * (1 - msp2 - mHo/2/mn)
-  Ht <- 1 - mp2 + mHs/mn/np - mHo/2/mn/np
-  mFis = 1 - mHo/mHs
-} else {
-  mHs <- mn/(mn - 1) * (1 - msp2)
-  Ht <- 1 - mp2 + mHs/mn/np
-  mFis <- NA
-}
-Dst <- Ht - mHs
-Dstp <- np/(np - 1) * Dst
-Htp = mHs + Dstp
-Fst = Dst/Ht
-Fstp = Dstp/Htp
-Dest <- Dstp/(1 - mHs)
-res <- data.frame(cbind(mHo, mHs, Ht, Dst, Htp, Dstp, Fst,
-                        Fstp, mFis, Dest))
-names(res) <- c("Ho", "Hs", "Ht", "Dst", "Htp", "Dstp",
-                "Fst", "Fstp", "Fis", "Dest")
-if (diploid) {
-  rownames(sHo) <- loc.names
-  rownames(Fis) <- loc.names
-}
-is.na(res) <- do.call(cbind, lapply(res, is.infinite))
-overall <- apply(res, 2, mean, na.rm = TRUE)
-overall[7] <- overall[4]/overall[3]
-overall[8] <- overall[6]/overall[5]
-overall[9] <- 1 - overall[1]/overall[2]
-overall[10] <- overall[6]/(1 - overall[2])
-names(overall) <- names(res)
-if (!diploid) {
-  overall[c(1, 9)] <- NA
-}
-if(dum.pop){
-  ToBeRemoved<-which(dimnames(Hs)[[2]]=="DumPop")
-  n<-n[,-ToBeRemoved,drop=FALSE]
-  Hs<-Hs[,-ToBeRemoved,drop=FALSE]
-  if (diploid) sHo<-sHo[,-ToBeRemoved,drop=FALSE]
-  Fis<-Fis[,-ToBeRemoved,drop=FALSE]
-  p<-lapply(p,function(x) x[,-ToBeRemoved,drop=FALSE])
-}
-all.res <- list(n.ind.samp = n, pop.freq = lapply(p, round,
-                                                  digits), Ho = round(sHo, digits), Hs = round(Hs, digits),
-                Fis = round(Fis, digits), perloc = round(res, digits),
-                overall = round(overall, digits))
-class(all.res) <- "basic.stats"
-all.res
-
diff --git a/data-raw/ideas/fst_pairwise_wc_nei_something_is_wrong.R b/data-raw/ideas/fst_pairwise_wc_nei_something_is_wrong.R
deleted file mode 100644
index 4fafde10..00000000
--- a/data-raw/ideas/fst_pairwise_wc_nei_something_is_wrong.R
+++ /dev/null
@@ -1,93 +0,0 @@
-
-pairwise_pop_fst_wc <- function(.x, by_locus=FALSE){
-
-  message("this function is not properly tested yet!!!")
-  message("if you have time, test the results against something like hierfstat and create a unit test")
-  # get the populations
-  .group_levels = .x %>% group_keys()
-  # create all combinations
-  pairwise_combn <- utils::combn(nrow(.group_levels),2)
-  # vector and matrix to store Fst for total and by locus
-  Fst_tot <- rep(NA_real_, ncol(pairwise_combn))
-  if (by_locus){
-    Fst_locus <- matrix(NA_real_, nrow = count_loci(.x), ncol = ncol(pairwise_combn))
-  }
-  # summarise population frequencies
-  pop_freqs_df <- group_map(.x, .f=~.gt_pop_freqs(.x))
-  for (i_col in seq_len(ncol(pairwise_combn))){
-    pop1 <- pairwise_combn[1,i_col]
-    pop2 <- pairwise_combn[2,i_col]
-    n1 <- pop_freqs_df[[pop1]]$n
-    n2 <- pop_freqs_df[[pop2]]$n
-    p1 <- pop_freqs_df[[pop1]]$freq_alt
-    p2 <- pop_freqs_df[[pop2]]$freq_alt
-    q1 <- pop_freqs_df[[pop1]]$freq_ref
-    q2 <- pop_freqs_df[[pop2]]$freq_ref
-    a <- (n1*n2)/(n1+n2)
-    b <- (1/(n1+n2-2))*((n1*p1*q1)+(n2*p2*q2))
-    c <- (p1-p2)^2
-    numerator <- 2*a*b
-    denominator <- a*c+(2*a-1)*b
-    if (by_locus){
-      Fst_locus[,i_col] = 1-numerator/denominator
-    }
-    Fst_tot[i_col]<-1-mean(numerator)/mean(denominator)
-  }
-  # format nicely the objects
-  group_combinations <- cbind(.group_levels[pairwise_combn[1,],],.group_levels[pairwise_combn[2,],])
-  names(group_combinations) <- c(paste0(dplyr::group_vars(.x),"_1"),paste0(dplyr::group_vars(.x),"_2"))
-  Fst_tot <- tibble::tibble(group_combinations,value=Fst_tot)
-  if (by_locus){
-    rownames(Fst_locus)<-loci_names(.x)
-    colnames(Fst_locus)<- apply(group_combinations,1,function(x)paste(x,collapse = "."))
-  }
-  if (by_locus){
-    return(list(Fst_by_locus = Fst_locus, Fst = Fst_tot))
-  } else{
-    return(Fst_tot)
-  }
-}
-
-pairwise_pop_fst_nei86 <- function(.x, by_locus=FALSE){
-
-  message("this function is not properly tested yet!!!")
-  message("if you have time, test the results against something like hierfstat and create a unit test")
-  # get the populations
-  .group_levels = .x %>% group_keys()
-  # create all combinations
-  pairwise_combn <- utils::combn(nrow(.group_levels),2)
-  # vector and matrix to store Fst for total and by locus
-  Fst_tot <- rep(NA_real_, ncol(pairwise_combn))
-  if (by_locus){
-    Fst_locus <- matrix(NA_real_, nrow = count_loci(.x), ncol = ncol(pairwise_combn))
-  }
-  # summarise population frequencies
-  pop_freqs_df <- group_map(.x, .f=~.gt_pop_freqs(.x))
-  for (i_col in seq_len(ncol(pairwise_combn))){
-    pop1 <- pairwise_combn[1,i_col]
-    pop2 <- pairwise_combn[2,i_col]
-    p1 <- pop_freqs_df[[pop1]]$freq_alt
-    p2 <- pop_freqs_df[[pop2]]$freq_alt
-    pbar <- (p1+p2)/2
-    numerator <-  ((p1-p2)^2)
-    denominator <-  (2*pbar*(1-pbar))
-    if (by_locus){
-      Fst_locus[,i_col] = numerator/denominator
-    }
-    Fst_tot[i_col]<-mean(numerator)/mean(denominator)
-  }
-  # format nicely the objects
-  group_combinations <- cbind(.group_levels[pairwise_combn[1,],],.group_levels[pairwise_combn[2,],])
-  names(group_combinations) <- c(paste0(dplyr::group_vars(.x),"_1"),paste0(dplyr::group_vars(.x),"_2"))
-  Fst_tot <- tibble::tibble(group_combinations,value=Fst_tot)
-  if (by_locus){
-    rownames(Fst_locus)<-loci_names(.x)
-    colnames(Fst_locus)<- apply(group_combinations,1,function(x)paste(x,collapse = "."))
-  }
-  if (by_locus){
-    return(list(Fst_by_locus = Fst_locus, Fst = Fst_tot))
-  } else{
-    return(Fst_tot)
-  }
-}
-
diff --git a/data-raw/ideas/gt_roh.R b/data-raw/ideas/gt_roh.R
deleted file mode 100644
index cd81bfd6..00000000
--- a/data-raw/ideas/gt_roh.R
+++ /dev/null
@@ -1,103 +0,0 @@
-#' Detect runs of homozygosity based on windows
-#'
-#' This function uses a sliding-window approach to look for runs of homozygosity (or
-#' heterozygosity) in a diploid genome. This function uses the package `selectRUNS`,
-#' which implements an approach equivalent to the one in PLINK.
-#'
-#' @param x a [gen_tibble]
-#' @param window_size the size of sliding window (number of SNP loci) (default = 15)
-#' @param threshold the threshold of overlapping windows of the same state
-#' (homozygous/heterozygous) to call a SNP in a RUN (default = 0.05)
-#' @param min_snp minimum n. of SNP in a RUN (default = 3)
-#' @param heterozygosity should we look for runs of heterozygosity (instead of homozygosity? (default = FALSE)
-#' @param max_opp_window max n. of SNPs of the opposite type (e.g. heterozygous
-#' snps for runs of homozygosity) in the
-#' sliding window (default = 1)
-#' @param max_miss_window max. n. of missing SNP in the sliding window (default = 1)
-#' @param max_gap max distance between consecutive SNP to be still considered a
-#' potential run (default = 10^6 bps)
-#' @param min_length_bps minimum length of run in bps (defaults to 1000 bps = 1 kbps)
-#' @param min_density minimum n. of SNP per kbps (defaults to 0.1 = 1 SNP every 10 kbps)
-#' @param max_opp_run max n. of opposite genotype SNPs in the run (optional)
-#' @param max_miss_run max n. of missing SNPs in the run (optional)
-#'
-#' @details
-#' This function returns a data frame with all runs detected in the dataset.
-#' This data frame can then be written out to a csv file.
-#' The data frame is, in turn, the input for other functions of the detectRUNS package
-#' that create plots and produce statistics from the results
-#' (see plots and statistics functions in this manual,
-#' and/or refer to the detectRUNS vignette).
-#'
-#' If the [`gen_tibble`] is grouped, then the grouping variable is used to fill in the
-#' group table. Otherwise, the group 'column' is filled with the same values as the 'id'
-#' column
-#'
-#' @return A dataframe with RUNs of Homozygosity or Heterozygosity in the analysed dataset.
-#' The returned dataframe contains the following seven columns: "group", "id", "chrom",
-#' "nSNP", "from", "to", "lengthBps" (group: population, breed, case/control etc.;
-#' id: individual identifier; chrom: chromosome on which the run is located;
-#' nSNP: number of SNPs in the run; from: starting position of the run, in bps;
-#' to: end position of the run, in bps; lengthBps: size of the run)
-#' @export
-#'
-#' @examples
-#' # don't run the example
-#' if (FALSE) {
-#' sheep_bed <- systems.file("extdata/sheep.bed", package="tidypopgen")
-#' sheep_gt <- tidypopgen::gen_tibble(bed_path, backingfile = tempfile(), quiet=TRUE)
-#' sheep_roh <- gt_roh(sheep_gt)
-#' }
-
-
-gt_roh <-  function(x, window_size = 15, threshold = 0.05,
-                    min_snp = 3, ROHet = FALSE, max_opp_window = 1, max_miss_window = 1,
-                    max_gap = 10^6, min_length_bps = 1000, min_density = 1/1000,
-                    max_opp_run = NULL, max_miss_run = NULL) {
-  if (!requireNamespace("detectRUNS", quietly = TRUE)) {
-    stop(
-      "to use this function, first install package 'detectRUNS' with\n",
-      "install.packages('detectRUNS')")
-  }
-  # collect all parameters in a variable
-  parameters <- list(windowSize=window_size, threshold=threshold, minSNP=min_snp,
-                     ROHet=ROHet, maxOppWindow=max_opp_window,
-                     maxMissWindow=max_miss_window, maxGap=max_gap, minLengthBps=min_length_bps,
-                     minDensity=min_density, maxOppRun=max_opp_run, maxMissRun=max_miss_run)
-
-  # create a map object
-  map <- show_loci(x) %>% dplyr::select(dplyr::all_of(c("chromosome","name","position"))) %>%
-    dplyr::rename("Chrom"="chromosome", "SNP"="name", "bps"="position") %>% as.data.frame()
-
-  # compute the gaps between snps
-  gaps <- diff(map$bps)
-  # use groups (if defined)
-  if (dplyr::is_grouped_df(sheep_gt)){
-    groups <- x %>% dplyr::group_keys()
-  } else {
-    groups <- x$id
-  }
-
-  # initialize data.frame of results
-  runs_df <- data.frame(group=character(), id=character(), chrom=character(), nSNP=integer(),
-                     from=integer(), to=integer(), lengthBps=integer())
-  # naively process it by row (the parallelism is implemented within individual)
-  # access time is horrible, but I don't think this is the bottleneck
-  # it needs some profiling
-  X <- .gt_get_bigsnp(x)$genotypes # pointer for the FBM
-  for (i in seq_len(nrow(x))){
-    this_genotype <- X[i,]
-    this_indiv <- list(FID=groups[i], IID=x$id[i])
-    # find runs for this individual
-    this_runs <- utils::getFromNamespace("slidingRuns", "detectRUNS")(this_genotype, this_indiv, map, gaps, parameters)
-    # bind this run (if has rows) to others RUNs (if any)
-    runs_df <- rbind(runs_df, this_runs)
-  }
-
-  # fix row names
-  row.names(runs_df) <- NULL
-
-  # return calculated runs (data.frame)
-  return(runs_df)
-
-}
diff --git a/data-raw/ideas/hierfstat_summary_stats.R b/data-raw/ideas/hierfstat_summary_stats.R
deleted file mode 100644
index 7b96187f..00000000
--- a/data-raw/ideas/hierfstat_summary_stats.R
+++ /dev/null
@@ -1,13 +0,0 @@
-# hierfstat notes
-fis.dosage(dos_hier_match,matching=TRUE, pop=test_indiv_meta$population)
-
-fst.dosage(dos_hier_match,matching=TRUE, pop=test_indiv_meta$population)
-
-# these return a vector of n populations plus a global value
-# write a function that returns the n vector, and add a parameter that allows the inclusion
-# of the global value
-
-
-## For Hudson Fst, we could check against:
-# https://www.rdocumentation.org/packages/KRIS/versions/1.1.6/topics/fst.hudson
-# also have a look at fst4pg
diff --git a/data-raw/ideas/pop_within_stats.R b/data-raw/ideas/pop_within_stats.R
deleted file mode 100644
index a753f39f..00000000
--- a/data-raw/ideas/pop_within_stats.R
+++ /dev/null
@@ -1,41 +0,0 @@
-.x <- pinus_gt
-.x <- .x[1:5,] %>% select_loci(1:10)
-# we compute quantities by locus (the can then be combined for individuals later)
-# counts of genotypes
-.counts <- bigstatsr::big_counts(.gt_get_bigsnp(.x)$genotypes,
-                                 ind.row = .gt_bigsnp_rows(.x),
-                                 ind.col = .gt_bigsnp_cols(.x))
-# number of alleles
-.n_k <- rbind(.counts[1,]*2+.counts[2,],
-              .counts[3,]*2+.counts[2,])
-# remove invariant sites
-to_remove <- (.n_k[1,]==0 | .n_k[2,]==0)
-.counts <- .counts[,!to_remove]
-.n_k <- .n_k[,!to_remove]
-
-# total sample size
-.N <- apply(.counts[1:3,],2,sum)
-# frequencies of alleles
-.p <- .n_k[1,]/(.N*2)
-.q <- 1- .p
-# observed heterozygosity
-h_o <- .counts[2,]/.N
-# unbiased within pop gene diversity (h_s)
-h_s <- (1-(.p^2+.q^2))*(2*.N/(2*.N-1))
-
-Fs <- (h_s - h_o)/h_s
-# if it is a grouped tibble with multiple populations
-
-# overall gene diversity
-
-
-
-##########################################
-# convenient functs
-.gt_bigsnp_cols <- function(.x){
-  show_loci(.x)$big_index
-}
-
-.gt_bigsnp_rows <- function(.x){
-  vctrs::vec_data(.x$genotypes)
-}
diff --git a/data-raw/ideas/roh_ideas.R b/data-raw/ideas/roh_ideas.R
deleted file mode 100644
index b5056a9f..00000000
--- a/data-raw/ideas/roh_ideas.R
+++ /dev/null
@@ -1,128 +0,0 @@
-slidingRUNS.run <- function(x, windowSize = 15, threshold = 0.05,
-                            minSNP = 3, ROHet = FALSE, maxOppWindow = 1, maxMissWindow = 1,
-                            maxGap = 10^6, minLengthBps = 1000, minDensity = 1/1000,
-                            maxOppRun = NULL, maxMissRun = NULL) {
-
-  # TODO check that snps are in order (we do that for LD, I think)
-
-
-  # collect all parameters in a variable
-  parameters <- list(windowSize=windowSize, threshold=threshold, minSNP=minSNP,
-                     ROHet=ROHet, maxOppWindow=maxOppWindow,
-                     maxMissWindow=maxMissWindow, maxGap=maxGap, minLengthBps=minLengthBps,
-                     minDensity=minDensity, maxOppRun=maxOppRun, maxMissRun=maxMissRun)
-
-  # calculate gaps
-  # TODO
-  gaps <- diff(mapFile$bps)
-
-  # initialize data.frame of results
-  RUNs <- data.frame(group=character(), id=character(), chrom=character(), nSNP=integer(),
-                     from=integer(), to=integer(), lengthBps=integer())
-
-  chromosomes <- unique(show_loci(x)$chromosome)
-
-  window_runs_chromosome <- function (X, ind, indiv_name, indiv_group = NULL,parameters){
-
-  }
-
-  # pointer to the FBM
-  X <- .gt_get_bigsnp(x)$genotypes
-
-  for (i_chromosome in chromosomes){
-    # get loci for this chromosome
-    rows_this_chrom <- show_loci(x)$big_snp_locus
-
-
-
-
-  }
-
-  # read file line by line (http://stackoverflow.com/questions/4106764/what-is-a-good-way-to-read-line-by-line-in-r)
-  while (length(oneLine <- readLines(conn, n = 1, warn = FALSE)) > 0) {
-
-    # get individual
-     individual <- list(FID=genotype[1], IID=genotype[2])
-
-    # find runs for this individual
-    a_run <- detectRUNS:::slidingRuns(genotype, animal, mapFile, gaps, parameters)
-
-    # bind this run (if has rows) to others RUNs (if any)
-    RUNs <- rbind(RUNs, a_run)
-
-  }
-
-  # close input stream
-  close(conn)
-
-  # fix row names
-  row.names(RUNs) <- NULL
-
-  # return calculated runs (data.frame)
-  return(RUNs)
-}
-
-
-#' Function to detect runs using sliding window approach
-#'
-#' This is a core function not intended to be exported
-#'
-#' @param indGeno vector of 0/1/NAs of individual genotypes (0: homozygote; 1: heterozygote)
-#' @param individual list of group (breed, population, case/control etc.) and ID of individual sample
-#' @param mapFile Plink map file (for SNP position)
-#' @param gaps distance between SNPs
-#' @param parameters list of parameters
-#' @param cpp use cpp functions or not (DEBUG)
-#'
-#' @details
-#' This method uses sliding windows to detect RUNs. Checks on minimum n. of SNP, max n. of opposite and missing genotypes,
-#' max gap between adjacent loci and minimum length of the run are implemented (as in the sliding window method).
-#' Both runs of homozygosity (RoHom) and of heterozygosity (RoHet) can be search for (option ROHet: TRUE/FALSE)
-#' NOTE: this methods is intended to not be exported
-#'
-#' @return A data frame of runs per individual sample
-#' @keywords internal
-#'
-
-slidingRuns <- function(indGeno, individual, mapFile, gaps, parameters, cpp=TRUE) {
-  # get individual and group
-  ind <- as.character(individual$IID)
-  group <- as.character(individual$FID)
-
-  # use sliding windows (check cpp)
-  if (cpp == TRUE) {
-    res <- slidingWindowCpp(indGeno, gaps, parameters$windowSize, step=1,
-                            parameters$maxGap, parameters$ROHet, parameters$maxOppWindow,
-                            parameters$maxMissWindow);
-
-    snpRun <- snpInRunCpp(res$windowStatus, parameters$windowSize, parameters$threshold)
-  } else {
-    res <- slidingWindow(indGeno, gaps, parameters$windowSize, step=1,
-                         parameters$maxGap, parameters$ROHet, parameters$maxOppWindow,
-                         parameters$maxMissWindow);
-
-    snpRun <- snpInRun(res$windowStatus, parameters$windowSize, parameters$threshold)
-  }
-
-
-  # TODO: check arguments names
-  dRUN <- createRUNdf(snpRun, mapFile, parameters$minSNP, parameters$minLengthBps,
-                      parameters$minDensity, res$oppositeAndMissingGenotypes,
-                      parameters$maxOppRun, parameters$maxMissRun)
-
-  # manipulate dRUN to order columns
-  dRUN$id <- rep(ind, nrow(dRUN))
-  dRUN$group <- rep(group, nrow(dRUN))
-  dRUN <- dRUN[,c(7,6,4,3,1,2,5)]
-
-  # debug
-  if(nrow(dRUN) > 0) {
-    message(paste("N. of RUNS for individual", ind, "is:", nrow(dRUN)))
-  } else {
-    message(paste("No RUNs found for animal", ind))
-  }
-
-  #return RUNs to caller
-  return(dRUN)
-}
-
diff --git a/inst/extdata/haploid_first_pop_a.vcf b/inst/extdata/haploid_first_pop_a.vcf
index cfeba725..47a1bfdd 100644
--- a/inst/extdata/haploid_first_pop_a.vcf
+++ b/inst/extdata/haploid_first_pop_a.vcf
@@ -8,7 +8,7 @@
 ##INFO=<ID=PR,Number=0,Type=Flag,Description="Provisional reference allele, may not be based on real reference genome">
 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
 #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	popa_GRC14300079	popa_GRC14300142	popa_GRC14300159	popa_GRC14300286	popa_GRC14300305
-23	51433071	rs5945676	T	.	.	.	PR	GT	0	0	0	0	0
+23	51433071	rs5945676	T	.	.	.	PR	GT	1	0	0	0	0
 1	82154	rs4477212	A	.	.	.	PR	GT	0/0	0/0	0/0	0/0	0/0
 1	752566	rs3094315	A	G	.	.	PR	GT	0/0	0/0	0/1	0/0	0/0
 1	752721	rs3131972	G	A	.	.	PR	GT	0/0	0/0	0/1	0/0	0/1
diff --git a/inst/extdata/haploid_middle_pop_a.vcf b/inst/extdata/haploid_middle_pop_a.vcf
new file mode 100644
index 00000000..58ed3154
--- /dev/null
+++ b/inst/extdata/haploid_middle_pop_a.vcf
@@ -0,0 +1,26 @@
+##fileformat=VCFv4.2
+##fileDate=20240510
+##source=PLINKv1.90
+##contig=<ID=1,length=1346906>
+##contig=<ID=2,length=235832764>
+##contig=<ID=3,length=155913652>
+##contig=<ID=23,length=51433072>
+##INFO=<ID=PR,Number=0,Type=Flag,Description="Provisional reference allele, may not be based on real reference genome">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	popa_GRC14300079	popa_GRC14300142	popa_GRC14300159	popa_GRC14300286	popa_GRC14300305
+1	82154	rs4477212	A	.	.	.	PR	GT	0/0	0/0	0/0	0/0	0/0
+1	752566	rs3094315	A	G	.	.	PR	GT	0/0	0/0	0/1	0/0	0/0
+1	752721	rs3131972	G	A	.	.	PR	GT	0/0	0/0	0/1	0/0	0/1
+1	776546	rs12124819	A	.	.	.	PR	GT	0/0	0/0	0/0	0/0	0/0
+23	51433071	rs5945676	T	C	.	.	PR	GT	1	0	0	0	0
+1	798959	rs11240777	G	A	.	.	PR	GT	0/0	0/0	1/1	0/0	0/1
+1	873558	rs1110052	G	T	.	.	PR	GT	0/1	0/0	0/1	0/0	1/1
+1	934345	rs9697457	G	.	.	.	PR	GT	0/0	0/0	0/0	0/0	0/0
+1	957640	rs6657048	C	.	.	.	PR	GT	0/0	0/0	0/0	0/0	0/0
+1	994391	rs2488991	T	.	.	.	PR	GT	0/0	0/0	0/0	0/0	0/0
+1	1264539	rs307354	C	G	.	.	PR	GT	0/0	0/1	0/0	0/1	0/0
+1	1346905	rs1240719	T	A	.	.	PR	GT	0/0	0/1	0/0	0/0	0/0
+2	61974443	rs2862633	G	.	.	.	PR	GT	0/0	0/0	0/0	0/0	0/0
+2	139008811	rs28569024	T	.	.	.	PR	GT	0/0	0/0	0/0	0/0	0/0
+2	235832763	rs10106770	G	A	.	.	PR	GT	0/1	0/0	0/1	0/1	0/1
+3	155913651	rs11942835	T	C	.	.	PR	GT	0/0	0/1	0/0	0/1	0/0
diff --git a/man/gen_tibble.Rd b/man/gen_tibble.Rd
index f408a707..9ec93fac 100644
--- a/man/gen_tibble.Rd
+++ b/man/gen_tibble.Rd
@@ -48,9 +48,9 @@ directory and have the same file name.
 \item a string giving the path to a RDS file storing a \code{bigSNP} object from
 the \code{bigsnpr} package (usually created with \code{\link[bigsnpr:snp_readBed]{bigsnpr::snp_readBed()}})
 \item a string giving the path to a vcf file. Note that we currently read the whole
-vcf in memory with \code{vcfR}, so only smallish vcf can be imported. Only biallelic
+vcf in memory with \code{vcfR}, so only smallish \emph{VCF} can be imported. Only biallelic
 SNPs will be considered.
-\item a string giving the path to a packedancestry .geno file. The associated
+\item a string giving the path to a \emph{packedancestry} .geno file. The associated
 .ind and .snp files are expected to be in the same directory and share the
 same file name prefix.
 \item a genotype matrix of dosages (0, 1, 2, NA) giving the dosage of the alternate
@@ -70,18 +70,18 @@ locus with only one allele). It defaults to '0' and '.' (the same as PLINK 1.9).
 \item{backingfile}{the path, including the file name without extension,
 for backing files used to store the data (they will be given a .bk
 and .RDS automatically). This is not needed if \code{x} is already an .RDS file.
-If \code{x} is a .BED file and \code{backingfile} is left NULL, the backing file will
+If \code{x} is a .BED or a \emph{VCF} file and \code{backingfile} is left NULL, the backing file will
 be saved in the same directory as the
 bed file, using the same file name but with a different file type (.bk rather
-than .bed). The same logic applies to .vcf files. If \code{x} is a genotype matrix and \code{backingfile} is NULL, then a
+than .bed). If \code{x} is a genotype matrix and \code{backingfile} is NULL, then a
 temporary file will be created (but note that R will delete it at the end of
 the session!)}
 
 \item{quiet}{provide information on the files used to store the data}
 
-\item{parser}{the name of the parser used for VCF, either "cpp" to use
+\item{parser}{the name of the parser used for \emph{VCF}, either "cpp" to use
 a fast C++ parser, or "vcfR" to use the R package \code{vcfR}. The latter is slower
-but more robust; if "cpp" gives error, try using "vcfR" in case your VCF has
+but more robust; if "cpp" gives error, try using "vcfR" in case your \emph{VCF} has
 an unusual structure.}
 
 \item{n_cores}{the number of cores to use for parallel processing}
@@ -114,13 +114,15 @@ here the format
 }
 \details{
 \itemize{
-\item \emph{VCF} files: the fast \code{cpp} parser is used by default. It attempts to establish
-ploidy from the first variant; if that variant is found in a sex chromosome (or mtDNA),
-it will fail. To use the fast parser, change the order of variants so that the first chromosome is an
-autosome using a tool such as \code{vcftools}. Alternatively, the slower but more robust
-\code{vcfR} parser should be used instead. Currently, only biallelic SNPs are supported. If haploid
-variants (e.g. sex chromosomes) are included in the vcf, they are not transformed into homozygous calls.
-Instad, reference alleles will be counted as 0 and alternative alleles will be counted as 1.
+\item \emph{VCF} files: the fast \code{cpp} parser is used by default. Both \code{cpp} and \code{vcfR} parsers
+attempt to establish ploidy from the first variant; if that variant is found in a
+sex chromosome (or mtDNA), the parser will fail with 'Error: a genotype has more
+than max_ploidy alleles...'. To successful import such a \emph{VCF}, change the order of variants
+so that the first chromosome is an autosome using a tool such as \code{vcftools}.
+Currently, only biallelic SNPs are supported. If haploid variants (e.g. sex
+chromosomes) are included in the \emph{VCF}, they are not transformed into homozygous
+calls. Instead, reference alleles will be counted as 0 and alternative alleles
+will be counted as 1.
 \item \emph{packedancestry} files: When loading \emph{packedancestry} files, missing alleles will be converted from
 'X' to NA
 }
diff --git a/man/gt_save_light.Rd b/man/gt_save_light.Rd
index 82707ad1..187a24eb 100644
--- a/man/gt_save_light.Rd
+++ b/man/gt_save_light.Rd
@@ -2,7 +2,7 @@
 % Please edit documentation in R/gt_save.R
 \name{gt_save_light}
 \alias{gt_save_light}
-\title{a light version of gt_save that does not resave the RDS, to be used internally
+\title{a light version of gt_save that does not resave the bigSNP RDS or backing file, to be used internally
 when creating a gen_tibble (if we have just created it, there is not need
 to resave it)}
 \usage{
@@ -21,7 +21,7 @@ the file name and path of the \emph{.gt} file, together with the \emph{.rds}
 and \emph{.bk} files
 }
 \description{
-a light version of gt_save that does not resave the RDS, to be used internally
+a light version of gt_save that does not resave the bigSNP RDS or backing file, to be used internally
 when creating a gen_tibble (if we have just created it, there is not need
 to resave it)
 }
diff --git a/man/snp_king.Rd b/man/snp_king.Rd
index bdb69bb0..7b6aae62 100644
--- a/man/snp_king.Rd
+++ b/man/snp_king.Rd
@@ -26,7 +26,3 @@ specified, all columns are used. Don't use negative indices.}
 \description{
 This function computes the KING-robust estimator of kinship.
 }
-\details{
-The last step is not optimised yet, as it does the division of the num by the
-den all in memory (on my TODO list...).
-}
diff --git a/src/gt_grouped_summaries.cpp b/src/gt_grouped_summaries.cpp
index 414b3a6a..f65751ca 100644
--- a/src/gt_grouped_summaries.cpp
+++ b/src/gt_grouped_summaries.cpp
@@ -39,7 +39,7 @@ ListOf<NumericMatrix> gt_grouped_summaries(Environment BM,
       }
     }
     // now for each group, divide freq by valid_alleles
-    for (size_t group_i = 0; group_i < ngroups; group_i++) {
+    for (int group_i = 0; group_i < ngroups; group_i++) {
       freq(j, group_i) = freq(j, group_i) / valid_alleles(j, group_i);
       ref_freq(j, group_i) = 1-freq(j, group_i);
       heterozygotes(j, group_i) = heterozygotes(j, group_i) / valid_alleles(j, group_i);
diff --git a/src/vcf_parser.cpp b/src/vcf_parser.cpp
index 104514e0..beeaf70c 100644
--- a/src/vcf_parser.cpp
+++ b/src/vcf_parser.cpp
@@ -24,15 +24,35 @@ std::vector<std::string> split(const std::string &s, const std::string &delimite
 
 // Function to count the number of alternate alleles from genotype information
 // missingValue is max_ploidy +1
-int countAlternateAlleles(const std::string &genotype, const int missingValue, const int ploidy) {
+int countAlternateAlleles(const std::string &genotype, const size_t missingValue) {
+  // Rcout<<genotype<<std::endl;
   int count = 0;
   if (genotype[0] == '.'){
     return missingValue;
   }
-  for (int pos_i = 0; pos_i < ploidy; pos_i++){
-    if (genotype[0+pos_i*2] == '1'){
+
+  //For odd positions, we have the allele; for even positions, we might have : for end of genotype field
+  for (size_t pos_i = 0; pos_i < genotype.size(); pos_i++){
+    if (genotype[pos_i] == '1'){
       count++;
     }
+
+    pos_i++;
+    // Rcout<<pos_i<<"\t"<<missingValue<<std::endl;
+    // if (genotype=="1/1"){
+    //   Rcout<<pos_i<<"\t"<<missingValue<<std::endl;
+    // }
+    if (pos_i> ((missingValue-1)*2)){
+      Rcpp::stop("a genotype has more than max_ploidy alleles. We estimate max_ploidy from the first variant in the vcf file, make sure that variant is representative of ploidy (e.g. it is not on a sex chromosome).");
+    }
+
+    //
+    if (pos_i< genotype.size()){ // don't read from string if we got beyond the limit
+      if (genotype[pos_i] == ':'){
+        break;
+      }
+    }
+
   }
   return count;
 }
@@ -87,6 +107,7 @@ List extractAltAlleleCountsFromVCF(std::string filename,
   allele2.reserve(maxLoci);
 
   std::string line;
+  std::string this_chromosome = ""; // chromosome of the current line
   int lociCount = 0; // number of biallelic loci
   int lineCount = 0; // number of valid lines
   int skippedLoci = 0;
@@ -130,7 +151,7 @@ List extractAltAlleleCountsFromVCF(std::string filename,
       // Collect alternate allele counts for all individuals
       if (fields[4][0] != '.'){
         for (unsigned int i = 9; i < fields.size(); ++i) {
-          int count = countAlternateAlleles(fields[i], missingValue, ploidy[i-9]);
+          int count = countAlternateAlleles(fields[i], missingValue);
           allele_counts(i - 9, lociCount) = count;
         }
       } else { // if this is an invariant site
diff --git a/src/write_to_FBM.cpp b/src/write_to_FBM.cpp
index 2ef6e30c..118bbe1f 100644
--- a/src/write_to_FBM.cpp
+++ b/src/write_to_FBM.cpp
@@ -20,7 +20,7 @@ void write_to_FBM (Environment BM,
 
   #pragma omp parallel for num_threads(ncores)
   for (int locus_i = 0; locus_i < n_loci; locus_i++){
-    for (int indiv_i = 0; indiv_i < n_indiv; indiv_i++)
+    for (size_t indiv_i = 0; indiv_i < n_indiv; indiv_i++)
       macc_fbm(indiv_i,col_start+locus_i) = allele_counts(indiv_i, locus_i);
   }
 
diff --git a/tests/testthat/test_gen_tibble.R b/tests/testthat/test_gen_tibble.R
index d4a8ff83..a803c8fc 100644
--- a/tests/testthat/test_gen_tibble.R
+++ b/tests/testthat/test_gen_tibble.R
@@ -1,3 +1,4 @@
+
 # create file
 test_indiv_meta <- data.frame (id=c("a","b","c"),
                                population = c("pop1","pop1","pop2"))
@@ -159,6 +160,72 @@ test_that("gen_tibble identifies wrong loci table columns",{
 })
 
 
+test_that("PROBLEM TEST gentibble from VCF with missingness issue",{
+  ########################
+  # PLINK BED files
+  ########################
+  bed_path <- system.file("extdata/pop_b.bed", package = "tidypopgen")
+  pop_b_gt <- gen_tibble(bed_path, quiet=TRUE, backingfile = tempfile())
+
+  ########################
+  # PLINK VCF files
+  ########################
+  vcf_path <- system.file("extdata/pop_b.vcf", package = "tidypopgen")
+  pop_b_vcf_gt <- gen_tibble(vcf_path, quiet=TRUE,backingfile = tempfile(),
+                             parser="vcfR")
+  expect_true(all.equal(show_genotypes(pop_b_gt),show_genotypes(pop_b_vcf_gt)))
+  # reload it in chunks
+  pop_b_vcf_gt2 <- gen_tibble(vcf_path, quiet=TRUE,backingfile = tempfile(),
+                              chunk_size = 2, parser = "vcfR")
+  expect_true(all.equal(show_genotypes(pop_b_vcf_gt2),show_genotypes(pop_b_vcf_gt)))
+  expect_true(all.equal(show_loci(pop_b_vcf_gt2),show_loci(pop_b_vcf_gt)))
+  expect_true(is.integer(show_loci(pop_b_vcf_gt2)$chr_int))
+
+  # check our cpp parser
+  pop_b_vcf_fast_gt <- gen_tibble(vcf_path, quiet=TRUE,backingfile = tempfile(), parser="cpp")
+
+  # debug
+  # if (!identical(show_genotypes(pop_b_gt),show_genotypes(pop_b_vcf_fast_gt))) {
+  #   stop("genotypes are not equal \n",
+  #   print(show_genotypes(pop_b_gt)),"\n",
+  #   print(show_genotypes(pop_b_vcf_fast_gt)))
+  # }
+  #
+  # for(i in 1:10){
+  #
+  #   genotypes_info <- paste0(
+  #     "Iteration: ", i, "\n",
+  #     "Pop B GT: ", show_genotypes(pop_b_gt), "\n",
+  #     "Pop B VCF Fast GT: ", show_genotypes(pop_b_vcf_fast_gt)
+  #   )
+  #   expect_true(all.equal(show_genotypes(pop_b_gt),show_genotypes(pop_b_vcf_fast_gt)),info = genotypes_info)
+  # }
+  # check loci table against the vcfR parser
+  expect_true(all.equal(show_loci(pop_b_vcf_gt), show_loci(pop_b_vcf_fast_gt)))
+  # reload it in chunks
+  pop_b_vcf_fast_gt2 <- gen_tibble(vcf_path, quiet=TRUE, backingfile = tempfile(),
+                                   chunk_size = 2, parser="cpp")
+
+  # # debug
+  # print(show_genotypes(pop_b_vcf_fast_gt2))
+  # print(show_genotypes(pop_b_vcf_fast_gt))
+  #
+  # for(i in 1:10){
+  #
+  #   genotypes_info <- paste0(
+  #     "Iteration: ", i, "\n",
+  #     "Pop B GT: ", show_genotypes(pop_b_vcf_fast_gt), "\n",
+  #     "Pop B VCF Fast GT: ", show_genotypes(pop_b_vcf_fast_gt2)
+  #   )
+  #
+  #   expect_true(all.equal(show_genotypes(pop_b_vcf_fast_gt2),show_genotypes(pop_b_vcf_fast_gt)), info = genotypes_info)
+  # }
+
+  expect_true(all.equal(show_loci(pop_b_vcf_fast_gt2), show_loci(pop_b_vcf_fast_gt)))
+  expect_true(is.integer(show_loci(pop_b_vcf_fast_gt2)$chr_int))
+
+})
+
 test_that("gen_tibble from files",{
   ########################
   # PLINK BED files
@@ -244,18 +311,6 @@ test_that("gen_tibble from files with missingness",{
   expect_true(all.equal(show_genotypes(pop_b_vcf_gt2),show_genotypes(pop_b_vcf_gt)))
   expect_true(all.equal(show_loci(pop_b_vcf_gt2),show_loci(pop_b_vcf_gt)))
   expect_true(is.integer(show_loci(pop_b_vcf_gt2)$chr_int))
-
-  # check our cpp parser
-  pop_b_vcf_fast_gt <- gen_tibble(vcf_path, quiet=TRUE,backingfile = tempfile(), parser="cpp")
-  expect_true(all.equal(show_genotypes(pop_b_gt),show_genotypes(pop_b_vcf_fast_gt)))
-  # check loci table against the vcfR parser
-  expect_true(all.equal(show_loci(pop_b_vcf_gt), show_loci(pop_b_vcf_fast_gt)))
-  # reload it in chunks
-  pop_b_vcf_fast_gt2 <- gen_tibble(vcf_path, quiet=TRUE, backingfile = tempfile(),
-                                   chunk_size = 2, parser="cpp")
-  expect_true(all.equal(show_genotypes(pop_b_vcf_fast_gt2),show_genotypes(pop_b_vcf_fast_gt)))
-  expect_true(all.equal(show_loci(pop_b_vcf_gt), show_loci(pop_b_vcf_fast_gt)))
-  expect_true(is.integer(show_loci(pop_b_vcf_fast_gt)$chr_int))
 })
 
 test_that("gentibble with packedancestry",{
@@ -576,11 +631,72 @@ test_that("additional vcf tests with larger file",{
 }
 )
 
+
+test_that("vcf's with haploid markers",{
+# read vcf with haploid markers first
+  vcf_path_haploid <- system.file("extdata/haploid_first_pop_a.vcf", package = "tidypopgen")
+  # the cpp parser catches the problem
+  expect_error(pop_a_vcf_gt_hap_cpp <- gen_tibble(vcf_path_haploid, quiet=TRUE,backingfile = tempfile(), parser="cpp"),
+               "a genotype")
+  # vcfR catches the problem
+  expect_error(pop_a_vcf_gt_hap_vcfR <- gen_tibble(vcf_path_haploid, quiet=TRUE,backingfile = tempfile(), parser="vcfR"),
+                  "a genotype")
+ # read vcf with haploid in the middle
+  vcf_path_haploid_middle <- system.file("extdata/haploid_middle_pop_a.vcf", package = "tidypopgen")
+  pop_a_vcf_gt_hapmid_cpp <- gen_tibble(vcf_path_haploid_middle, quiet=TRUE,backingfile = tempfile(), parser="cpp")
+  pop_a_vcf_gt_hapmid_vcfR <- gen_tibble(vcf_path_haploid_middle, quiet=TRUE,backingfile = tempfile(), parser="vcfR")
+
+  # vcfr reads correctly
+  expect_equal(show_genotypes(pop_a_vcf_gt_hapmid_vcfR)[,show_loci(pop_a_vcf_gt_hapmid_vcfR)$chromosome == 23], c(1,0,0,0,0))
+  # cpp reads correctly
+  expect_equal(show_genotypes(pop_a_vcf_gt_hapmid_cpp)[,show_loci(pop_a_vcf_gt_hapmid_cpp)$chromosome == 23], c(1,0,0,0,0))
+
+})
+
+test_that("chr_int is correct",{
+  # unit tests for the casting function
+  chromosome_names <- c("1", "2", NA, "4")
+  expect_true(identical(c(1L,2L,NA,4L), cast_chromosome_to_int(chromosome_names)))
+  chromosome_names <- c("chr1", "chr2", NA, "chr4")
+  expect_true(identical(c(1L,2L,NA,3L), cast_chromosome_to_int(chromosome_names)))
+  chromosome_names <- c("a", "b", NA, "c")
+  expect_true(identical(c(1L,2L,NA,3L), cast_chromosome_to_int(chromosome_names)))
+
+  # a real life example
+
+  # read bed
+  bed_path <- system.file("extdata/pop_b.bed", package = "tidypopgen")
+  pop_b_gt <- gen_tibble(bed_path, quiet=TRUE, backingfile = tempfile())
+
+  # chr_int is correct for bed files
+  expect_equal(show_loci(pop_b_gt)$chr_int, show_loci(pop_b_gt)$chromosome)
+
+  # read ped
+  ped_path <- system.file("extdata/pop_b.ped", package = "tidypopgen")
+  pop_b_ped_gt <- gen_tibble(ped_path, quiet=TRUE, backingfile = tempfile())
+
+  # chr_int is correct for ped files
+  expect_equal(show_loci(pop_b_ped_gt)$chr_int, show_loci(pop_b_ped_gt)$chromosome)
+
+  # read vcf
+  vcf_path <- system.file("extdata/pop_a.vcf", package = "tidypopgen")
+  pop_a_vcfr_gt <- gen_tibble(vcf_path, quiet=TRUE,backingfile = tempfile(), parser = "vcfR")
+  pop_a_cpp_gt <- gen_tibble(vcf_path, quiet=TRUE,backingfile = tempfile(), parser = "cpp")
+
+  # chr_int is correct for vcf
+  expect_equal(show_loci(pop_a_vcfr_gt)$chr_int, as.integer(show_loci(pop_a_vcfr_gt)$chromosome))
+  expect_equal(show_loci(pop_a_cpp_gt)$chr_int, as.integer(show_loci(pop_a_cpp_gt)$chromosome))
+
+})
+
+
 test_that("gen_tibble family.ID from vcf",{
 
+
   # If the gen_tibble has been read in from vcf format, family.ID in the resulting
   # plink files will be the same as sample.ID.
 
+
   ####  With vcfr parser
   vcf_path <- system.file("extdata/pop_b.vcf", package = "tidypopgen")
   pop_b_vcf_gt <- gen_tibble(vcf_path, quiet=TRUE,backingfile = tempfile(),
diff --git a/tests/testthat/test_loci_ld_clump.R b/tests/testthat/test_loci_ld_clump.R
index a62e636d..aa48fceb 100644
--- a/tests/testthat/test_loci_ld_clump.R
+++ b/tests/testthat/test_loci_ld_clump.R
@@ -92,30 +92,31 @@ test_that("loci order",{
 
   test_indiv_meta <- data.frame (id=c("a","b","c"),
                                  population = c("pop1","pop1","pop2"))
-  test_genotypes <- rbind(c(2,2,0,1,1,2),
-                          c(2,2,0,0,0,1),
-                          c(2,2,0,0,1,1))
+  test_genotypes <- rbind(c(1,2,2,0,1,2),
+                          c(0,2,2,0,0,1),
+                          c(1,2,2,0,0,1))
 
-  test_loci <- data.frame(name=paste0("rs",1:6),
-                          chromosome=c("chr2","chr1","chr1","chr1","chr2","chr1"),
-                          position=c(3,5,65,343,23,456),
+  test_loci <- data.frame(name=paste0("rs",c(5,rep(1:4,1),6)),
+                          chromosome=c("chr2","chr1","chr1","chr1","chr1","chr2"),
+                          position=c(23,3,5,65,343,456),
                           genetic_dist = as.double(rep(0,6)),
-                          allele_ref = c("A","T","C","G","C","T"),
-                          allele_alt = c("T","C", NA,"C","G","A"))
+                          allele_ref = c("C","A","T","C","G","T"),
+                          allele_alt = c("G","T","C", NA,"C","A"))
 
-  test_gt <- gen_tibble(x = test_genotypes, indiv_meta = test_indiv_meta, loci = test_loci,
+  test_gt_new_order <- gen_tibble(x = test_genotypes, indiv_meta = test_indiv_meta, loci = test_loci,
                         backingfile = tempfile(), quiet = TRUE)
 
-  #ld
-  expect_error(loci_ld_clump(test_gt, thr_r2 = 0.2), "Your loci are not sorted, try using:")
-  #reorder the loci
-  show_loci(test_gt) <- test_gt %>% show_loci() %>% arrange(chr_int,position)
-
-  show_loci(test_gt)
+  # clumping generates erro
+  expect_error(loci_ld_clump(test_gt_new_order, thr_r2 = 0.2), "Your loci are not sorted, try using:")
+  # reorder the loci
+  show_loci(test_gt_new_order) <- test_gt_new_order %>% show_loci() %>% arrange(chr_int,position)
+  # try again
+  keep_reordered <- loci_ld_clump(test_gt_new_order, thr_r2 = 0.2, return_id=TRUE)
 
+  # calculate the expected result
   keep <- loci_ld_clump(test_gt, thr_r2 = 0.2, return_id=TRUE)
-  expect_true(all.equal(keep, c(5, 1, 2, 3, 4)) == TRUE)
-
+  # compare
+  expect_equal(keep, keep_reordered)
 })
 
 test_that("loci_ld_clump works on a grouped gt",{