diff --git a/DESCRIPTION b/DESCRIPTION
index 3ec31e5c..48f53d78 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: tidypopgen
 Title: Tidy Population Genetics
-Version: 0.0.0.9017
+Version: 0.0.0.9018
 Authors@R: 
     c(person("Evie", "Carter", role = c("aut")),
     person("Andrea", "Manica", email = "am315@cam.ac.uk", 
diff --git a/NAMESPACE b/NAMESPACE
index 9089b068..24f96000 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -24,13 +24,10 @@ S3method(dplyr_row_slice,grouped_gen_tbl)
 S3method(gen_tibble,character)
 S3method(gen_tibble,matrix)
 S3method(group_by,gen_tbl)
-S3method(indiv_het_obs,grouped_df)
 S3method(indiv_het_obs,tbl_df)
 S3method(indiv_het_obs,vctrs_bigSNP)
-S3method(indiv_missingness,grouped_df)
 S3method(indiv_missingness,tbl_df)
 S3method(indiv_missingness,vctrs_bigSNP)
-S3method(indiv_ploidy,grouped_df)
 S3method(indiv_ploidy,tbl_df)
 S3method(indiv_ploidy,vctrs_bigSNP)
 S3method(loci_alt_freq,grouped_df)
@@ -124,6 +121,10 @@ export(pairwise_king)
 export(pairwise_pop_fst)
 export(pop_fis)
 export(pop_fst)
+export(pop_gene_div)
+export(pop_global_stats)
+export(pop_het_exp)
+export(pop_het_obs)
 export(q_matrix)
 export(qc_report_indiv)
 export(qc_report_loci)
diff --git a/R/gen_tibble.R b/R/gen_tibble.R
index b7efe37b..8c276811 100644
--- a/R/gen_tibble.R
+++ b/R/gen_tibble.R
@@ -37,6 +37,7 @@
 #' a fast C++ parser, or "vcfR" to use the R package `vcfR`. The latter is slower
 #' but more robust; if "cpp" gives error, try using "vcfR" in case your VCF has
 #' an unusual structure.
+#' @param n_cores the number of cores to use for parallel processing
 #' @param valid_alleles a vector of valid allele values; it defaults to 'A','T',
 #' 'C' and 'G'.
 #' @param missing_alleles a vector of values in the BIM file/loci dataframe that
@@ -76,6 +77,7 @@ gen_tibble.character <-
   function(x,
            ...,
            parser = c("vcfR","cpp"),
+           n_cores = 1,
            chunk_size = NULL,
            valid_alleles = c("A", "T", "C", "G"),
            missing_alleles = c("0","."),
@@ -107,6 +109,7 @@ gen_tibble.character <-
                        quiet = quiet)
   } else if ((tolower(file_ext(x))=="vcf") || (tolower(file_ext(x))=="gz")){
     return(gen_tibble_vcf(x = x, ..., parser = parser, chunk_size = chunk_size,
+                          n_cores = n_cores,
                    valid_alleles= valid_alleles,
                    missing_alleles= missing_alleles,
                    backingfile = backingfile, quiet = quiet))
diff --git a/R/gt_helper_functions.R b/R/gt_helper_functions.R
index bc58f58c..aa0bc217 100644
--- a/R/gt_helper_functions.R
+++ b/R/gt_helper_functions.R
@@ -10,23 +10,3 @@
 .gt_get_bigsnp<-function(.x){
   attr(.x$genotypes,"bigsnp")
 }
-
-
-# a developer function to create various count summaries of a population, used to
-# compute more complex statistics (e.g. pairwise fst, etc.).
-.gt_pop_freqs <- function(.x){
-  counts <- bigstatsr::big_counts( .gt_get_bigsnp(.x)$genotypes,
-                                   ind.row =.gt_bigsnp_rows(.x),
-                                   ind.col = .gt_bigsnp_cols(.x))
-  sums_alt <- apply(counts,2,function(x) x[2]+2*x[3])
-  n <- apply(counts,2,function(x) sum(x[1:3])*2)
-  sums_ref <- n - sums_alt
-  freq_alt <- sums_alt/n
-  freq_ref <- 1- freq_alt
-  het_obs <- apply(counts,2,function(x) x[2]/sum(x[1:3]))
-  return (list(
-    freq_alt = freq_alt,
-    freq_ref = freq_ref,
-    n = n,
-    het_obs = het_obs))
-}
diff --git a/R/gt_roh_window.R b/R/gt_roh_window.R
index fc4d3d12..ae38bbb1 100644
--- a/R/gt_roh_window.R
+++ b/R/gt_roh_window.R
@@ -42,8 +42,8 @@
 #' @export
 #'
 #' @examples
-#' # don't run the example
-#' if (FALSE) {
+#' # run the example only if we have the package installed
+#' if (requireNamespace("detectRUNS", quietly = TRUE)) {
 #' sheep_ped <- system.file("extdata", "Kijas2016_Sheep_subset.ped",
 #'     package="detectRUNS")
 #' sheep_gt <- tidypopgen::gen_tibble(sheep_ped, backingfile = tempfile(),
diff --git a/R/indiv_het_obs.R b/R/indiv_het_obs.R
index f8a11932..ac4522e0 100644
--- a/R/indiv_het_obs.R
+++ b/R/indiv_het_obs.R
@@ -48,8 +48,8 @@ indiv_het_obs.vctrs_bigSNP <- function(.x, ...){
   this_col_1_na[1,]/(ncol(X)-this_col_1_na[2,])
 }
 
-#' @export
-#' @rdname indiv_het_obs
-indiv_het_obs.grouped_df <- function(.x, ...){
-  .x %>% mutate(indiv_het_obs = indiv_het_obs(.data$genotypes)) %>% summarise(het_obs = mean(indiv_het_obs))
-}
+# #' @export
+# #' @rdname indiv_het_obs
+# indiv_het_obs.grouped_df <- function(.x, ...){
+#   .x %>% mutate(indiv_het_obs = indiv_het_obs(.data$genotypes)) %>% summarise(het_obs = mean(indiv_het_obs))
+# }
diff --git a/R/indiv_missingness.R b/R/indiv_missingness.R
index 91ec948b..a3c6e959 100644
--- a/R/indiv_missingness.R
+++ b/R/indiv_missingness.R
@@ -42,9 +42,9 @@ indiv_missingness.vctrs_bigSNP <- function(.x, as_counts = FALSE, ...){
   row_na
 }
 
-#' @export
-#' @rdname indiv_missingness
-indiv_missingness.grouped_df <- function(.x, as_counts = FALSE, ...){
-  .x %>% mutate(indiv_missingness = indiv_missingness(.data$genotypes, as_counts = as_counts)) %>%
-    summarise(het_obs = mean(indiv_missingness))
-}
+#' #' @export
+#' #' @rdname indiv_missingness
+#' indiv_missingness.grouped_df <- function(.x, as_counts = FALSE, ...){
+#'   .x %>% mutate(indiv_missingness = indiv_missingness(.data$genotypes, as_counts = as_counts)) %>%
+#'     summarise(het_obs = mean(indiv_missingness))
+#' }
diff --git a/R/indiv_ploidy.R b/R/indiv_ploidy.R
index ccedd88c..fc92fab7 100644
--- a/R/indiv_ploidy.R
+++ b/R/indiv_ploidy.R
@@ -31,9 +31,9 @@ indiv_ploidy.vctrs_bigSNP <- function(.x, ...){
   }
 }
 
-#' @export
-#' @rdname indiv_ploidy
-indiv_ploidy.grouped_df <- function(.x, ...){
-  .x %>% mutate(indiv_ploidy = indiv_ploidy(.data$genotypes)) %>%
-    summarise(ploidy = mean(indiv_ploidy))
-}
+#' #' @export
+#' #' @rdname indiv_ploidy
+#' indiv_ploidy.grouped_df <- function(.x, ...){
+#'   .x %>% mutate(indiv_ploidy = indiv_ploidy(.data$genotypes)) %>%
+#'     summarise(ploidy = mean(indiv_ploidy))
+#' }
diff --git a/R/pop_fis.R b/R/pop_fis.R
index 268dcc9a..90275325 100644
--- a/R/pop_fis.R
+++ b/R/pop_fis.R
@@ -1,19 +1,89 @@
 #' Compute population specific FIS
 #'
-#' This function computes population specific FIS (as computed by [hierfstat::fis.dosage()]).
+#' This function computes population specific FIS, using either the approach of Nei 1987 (as computed by [hierfstat::basic.stats()]) or of Weir and Goudet 2017
+#' (as computed by [hierfstat::fis.dosage()]).
+#' @references
+#' Nei M. (1987) Molecular Evolutionary Genetics. Columbia University Press
+#' Weir, BS and Goudet J (2017) A Unified Characterization of Population Structure and Relatedness. Genetics (2017) 206:2085
 #' @param .x a grouped [`gen_tibble`] (as obtained by using [dplyr::group_by()])
-#' @param include_global boolean determining whether, besides the population specific fis, a global
-#' fis should be appended. Note that this will return a vector of n populations plus 1 (the global value)
-#' @param allele_sharing_mat optional, the matrix of Allele Sharing returned by
+#' @param method one of "Nei87" (based on Nei 1987, eqn 7.41) or "WG17" (for Weir and Goudet 2017) to compute FIS
+#' @param by_locus boolean, determining whether FIS should be returned by locus(TRUE),
+#' or as a single genome wide value (FALSE, the default). Note that this is only relevant for "Nei87",
+#' as "WG17" always returns a single value.
+#' @param include_global boolean determining whether, besides the population specific estiamtes, a global
+#' estimate should be appended. Note that this will return a vector of n populations plus 1 (the global value),
+#' or a matrix with n+1 columns if `by_locus=TRUE`.
+#' @param allele_sharing_mat optional and only relevant for "WG17", the matrix of Allele Sharing returned by
 #' [pairwise_allele_sharing()] with `as_matrix=TRUE`. As a number of statistics can be
 #' derived from the Allele Sharing matrix,
 #' it it sometimes more efficient to pre-compute this matrix.
 #' @returns a vector of population specific fis (plus the global value if `include_global=TRUE`)
 #' @export
 
-pop_fis <- function(.x, include_global=FALSE, allele_sharing_mat = NULL){
+pop_fis <- function(.x, method = c("Nei87", "WG17"), by_locus = FALSE, include_global=FALSE, allele_sharing_mat = NULL){
+  method <- match.arg(method)
+  if (method == "Nei87"){
+    if (!is.null(allele_sharing_mat)){
+      stop("allele_sharing_mat not relevant for Nei87")
+    }
+    pop_fis_nei87(.x, by_locus = by_locus, include_global = include_global)
+  } else if (method == "WG17"){
+    if (by_locus){
+      stop("by_locus not implemented for WG17")
+    }
+    pop_fis_wg17(.x, include_global=include_global, allele_sharing_mat = allele_sharing_mat)
+  }
+}
+
+pop_fis_nei87 <- function(.x, by_locus = FALSE, include_global=include_global,
+                          n_cores = bigstatsr::nb_cores()){
+    stopifnot_diploid(.x)
+    # get the populations if it is a grouped gen_tibble
+    if (inherits(.x,"grouped_df")){
+      .group_levels <- .x %>% group_keys()
+      .group_ids <- dplyr::group_indices(.x)-1
+    } else { # create a dummy pop
+      .group_levels = tibble(population="pop")
+      .group_ids <- rep(0,nrow(.x))
+    }
+
+    # summarise population frequencies
+    pop_freqs_df <- gt_grouped_summaries(.gt_get_bigsnp(.x)$genotypes,
+                                         rowInd = .gt_bigsnp_rows(.x),
+                                         colInd = .gt_bigsnp_cols(.x),
+                                         groupIds = .group_ids,
+                                         ngroups = nrow(.group_levels),
+                                         ncores = n_cores)
+    sHo <-pop_freqs_df$het_obs
+    mHo <- apply(sHo, 1, mean, na.rm = TRUE)
+    n <-pop_freqs_df$n/2
+    # sum of squared frequencies
+    sp2 <- pop_freqs_df$freq_alt^2+pop_freqs_df$freq_ref^2
+    Hs <- (1 - sp2 - sHo/2/n)
+    Hs <- n/(n - 1) * Hs
+    Fis = 1 - sHo/Hs
+
+    colnames(Fis)<- .group_levels %>% dplyr::pull(1)
+    if (by_locus){
+      if (include_global){
+        global <- (.x %>% pop_global_stats(by_locus = TRUE, n_cores = n_cores))$Fis
+        Fis <- cbind(Fis, global)
+      }
+    } else {
+      Fis <- colMeans(Fis, na.rm = TRUE)
+      if (include_global){
+         global <- (.x %>% pop_global_stats(by_locus = FALSE, n_cores = n_cores))["Fis"]
+         names(global) <- "global"
+         Fis <- c(Fis, global)
+      }
+    }
+    return(Fis)
+}
+
+
+pop_fis_wg17 <- function(.x, include_global=FALSE, allele_sharing_mat = NULL){
   if (!inherits(.x,"grouped_df")){
-    stop (".x should be a grouped df")
+    stop (".x should be a grouped gen_tibble")
   }
   if (is.null(allele_sharing_mat)){
     allele_sharing_mat <- pairwise_allele_sharing(.x, as_matrix = TRUE)
diff --git a/R/pop_fst.R b/R/pop_fst.R
index 99e84280..34758a8e 100644
--- a/R/pop_fst.R
+++ b/R/pop_fst.R
@@ -1,6 +1,8 @@
 #' Compute population specific Fst
 #'
-#' This function computes population specific Fst (as computed by [hierfstat::fst.dosage()]).
+#' This function computes population specific Fst, using the approach in Weir and Goudet 2017
+#' (as computed by [hierfstat::fst.dosage()]).
+#' @references Weir, BS and Goudet J (2017) A Unified Characterization of Population Structure and Relatedness. Genetics (2017) 206:2085
 #' @param .x a grouped [`gen_tibble`] (as obtained by using [dplyr::group_by()])
 #' @param include_global boolean determining whether, besides the population specific Fst, a global
 #' Fst should be appended. Note that this will return a vector of n populations plus 1 (the global value)
@@ -12,7 +14,7 @@
 
 pop_fst <- function(.x, include_global=FALSE, allele_sharing_mat = NULL){
   if (!inherits(.x,"grouped_df")){
-    stop (".x should be a grouped df")
+    stop (".x should be a grouped gen_tibble")
   }
   if (is.null(allele_sharing_mat)){
     allele_sharing_mat <- pairwise_allele_sharing(.x, as_matrix = TRUE)
diff --git a/R/pop_global_stats.R b/R/pop_global_stats.R
new file mode 100644
index 00000000..9941c0a5
--- /dev/null
+++ b/R/pop_global_stats.R
@@ -0,0 +1,150 @@
+#' Compute basic population global statistics
+#'
+#' This function computes basic population global statistics, following the notation in Nei 1987 (which in turn is based on Nei and Chesser 1983):
+#' - observed heterozygosity ( \eqn{\hat{h}_o}, column header `Ho`)
+#' - expected heterozygosity, also known as gene diversity ( \eqn{\hat{h}_s}, `Hs`)
+#' - total heterozygosity ( \eqn{\hat{h}_t}, `Ht`)
+#' - genetic differentiation between subpopulations (\eqn{D_{st}}, `Dst`)
+#' - corrected total population diversity (\eqn{h'_t}, `Htp`)
+#' - corrected genetic differentiation between subpopulations (\eqn{D'_{st}}, `Dstp`)
+#' - \eqn{\hat{F}_{ST}} (column header, `Fst`)
+#' - corrected \eqn{\hat{F'}_{ST}} (column header `Fstp`)
+#' - \eqn{\hat{F}_{IS}} (column header, `Fis`)
+#' - Jost's \eqn{\hat{D}} (column header, `Dest`)
+#'
+#' @details We use the notation of Nei 1987. That notation was for loci with \eqn{m} alleles, but in our case we only have two alleles, so `m=2`.
+#' - Within population observed heterozygosity \eqn{\hat{h}_o} for a locus with \eqn{m} alleles is defined as:\cr
+#'  \eqn{\hat{h}_o= 1-\sum_{k=1}^{s} \sum_{i=1}^{m}  \hat{X}_{kii}/s}\cr
+#' where\cr
+#' \eqn{\hat{X}_{kii}} represents the proportion of homozygote \eqn{i} in the sample for the \eqn{k}th population and\cr
+#' \eqn{s} the number of populations,\cr
+#' following equation 7.38 in Nei(1987) on pp.164.\cr
+#'
+#' - Within population expected heterozygosity (gene diversity)  \eqn{\hat{h}_s} for a locus with \eqn{m} alleles is defined as:\cr
+#'  \eqn{\hat{h}_s=(\tilde{n}/(\tilde{n}-1))[1-\sum_{i=1}^{m}\bar{\hat{x}_i^2}-\hat{h}_o/2\tilde{n}]}\cr
+#'  where \cr \eqn{\tilde{n}=s/\sum_k 1/n_k} (i.e the harmonic mean of \eqn{n_k}) and\cr
+#'  \eqn{\bar{\hat{x}_i^2}=\sum_k \hat{x}_{ki}^2/s}\cr
+#' following equation 7.39 in Nei(1987) on pp.164.
+#'
+#' - Total heterozygosity (total gene diversity) \eqn{\hat{h}_t} for a locus with \eqn{m} alleles is defined as:\cr
+#' \eqn{\hat{h}_t = 1-\sum_{i=1}^{m} \bar{\hat{x}_i^2} + \hat{h}_s/(\tilde{n}s) - \hat{h}_o/(2\tilde{n}s)}\cr
+#' where \cr
+#'  \eqn{\hat{x}_i=\sum_k \hat{x}_{ki}/s}\cr
+#' following equation 7.40 in Nei(1987) on pp.164.\cr
+#'
+#' - The amount of gene diversity among samples \eqn{D_{ST}} is defined as:\cr
+#' \eqn{D_{ST} = \hat{h}_t - \hat{h}_s}\cr
+#' following the equation provided in the text at the top of page 165 in Nei(1987).
+#'
+#' - The corrected amount of gene diversity among samples \eqn{D'_{ST}} is defined as:\cr
+#' \eqn{D'_{ST} = (s/(s-1))D'_{ST}}\cr
+#' following the equation provided in the text at the top of page 165 in Nei(1987).
+#'
+#' - Total corrected heterozygosity (total gene diversity) \eqn{\hat{h}_t} is defined as:\cr
+#' \eqn{\hat{h'}_t = \hat{h}_s + D'_{ST}}\cr
+#' following the equation provided in the text at the top of page 165 in Nei(1987).
+#' - \eqn{\hat{F}_{IS}} is defined as:\cr
+#' \eqn{\hat{F}_{IS} = 1 - \hat{h}_o/\hat{h}_s}\cr
+#' following equation 7.41 in Nei(1987) on pp.164.\cr
+#'
+#' - \eqn{\hat{F}_{ST}} is defined as:\cr
+#' \eqn{\hat{F}_{ST} = 1 - \hat{h}_s/\hat{h}_t = D_{ST}/\hat{h}_t}\cr
+#' following equation 7.43 in Nei(1987) on pp.165.\cr
+#'
+#' - \eqn{\hat{F'}_{ST}} is defined as:\cr
+#' \eqn{\hat{F'}_{ST} = D'_{ST}/\hat{h'}_t}\cr
+#' following the explanation provided in the text at the top of page 165 in Nei(1987).
+#'
+#' - Jost's \eqn{\hat{D}} is defined as:\cr
+#' \eqn{\hat{D} = (s/(s-1))((\hat{h'}_t-\hat{h}_s)/(1-\hat{h}_s))}\cr
+#' as defined by Jost(2008)
+#'
+#' All these statistics are first computed by locus, and then averaged across loci (including any
+#' monorphic locus) to obtain genome-wide values. The function uses the same algorithm as
+#' [hierfstat::basic.stats()] but is optimized for speed and memory usage.
+#'
+#' @references Nei M, Chesser R (1983) Estimation of fixation indexes and gene diversities. Annals of Human Genetics, 47, 253-259.
+#' Nei M. (1987) Molecular Evolutionary Genetics. Columbia University Press, pp. 164-165.
+#' Jost L (2008) GST and its relatives do not measure differentiation. Molecular Ecology, 17, 4015-4026.
+#'
+#' @param .x a [`gen_tibble`] (usually grouped, as obtained by using [dplyr::group_by()]; use on
+#' a single population will return a number of quantities as NA/NaN)
+#' @param by_locus boolean, determining whether the statistics should be returned by locus(TRUE),
+#' or as a single genome wide value (FALSE, the default).
+#' @param n_cores number of cores to be used, it defaults to [bigstatsr::nb_cores()]
+#' @returns a tibble of population statistics, with populations as rows and statistics as columns
+#' @export
+
+
+# this code is adapted from hierfstat::basic.stats by Jerome Goudet
+pop_global_stats <- function(.x, by_locus = FALSE, n_cores = bigstatsr::nb_cores()){
+  stopifnot_diploid(.x)
+  # get the populations if it is a grouped gen_tibble
+  if (inherits(.x,"grouped_df")){
+    .group_levels <- .x %>% group_keys()
+    .group_ids <- dplyr::group_indices(.x)-1
+  } else { # create a dummy pop
+    .group_levels = tibble(population="pop")
+    .group_ids <- rep(0,nrow(.x))
+  }
+
+  # summarise population frequencies
+  pop_freqs_df <- gt_grouped_summaries(.gt_get_bigsnp(.x)$genotypes,
+                                       rowInd = .gt_bigsnp_rows(.x),
+                                       colInd = .gt_bigsnp_cols(.x),
+                                       groupIds = .group_ids,
+                                       ngroups = nrow(.group_levels),
+                                       ncores = n_cores)
+
+  # get the number of individuals
+  n <-pop_freqs_df$n/2
+  # is this correct?
+  sHo <-pop_freqs_df$het_obs
+  mHo <- apply(sHo, 1, mean, na.rm = TRUE)
+
+  # sum of squared frequencies
+  sp2 <- pop_freqs_df$freq_alt^2+pop_freqs_df$freq_ref^2
+#  Hs <- (1 - sp2 - sHo/2/n)
+#  Hs <- n/(n - 1) * Hs
+#  Fis = 1 - sHo/Hs
+
+  np <- apply(n, 1, fun <- function(x) sum(!is.na(x)))
+  # mean sample size over the populations
+  mn <- apply(n, 1, fun <- function(x) {
+    sum(!is.na(x))/sum(1/x[!is.na(x)])
+  })
+  # mean sum of square frequencies
+  msp2 <- apply(sp2, 1, mean, na.rm = TRUE)
+  mp2 <- rowMeans(pop_freqs_df$freq_alt)^2+rowMeans(pop_freqs_df$freq_ref)^2
+  mHs <- mn/(mn - 1) * (1 - msp2 - mHo/2/mn)
+  Ht <- 1 - mp2 + mHs/mn/np - mHo/2/mn/np
+  mFis = 1 - mHo/mHs
+
+  Dst <- Ht - mHs
+  Dstp <- np/(np - 1) * Dst
+  Htp = mHs + Dstp
+  Fst = Dst/Ht
+  Fstp = Dstp/Htp
+  Dest <- Dstp/(1 - mHs)
+  res <- data.frame(cbind(mHo, mHs, Ht, Dst, Htp, Dstp, Fst,
+                            Fstp, mFis, Dest))
+  names(res) <- c("Ho", "Hs", "Ht", "Dst", "Htp", "Dstp",
+                    "Fst", "Fstp", "Fis", "Dest")
+
+  if (by_locus){
+    return(res)
+  } else {
+    # overall summaries
+    is.na(res) <- do.call(cbind, lapply(res, is.infinite))
+    overall <- apply(res, 2, mean, na.rm = TRUE)
+    overall[7] <- overall[4]/overall[3]
+    overall[8] <- overall[6]/overall[5]
+    overall[9] <- 1 - overall[1]/overall[2]
+    overall[10] <- overall[6]/(1 - overall[2])
+    names(overall) <- names(res)
+    return(overall)
+  }
+}
+
+
+
diff --git a/R/pop_het_exp.R b/R/pop_het_exp.R
new file mode 100644
index 00000000..83aa5a7b
--- /dev/null
+++ b/R/pop_het_exp.R
@@ -0,0 +1,78 @@
+#' Compute the population expected heterozygosity
+#'
+#' This function computes expected population heterozygosity (also
+#' referred to as gene diversity, to avoid the potentially misleading use of the term "expected" in this context), using the formula of Nei (1987).
+#'
+#' @details Within population expected heterozygosity (gene diversity)  \eqn{\hat{h}_s} for a locus with \eqn{m} alleles is defined as:\cr
+#'  \eqn{\hat{h}_s=\tilde{n}/(\tilde{n}-1)[1-\sum_{i}^{m}\bar{\hat{x}_i^2}-\hat{h}_o/2\tilde{n}]}\cr
+#'
+#'  where \cr \eqn{\tilde{n}=s/\sum_k 1/n_k} (i.e the harmonic mean of \eqn{n_k}) and\cr
+#'  \eqn{\bar{\hat{x}_i^2}=\sum_k \hat{x}_{ki}^2/s}\cr
+#' following equation 7.39 in Nei(1987) on pp.164. In our specific case, there are only two alleles, so \eqn{m=2}. \eqn{\hat{h}_s} at
+#' the genome level for each population is simply the mean of the locus estimates for each population.
+#'
+#' @references Nei M. (1987) Molecular Evolutionary Genetics. Columbia University Press
+#'
+#' @param .x a [`gen_tibble`] (usually grouped, as obtained by using [dplyr::group_by()], otherwise the full tibble
+#' will be considered as belonging to a single population).
+#' @param by_locus boolean, determining whether Hs should be returned by locus(TRUE),
+#' or as a single genome wide value (FALSE, the default).
+#' @param include_global boolean determining whether, besides the population specific estiamtes, a global
+#' estimate should be appended. Note that this will return a vector of n populations plus 1 (the global value),
+#' or a matrix with n+1 columns if `by_locus=TRUE`.
+#' @param n_cores number of cores to be used, it defaults to [bigstatsr::nb_cores()]
+#' @returns a vector of mean population observed heterozygosities (if `by_locus=FALSE`), or a matrix of
+#' estimates by locus (rows are loci, columns are populations, `by_locus=TRUE`)
+#' @export
+
+
+# adapted from hierfstat
+pop_het_exp <- function(.x, by_locus = FALSE, include_global = FALSE, n_cores = bigstatsr::nb_cores()){
+  stopifnot_diploid(.x)
+  # get the populations if it is a grouped gen_tibble
+  if (inherits(.x,"grouped_df")){
+    .group_levels <- .x %>% group_keys()
+    .group_ids <- dplyr::group_indices(.x)-1
+  } else { # create a dummy pop
+    .group_levels = tibble(population="pop")
+    .group_ids <- rep(0,nrow(.x))
+  }
+
+  # summarise population frequencies
+  pop_freqs_df <- gt_grouped_summaries(.gt_get_bigsnp(.x)$genotypes,
+                                       rowInd = .gt_bigsnp_rows(.x),
+                                       colInd = .gt_bigsnp_cols(.x),
+                                       groupIds = .group_ids,
+                                       ngroups = nrow(.group_levels),
+                                       ncores = n_cores)
+  sHo <-pop_freqs_df$het_obs
+  mHo <- apply(sHo, 1, mean, na.rm = TRUE)
+  n <-pop_freqs_df$n/2
+  # sum of squared frequencies
+  sp2 <- pop_freqs_df$freq_alt^2+pop_freqs_df$freq_ref^2
+  Hs <- (1 - sp2 - sHo/2/n)
+  Hs <- n/(n - 1) * Hs
+
+  colnames(Hs)<- .group_levels %>% dplyr::pull(1)
+  if (include_global){
+    global <- (.x %>% pop_global_stats(by_locus = TRUE, n_cores = n_cores))$Hs
+    Hs <- cbind(Hs, global)
+  }
+
+  if (by_locus){
+    return(Hs)
+  } else {
+    return(colMeans(Hs, na.rm = TRUE))
+  }
+}
+
+
+# alias for gene diversity
+#' @export
+#' @rdname pop_het_exp
+pop_gene_div <- function (.x, by_locus = FALSE, include_global = FALSE, n_cores = bigstatsr::nb_cores()){
+  pop_het_exp(.x=.x,
+              by_locus = by_locus,
+              include_global = include_global,
+              n_cores = n_cores)
+}
diff --git a/R/pop_het_obs.R b/R/pop_het_obs.R
new file mode 100644
index 00000000..08b64408
--- /dev/null
+++ b/R/pop_het_obs.R
@@ -0,0 +1,61 @@
+#' Compute the population observed heterozygosity
+#'
+#' This function computes population heterozygosity, using the formula of Nei (1987).
+#'
+#' @details Within population observed heterozygosity \eqn{\hat{h}_o} for a locus with \eqn{m} alleles is defined as:\cr
+#'  \eqn{\hat{h}_o= 1-\sum_{k=1}^{s} \sum_{i=1}^{m}  \hat{X}_{kii}/s}\cr
+#' where\cr
+#' \eqn{\hat{X}_{kii}} represents the proportion of homozygote \eqn{i} in the sample for the \eqn{k}th population and\cr
+#' \eqn{s} the number of populations,\cr
+#' following equation 7.38 in Nei(1987) on pp.164. In our specific case, there are only two alleles, so \eqn{m=2}. For
+#' population specific estimates, the sum is done over a single value of \eqn{k}. \eqn{\hat{h}_o} at
+#' the genome level is simply the mean of the locus estimates.
+#'
+#' @references Nei M. (1987) Molecular Evolutionary Genetics. Columbia University Press
+#'
+#' @param .x a [`gen_tibble`] (usually grouped, as obtained by using [dplyr::group_by()], otherwise the full tibble
+#' will be considered as belonging to a single population).
+#' @param by_locus boolean, determining whether Ho should be returned by locus(TRUE),
+#' or as a single genome wide value (FALSE, the default).
+#' @param include_global boolean determining whether, besides the population specific estiamtes, a global
+#' estimate should be appended. Note that this will return a vector of n populations plus 1 (the global value),
+#' or a matrix with n+1 columns if `by_locus=TRUE`.
+#' @param n_cores number of cores to be used, it defaults to [bigstatsr::nb_cores()]
+#' @returns a vector of mean population observed heterozygosities (if `by_locus=FALSE`), or a matrix of
+#' estimates by locus (rows are loci, columns are populations, `by_locus=TRUE`)
+#' @export
+
+
+# adapted from hierfstat
+pop_het_obs <- function(.x, by_locus = FALSE, include_global = FALSE, n_cores = bigstatsr::nb_cores()){
+  stopifnot_diploid(.x)
+  # get the populations if it is a grouped gen_tibble
+  if (inherits(.x,"grouped_df")){
+    .group_levels <- .x %>% group_keys()
+    .group_ids <- dplyr::group_indices(.x)-1
+  } else { # create a dummy pop
+    .group_levels = tibble(population="pop")
+    .group_ids <- rep(0,nrow(.x))
+  }
+
+  # summarise population frequencies
+  pop_freqs_df <- gt_grouped_summaries(.gt_get_bigsnp(.x)$genotypes,
+                                       rowInd = .gt_bigsnp_rows(.x),
+                                       colInd = .gt_bigsnp_cols(.x),
+                                       groupIds = .group_ids,
+                                       ngroups = nrow(.group_levels),
+                                       ncores = n_cores)
+
+  Ho <- pop_freqs_df$het_obs
+  colnames(Ho)<- .group_levels %>% dplyr::pull(1)
+  if (include_global){
+    global <- (.x %>% pop_global_stats(by_locus = TRUE, n_cores = n_cores))$Ho
+    Ho <- cbind(Ho, global)
+  }
+
+  if (by_locus){
+    return(Ho)
+  } else {
+    return(colMeans(Ho, na.rm = TRUE))
+  }
+}
diff --git a/R/utils.R b/R/utils.R
index ed5e5f70..20a6b2da 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -24,6 +24,9 @@ tidy_dist_matrix<- function(mat){
 
 # stop if not diploid
 stopifnot_diploid <- function(x){
+  if (inherits(x, "gen_tbl")){
+    x <- x$genotypes
+  }
   if (attr(x,"ploidy")!=2){
     stop("this function only works on diploid data")
   }
diff --git a/R/vcf_to_fbm_cpp.R b/R/vcf_to_fbm_cpp.R
index cdf1f60f..c1ee1167 100644
--- a/R/vcf_to_fbm_cpp.R
+++ b/R/vcf_to_fbm_cpp.R
@@ -4,8 +4,9 @@
 #' This should work even for large vcf files that would not fit in memory.
 #'
 #' @param vcf_path the path to the vcf
-#' @param chunk_size the chunk size to use on the vcf when loading the file
-#' @param backingfile the name of the file to use as the backing file
+#' @param chunk_size the chunk size to use on the vcf when loading the file. If NULL, a best guess will be made.
+#' @param backingfile the name of the file to use as the backing file for the FBM. If NULL, the vcf path will be used.
+#' @param n_cores the number of cores to use when reading the vcf file. Default is 1.
 #' @return path to the resulting rds file as class bigSNP.
 #' @keywords internal
 
@@ -13,10 +14,9 @@ vcf_to_fbm_cpp <- function(
     vcf_path,
     chunk_size = NULL,
     backingfile = NULL,
+    n_cores = 1,
     quiet=FALSE) {
 
-  n_cores <- 3
-
   if (is.null(backingfile)){
     backingfile <- vcf_path
     backingfile <- sub("\\.vcf.gz$", "", backingfile)
diff --git a/README.md b/README.md
index bb11a444..bf8e1727 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,7 @@
 
 <!-- badges: start -->
 [![R-CMD-check](https://github.com/EvolEcolGroup/tidypopgen/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/EvolEcolGroup/tidypopgen/actions/workflows/R-CMD-check.yaml)
+[![codecov](https://codecov.io/gh/EvolEcolGroup/tidypopgen/branch/main/graph/badge.svg?token=KLOzxJoLBO)](https://app.codecov.io/gh/EvolEcolGroup/tidypopgen)
 <!-- badges: end -->
 
 The goal of `tidypopgen` is to provide a tidy grammar of population genetics, facilitating 
diff --git a/man/gen_tibble.Rd b/man/gen_tibble.Rd
index b86be381..b1dd9fd5 100644
--- a/man/gen_tibble.Rd
+++ b/man/gen_tibble.Rd
@@ -19,6 +19,7 @@ gen_tibble(
   x,
   ...,
   parser = c("vcfR", "cpp"),
+  n_cores = 1,
   chunk_size = NULL,
   valid_alleles = c("A", "T", "C", "G"),
   missing_alleles = c("0", "."),
@@ -83,6 +84,8 @@ a fast C++ parser, or "vcfR" to use the R package \code{vcfR}. The latter is slo
 but more robust; if "cpp" gives error, try using "vcfR" in case your VCF has
 an unusual structure.}
 
+\item{n_cores}{the number of cores to use for parallel processing}
+
 \item{chunk_size}{the number of loci or individuals (depending on the format)
 processed at a time (currently used
 if \code{x} is a vcf or packedancestry file)}
diff --git a/man/gt_roh_window.Rd b/man/gt_roh_window.Rd
index a569ba0b..441d8801 100644
--- a/man/gt_roh_window.Rd
+++ b/man/gt_roh_window.Rd
@@ -74,8 +74,8 @@ group table. Otherwise, the group 'column' is filled with the same values as the
 column
 }
 \examples{
-# don't run the example
-if (FALSE) {
+# run the example only if we have the package installed
+if (requireNamespace("detectRUNS", quietly = TRUE)) {
 sheep_ped <- system.file("extdata", "Kijas2016_Sheep_subset.ped",
     package="detectRUNS")
 sheep_gt <- tidypopgen::gen_tibble(sheep_ped, backingfile = tempfile(),
diff --git a/man/indiv_het_obs.Rd b/man/indiv_het_obs.Rd
index ff5da3dd..4c6c6cfb 100644
--- a/man/indiv_het_obs.Rd
+++ b/man/indiv_het_obs.Rd
@@ -4,7 +4,6 @@
 \alias{indiv_het_obs}
 \alias{indiv_het_obs.tbl_df}
 \alias{indiv_het_obs.vctrs_bigSNP}
-\alias{indiv_het_obs.grouped_df}
 \title{Estimate individual observed heterozygosity}
 \usage{
 indiv_het_obs(.x, ...)
@@ -12,8 +11,6 @@ indiv_het_obs(.x, ...)
 \method{indiv_het_obs}{tbl_df}(.x, ...)
 
 \method{indiv_het_obs}{vctrs_bigSNP}(.x, ...)
-
-\method{indiv_het_obs}{grouped_df}(.x, ...)
 }
 \arguments{
 \item{.x}{a vector of class \code{vctrs_bigSNP} (usually the \code{genotype} column of
diff --git a/man/indiv_missingness.Rd b/man/indiv_missingness.Rd
index 140f610f..ebec8198 100644
--- a/man/indiv_missingness.Rd
+++ b/man/indiv_missingness.Rd
@@ -4,7 +4,6 @@
 \alias{indiv_missingness}
 \alias{indiv_missingness.tbl_df}
 \alias{indiv_missingness.vctrs_bigSNP}
-\alias{indiv_missingness.grouped_df}
 \title{Estimate individual missingness}
 \usage{
 indiv_missingness(.x, as_counts = FALSE, ...)
@@ -12,8 +11,6 @@ indiv_missingness(.x, as_counts = FALSE, ...)
 \method{indiv_missingness}{tbl_df}(.x, as_counts = FALSE, ...)
 
 \method{indiv_missingness}{vctrs_bigSNP}(.x, as_counts = FALSE, ...)
-
-\method{indiv_missingness}{grouped_df}(.x, as_counts = FALSE, ...)
 }
 \arguments{
 \item{.x}{a vector of class \code{vctrs_bigSNP} (usually the \code{genotype} column of
diff --git a/man/indiv_ploidy.Rd b/man/indiv_ploidy.Rd
index f67ba4e3..7210b697 100644
--- a/man/indiv_ploidy.Rd
+++ b/man/indiv_ploidy.Rd
@@ -4,7 +4,6 @@
 \alias{indiv_ploidy}
 \alias{indiv_ploidy.tbl_df}
 \alias{indiv_ploidy.vctrs_bigSNP}
-\alias{indiv_ploidy.grouped_df}
 \title{Return individual ploidy}
 \usage{
 indiv_ploidy(.x, ...)
@@ -12,8 +11,6 @@ indiv_ploidy(.x, ...)
 \method{indiv_ploidy}{tbl_df}(.x, ...)
 
 \method{indiv_ploidy}{vctrs_bigSNP}(.x, ...)
-
-\method{indiv_ploidy}{grouped_df}(.x, ...)
 }
 \arguments{
 \item{.x}{a \code{\link{gen_tibble}}, or a vector of class \code{vctrs_bigSNP} (usually the \code{genotype} column of
diff --git a/man/pop_fis.Rd b/man/pop_fis.Rd
index 0c41ba4c..5a162189 100644
--- a/man/pop_fis.Rd
+++ b/man/pop_fis.Rd
@@ -4,15 +4,28 @@
 \alias{pop_fis}
 \title{Compute population specific FIS}
 \usage{
-pop_fis(.x, include_global = FALSE, allele_sharing_mat = NULL)
+pop_fis(
+  .x,
+  method = c("Nei87", "WG17"),
+  by_locus = FALSE,
+  include_global = FALSE,
+  allele_sharing_mat = NULL
+)
 }
 \arguments{
 \item{.x}{a grouped \code{\link{gen_tibble}} (as obtained by using \code{\link[dplyr:group_by]{dplyr::group_by()}})}
 
-\item{include_global}{boolean determining whether, besides the population specific fis, a global
-fis should be appended. Note that this will return a vector of n populations plus 1 (the global value)}
+\item{method}{one of "Nei87" (based on Nei 1987, eqn 7.41) or "WG17" (for Weir and Goudet 2017) to compute FIS}
 
-\item{allele_sharing_mat}{optional, the matrix of Allele Sharing returned by
+\item{by_locus}{boolean, determining whether FIS should be returned by locus(TRUE),
+or as a single genome wide value (FALSE, the default). Note that this is only relevant for "Nei87",
+as "WG17" always returns a single value.}
+
+\item{include_global}{boolean determining whether, besides the population specific estiamtes, a global
+estimate should be appended. Note that this will return a vector of n populations plus 1 (the global value),
+or a matrix with n+1 columns if \code{by_locus=TRUE}.}
+
+\item{allele_sharing_mat}{optional and only relevant for "WG17", the matrix of Allele Sharing returned by
 \code{\link[=pairwise_allele_sharing]{pairwise_allele_sharing()}} with \code{as_matrix=TRUE}. As a number of statistics can be
 derived from the Allele Sharing matrix,
 it it sometimes more efficient to pre-compute this matrix.}
@@ -21,5 +34,10 @@ it it sometimes more efficient to pre-compute this matrix.}
 a vector of population specific fis (plus the global value if \code{include_global=TRUE})
 }
 \description{
-This function computes population specific FIS (as computed by \code{\link[hierfstat:fs.dosage]{hierfstat::fis.dosage()}}).
+This function computes population specific FIS, using either the approach of Nei 1987 (as computed by \code{\link[hierfstat:basic.stats]{hierfstat::basic.stats()}}) or of Weir and Goudet 2017
+(as computed by \code{\link[hierfstat:fs.dosage]{hierfstat::fis.dosage()}}).
+}
+\references{
+Nei M. (1987) Molecular Evolutionary Genetics. Columbia University Press
+Weir, BS and Goudet J (2017) A Unified Characterization of Population Structure and Relatedness. Genetics (2017) 206:2085
 }
diff --git a/man/pop_fst.Rd b/man/pop_fst.Rd
index f606c26c..a432aff1 100644
--- a/man/pop_fst.Rd
+++ b/man/pop_fst.Rd
@@ -20,5 +20,9 @@ derived from the Allele Sharing matrix,}
 a vector of population specific Fst (plus the global value if \code{include_global=TRUE})
 }
 \description{
-This function computes population specific Fst (as computed by \code{\link[hierfstat:fs.dosage]{hierfstat::fst.dosage()}}).
+This function computes population specific Fst, using the approach in Weir and Goudet 2017
+(as computed by \code{\link[hierfstat:fs.dosage]{hierfstat::fst.dosage()}}).
+}
+\references{
+Weir, BS and Goudet J (2017) A Unified Characterization of Population Structure and Relatedness. Genetics (2017) 206:2085
 }
diff --git a/man/pop_global_stats.Rd b/man/pop_global_stats.Rd
new file mode 100644
index 00000000..5449f96a
--- /dev/null
+++ b/man/pop_global_stats.Rd
@@ -0,0 +1,86 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/pop_global_stats.R
+\name{pop_global_stats}
+\alias{pop_global_stats}
+\title{Compute basic population global statistics}
+\usage{
+pop_global_stats(.x, by_locus = FALSE, n_cores = bigstatsr::nb_cores())
+}
+\arguments{
+\item{.x}{a \code{\link{gen_tibble}} (usually grouped, as obtained by using \code{\link[dplyr:group_by]{dplyr::group_by()}}; use on
+a single population will return a number of quantities as NA/NaN)}
+
+\item{by_locus}{boolean, determining whether the statistics should be returned by locus(TRUE),
+or as a single genome wide value (FALSE, the default).}
+
+\item{n_cores}{number of cores to be used, it defaults to \code{\link[bigstatsr:reexports]{bigstatsr::nb_cores()}}}
+}
+\value{
+a tibble of population statistics, with populations as rows and statistics as columns
+}
+\description{
+This function computes basic population global statistics, following the notation in Nei 1987 (which in turn is based on Nei and Chesser 1983):
+\itemize{
+\item observed heterozygosity ( \eqn{\hat{h}_o}, column header \code{Ho})
+\item expected heterozygosity, also known as gene diversity ( \eqn{\hat{h}_s}, \code{Hs})
+\item total heterozygosity ( \eqn{\hat{h}_t}, \code{Ht})
+\item genetic differentiation between subpopulations (\eqn{D_{st}}, \code{Dst})
+\item corrected total population diversity (\eqn{h'_t}, \code{Htp})
+\item corrected genetic differentiation between subpopulations (\eqn{D'_{st}}, \code{Dstp})
+\item \eqn{\hat{F}_{ST}} (column header, \code{Fst})
+\item corrected \eqn{\hat{F'}_{ST}} (column header \code{Fstp})
+\item \eqn{\hat{F}_{IS}} (column header, \code{Fis})
+\item Jost's \eqn{\hat{D}} (column header, \code{Dest})
+}
+}
+\details{
+We use the notation of Nei 1987. That notation was for loci with \eqn{m} alleles, but in our case we only have two alleles, so \code{m=2}.
+\itemize{
+\item Within population observed heterozygosity \eqn{\hat{h}_o} for a locus with \eqn{m} alleles is defined as:\cr
+\eqn{\hat{h}_o= 1-\sum_{k=1}^{s} \sum_{i=1}^{m}  \hat{X}_{kii}/s}\cr
+where\cr
+\eqn{\hat{X}_{kii}} represents the proportion of homozygote \eqn{i} in the sample for the \eqn{k}th population and\cr
+\eqn{s} the number of populations,\cr
+following equation 7.38 in Nei(1987) on pp.164.\cr
+\item Within population expected heterozygosity (gene diversity)  \eqn{\hat{h}_s} for a locus with \eqn{m} alleles is defined as:\cr
+\eqn{\hat{h}_s=(\tilde{n}/(\tilde{n}-1))[1-\sum_{i=1}^{m}\bar{\hat{x}_i^2}-\hat{h}_o/2\tilde{n}]}\cr
+where \cr \eqn{\tilde{n}=s/\sum_k 1/n_k} (i.e the harmonic mean of \eqn{n_k}) and\cr
+\eqn{\bar{\hat{x}_i^2}=\sum_k \hat{x}_{ki}^2/s}\cr
+following equation 7.39 in Nei(1987) on pp.164.
+\item Total heterozygosity (total gene diversity) \eqn{\hat{h}_t} for a locus with \eqn{m} alleles is defined as:\cr
+\eqn{\hat{h}_t = 1-\sum_{i=1}^{m} \bar{\hat{x}_i^2} + \hat{h}_s/(\tilde{n}s) - \hat{h}_o/(2\tilde{n}s)}\cr
+where \cr
+\eqn{\hat{x}_i=\sum_k \hat{x}_{ki}/s}\cr
+following equation 7.40 in Nei(1987) on pp.164.\cr
+\item The amount of gene diversity among samples \eqn{D_{ST}} is defined as:\cr
+\eqn{D_{ST} = \hat{h}_t - \hat{h}_s}\cr
+following the equation provided in the text at the top of page 165 in Nei(1987).
+\item The corrected amount of gene diversity among samples \eqn{D'_{ST}} is defined as:\cr
+\eqn{D'_{ST} = (s/(s-1))D'_{ST}}\cr
+following the equation provided in the text at the top of page 165 in Nei(1987).
+\item Total corrected heterozygosity (total gene diversity) \eqn{\hat{h}_t} is defined as:\cr
+\eqn{\hat{h'}_t = \hat{h}_s + D'_{ST}}\cr
+following the equation provided in the text at the top of page 165 in Nei(1987).
+\item \eqn{\hat{F}_{IS}} is defined as:\cr
+\eqn{\hat{F}_{IS} = 1 - \hat{h}_o/\hat{h}_s}\cr
+following equation 7.41 in Nei(1987) on pp.164.\cr
+\item \eqn{\hat{F}_{ST}} is defined as:\cr
+\eqn{\hat{F}_{ST} = 1 - \hat{h}_s/\hat{h}_t = D_{ST}/\hat{h}_t}\cr
+following equation 7.43 in Nei(1987) on pp.165.\cr
+\item \eqn{\hat{F'}_{ST}} is defined as:\cr
+\eqn{\hat{F'}_{ST} = D'_{ST}/\hat{h'}_t}\cr
+following the explanation provided in the text at the top of page 165 in Nei(1987).
+\item Jost's \eqn{\hat{D}} is defined as:\cr
+\eqn{\hat{D} = (s/(s-1))((\hat{h'}_t-\hat{h}_s)/(1-\hat{h}_s))}\cr
+as defined by Jost(2008)
+}
+
+All these statistics are first computed by locus, and then averaged across loci (including any
+monorphic locus) to obtain genome-wide values. The function uses the same algorithm as
+\code{\link[hierfstat:basic.stats]{hierfstat::basic.stats()}} but is optimized for speed and memory usage.
+}
+\references{
+Nei M, Chesser R (1983) Estimation of fixation indexes and gene diversities. Annals of Human Genetics, 47, 253-259.
+Nei M. (1987) Molecular Evolutionary Genetics. Columbia University Press, pp. 164-165.
+Jost L (2008) GST and its relatives do not measure differentiation. Molecular Ecology, 17, 4015-4026.
+}
diff --git a/man/pop_het_exp.Rd b/man/pop_het_exp.Rd
new file mode 100644
index 00000000..3d674f53
--- /dev/null
+++ b/man/pop_het_exp.Rd
@@ -0,0 +1,54 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/pop_het_exp.R
+\name{pop_het_exp}
+\alias{pop_het_exp}
+\alias{pop_gene_div}
+\title{Compute the population expected heterozygosity}
+\usage{
+pop_het_exp(
+  .x,
+  by_locus = FALSE,
+  include_global = FALSE,
+  n_cores = bigstatsr::nb_cores()
+)
+
+pop_gene_div(
+  .x,
+  by_locus = FALSE,
+  include_global = FALSE,
+  n_cores = bigstatsr::nb_cores()
+)
+}
+\arguments{
+\item{.x}{a \code{\link{gen_tibble}} (usually grouped, as obtained by using \code{\link[dplyr:group_by]{dplyr::group_by()}}, otherwise the full tibble
+will be considered as belonging to a single population).}
+
+\item{by_locus}{boolean, determining whether Hs should be returned by locus(TRUE),
+or as a single genome wide value (FALSE, the default).}
+
+\item{include_global}{boolean determining whether, besides the population specific estiamtes, a global
+estimate should be appended. Note that this will return a vector of n populations plus 1 (the global value),
+or a matrix with n+1 columns if \code{by_locus=TRUE}.}
+
+\item{n_cores}{number of cores to be used, it defaults to \code{\link[bigstatsr:reexports]{bigstatsr::nb_cores()}}}
+}
+\value{
+a vector of mean population observed heterozygosities (if \code{by_locus=FALSE}), or a matrix of
+estimates by locus (rows are loci, columns are populations, \code{by_locus=TRUE})
+}
+\description{
+This function computes expected population heterozygosity (also
+referred to as gene diversity, to avoid the potentially misleading use of the term "expected" in this context), using the formula of Nei (1987).
+}
+\details{
+Within population expected heterozygosity (gene diversity)  \eqn{\hat{h}_s} for a locus with \eqn{m} alleles is defined as:\cr
+\eqn{\hat{h}_s=\tilde{n}/(\tilde{n}-1)[1-\sum_{i}^{m}\bar{\hat{x}_i^2}-\hat{h}_o/2\tilde{n}]}\cr
+
+where \cr \eqn{\tilde{n}=s/\sum_k 1/n_k} (i.e the harmonic mean of \eqn{n_k}) and\cr
+\eqn{\bar{\hat{x}_i^2}=\sum_k \hat{x}_{ki}^2/s}\cr
+following equation 7.39 in Nei(1987) on pp.164. In our specific case, there are only two alleles, so \eqn{m=2}. \eqn{\hat{h}_s} at
+the genome level for each population is simply the mean of the locus estimates for each population.
+}
+\references{
+Nei M. (1987) Molecular Evolutionary Genetics. Columbia University Press
+}
diff --git a/man/pop_het_obs.Rd b/man/pop_het_obs.Rd
new file mode 100644
index 00000000..c5848578
--- /dev/null
+++ b/man/pop_het_obs.Rd
@@ -0,0 +1,46 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/pop_het_obs.R
+\name{pop_het_obs}
+\alias{pop_het_obs}
+\title{Compute the population observed heterozygosity}
+\usage{
+pop_het_obs(
+  .x,
+  by_locus = FALSE,
+  include_global = FALSE,
+  n_cores = bigstatsr::nb_cores()
+)
+}
+\arguments{
+\item{.x}{a \code{\link{gen_tibble}} (usually grouped, as obtained by using \code{\link[dplyr:group_by]{dplyr::group_by()}}, otherwise the full tibble
+will be considered as belonging to a single population).}
+
+\item{by_locus}{boolean, determining whether Ho should be returned by locus(TRUE),
+or as a single genome wide value (FALSE, the default).}
+
+\item{include_global}{boolean determining whether, besides the population specific estiamtes, a global
+estimate should be appended. Note that this will return a vector of n populations plus 1 (the global value),
+or a matrix with n+1 columns if \code{by_locus=TRUE}.}
+
+\item{n_cores}{number of cores to be used, it defaults to \code{\link[bigstatsr:reexports]{bigstatsr::nb_cores()}}}
+}
+\value{
+a vector of mean population observed heterozygosities (if \code{by_locus=FALSE}), or a matrix of
+estimates by locus (rows are loci, columns are populations, \code{by_locus=TRUE})
+}
+\description{
+This function computes population heterozygosity, using the formula of Nei (1987).
+}
+\details{
+Within population observed heterozygosity \eqn{\hat{h}_o} for a locus with \eqn{m} alleles is defined as:\cr
+\eqn{\hat{h}_o= 1-\sum_{k=1}^{s} \sum_{i=1}^{m}  \hat{X}_{kii}/s}\cr
+where\cr
+\eqn{\hat{X}_{kii}} represents the proportion of homozygote \eqn{i} in the sample for the \eqn{k}th population and\cr
+\eqn{s} the number of populations,\cr
+following equation 7.38 in Nei(1987) on pp.164. In our specific case, there are only two alleles, so \eqn{m=2}. For
+population specific estimates, the sum is done over a single value of \eqn{k}. \eqn{\hat{h}_o} at
+the genome level is simply the mean of the locus estimates.
+}
+\references{
+Nei M. (1987) Molecular Evolutionary Genetics. Columbia University Press
+}
diff --git a/man/vcf_to_fbm_cpp.Rd b/man/vcf_to_fbm_cpp.Rd
index 3e5878a4..ee02aeb9 100644
--- a/man/vcf_to_fbm_cpp.Rd
+++ b/man/vcf_to_fbm_cpp.Rd
@@ -4,14 +4,22 @@
 \alias{vcf_to_fbm_cpp}
 \title{Convert vcf to FBM.}
 \usage{
-vcf_to_fbm_cpp(vcf_path, chunk_size = NULL, backingfile = NULL, quiet = FALSE)
+vcf_to_fbm_cpp(
+  vcf_path,
+  chunk_size = NULL,
+  backingfile = NULL,
+  n_cores = 1,
+  quiet = FALSE
+)
 }
 \arguments{
 \item{vcf_path}{the path to the vcf}
 
-\item{chunk_size}{the chunk size to use on the vcf when loading the file}
+\item{chunk_size}{the chunk size to use on the vcf when loading the file. If NULL, a best guess will be made.}
 
-\item{backingfile}{the name of the file to use as the backing file}
+\item{backingfile}{the name of the file to use as the backing file for the FBM. If NULL, the vcf path will be used.}
+
+\item{n_cores}{the number of cores to use when reading the vcf file. Default is 1.}
 }
 \value{
 path to the resulting rds file as class bigSNP.
diff --git a/tests/testthat/test_gen_tibble.R b/tests/testthat/test_gen_tibble.R
index a76451bb..51c55148 100644
--- a/tests/testthat/test_gen_tibble.R
+++ b/tests/testthat/test_gen_tibble.R
@@ -190,7 +190,7 @@ test_that("gen_tibble from files",{
   expect_true(all.equal(show_genotypes(pop_a_gt),show_genotypes(pop_a_vcf_gt)))
   # reload it in chunks
   pop_a_vcf_gt2 <- gen_tibble(vcf_path, quiet=TRUE,backingfile = tempfile(),
-                              chunk_size = 2)
+                              chunk_size = 2, parser="vcfR")
   expect_true(all.equal(show_genotypes(pop_a_vcf_gt2),show_genotypes(pop_a_vcf_gt)))
   expect_true(all.equal(show_loci(pop_a_vcf_gt2),show_loci(pop_a_vcf_gt)))
   expect_true(is.integer(show_loci(pop_a_vcf_gt)$chr_int))
@@ -235,11 +235,12 @@ test_that("gen_tibble from files with missingness",{
   # PLINK VCF files
   ########################
   vcf_path <- system.file("extdata/pop_b.vcf", package = "tidypopgen")
-  pop_b_vcf_gt <- gen_tibble(vcf_path, quiet=TRUE,backingfile = tempfile())
+  pop_b_vcf_gt <- gen_tibble(vcf_path, quiet=TRUE,backingfile = tempfile(),
+                             parser="vcfR")
   expect_true(all.equal(show_genotypes(pop_b_gt),show_genotypes(pop_b_vcf_gt)))
   # reload it in chunks
   pop_b_vcf_gt2 <- gen_tibble(vcf_path, quiet=TRUE,backingfile = tempfile(),
-                              chunk_size = 2)
+                              chunk_size = 2, parser = "vcfR")
   expect_true(all.equal(show_genotypes(pop_b_vcf_gt2),show_genotypes(pop_b_vcf_gt)))
   expect_true(all.equal(show_loci(pop_b_vcf_gt2),show_loci(pop_b_vcf_gt)))
   expect_true(is.integer(show_loci(pop_b_vcf_gt2)$chr_int))
@@ -549,11 +550,11 @@ test_that("gt without population is valid",{
   gt_load(file_names[1])
 
   # will fail in functions that require grouping
-  expect_error(pop_fis(test_gt), ".x should be a grouped df")
+  expect_error(pop_fst(test_gt), ".x should be a grouped gen_tibble")
   # functions that require grouping work after adding groups
   test_gt$groups <- c("A","A","B")
   test_gt <- test_gt %>% group_by(groups)
-  expect_equal(unname(pop_fis(test_gt)), c(-0.25, NaN))
+  expect_equal(unname(pop_fst(test_gt)), c(-0, NaN))
 
   # gen_tbl from vcf does not have population automatically
   vcf_path <- system.file("extdata/pop_b.vcf", package = "tidypopgen")
@@ -562,6 +563,21 @@ test_that("gt without population is valid",{
 })
 
 
+test_that("additional vcf tests with larger file",{
+  vcf_path <- system.file("/extdata/anolis/punctatus_t70_s10_n46_filtered.recode.vcf.gz",
+                          package = "tidypopgen")
+  anole_gt <- gen_tibble(vcf_path, quiet = TRUE, parser = "cpp", backingfile = tempfile("anolis_"))
+  anole_gt_vcfr <- gen_tibble(vcf_path, quiet = TRUE, parser = "vcfR",
+                              backingfile = tempfile("anolis_"))
+  expect_true(all.equal(show_genotypes(anole_gt),show_genotypes(anole_gt_vcfr)))
+  anole_gt2 <- gen_tibble(vcf_path, quiet = TRUE, parser = "cpp", backingfile = tempfile("anolis_"),
+                          chunk_size = 1000, n_cores = 2)
+  expect_true(all.equal(show_genotypes(anole_gt2),show_genotypes(anole_gt_vcfr)))
+}
+)
+
+
+
 # Windows prevents the deletion of the backing file. It's something to do with the memory mapping
 # library used by bigsnpr
 # test_that("on error, we remove the old files",{
diff --git a/tests/testthat/test_pop_basic_stats.R b/tests/testthat/test_pop_basic_stats.R
new file mode 100644
index 00000000..78af26a1
--- /dev/null
+++ b/tests/testthat/test_pop_basic_stats.R
@@ -0,0 +1,88 @@
+test_genotypes <- rbind(c(1,1,0,1,1,0),
+                        c(2,1,0,NA,0,0),
+                        c(2,NA,0,0,1,1),
+                        c(1,0,0,1,0,0),
+                        c(1,2,0,1,2,1),
+                        c(0,0,0,0,NA,1),
+                        c(0,1,1,0,1,NA))
+test_indiv_meta <- data.frame (id=c("a","b","c","d","e","f","g"),
+                               population = c("pop1","pop1","pop2","pop2","pop1","pop3","pop3"))
+test_loci <- data.frame(name=paste0("rs",1:6),
+                        chromosome=paste0("chr",c(1,1,1,1,2,2)),
+                        position=as.integer(c(3,5,65,343,23,456)),
+                        genetic_dist = as.integer(rep(0,6)),
+                        allele_ref = c("A","T","C","G","C","T"),
+                        allele_alt = c("T","C", NA,"C","G","A"))
+
+test_gt <- gen_tibble(x = test_genotypes, loci = test_loci, indiv_meta = test_indiv_meta, quiet = TRUE)
+
+test_that("pop_* basic stats functions work correctly",{
+  test_gt <- test_gt %>% dplyr::group_by(population)
+  test_hier <- gt_as_hierfstat(test_gt)
+  basic_hier <- hierfstat::basic.stats(test_hier)
+
+  # observed heterozygosity by locus
+  test_het_obs <- test_gt %>% pop_het_obs(by_locus = TRUE)
+  expect_true(all.equal(basic_hier$Ho, round(test_het_obs, 4), check.attributes=FALSE))
+  # overall (mean of by locus values)
+  test_het_obs <- test_gt %>% pop_het_obs(by_locus = FALSE)
+  expect_true(all.equal(colMeans(basic_hier$Ho, na.rm=TRUE), test_het_obs, check.attributes=FALSE))
+  # check the overall value
+  test_het_obs <- test_gt %>% pop_het_obs(by_locus = TRUE, include_global = TRUE)
+  expect_true(all.equal(round(test_het_obs[,4], 4), basic_hier$perloc$Ho, check.attributes=FALSE))
+  test_het_obs <- test_gt %>% pop_het_obs(by_locus = FALSE, include_global = TRUE)
+  expect_true(all.equal(round(test_het_obs[4], 4), basic_hier$overall["Ho"], check.attributes=FALSE))
+
+  # test expected heterozygosity by locus
+  test_het_exp <- test_gt %>% pop_het_exp(by_locus = TRUE)
+  expect_true(all.equal(basic_hier$Hs, round(test_het_exp, 4), check.attributes=FALSE))
+  # overall (mean of by locus values)
+  test_het_exp <- test_gt %>% pop_het_exp(by_locus = FALSE)
+  expect_true(all.equal(colMeans(basic_hier$Hs, na.rm=TRUE), test_het_exp, check.attributes=FALSE))
+  # check the overall value
+  test_het_exp <- test_gt %>% pop_het_exp(by_locus = TRUE, include_global = TRUE)
+  expect_true(all.equal(round(test_het_exp[,4], 4), basic_hier$perloc$Hs, check.attributes=FALSE))
+  test_het_exp <- test_gt %>% pop_het_exp(by_locus = FALSE, include_global = TRUE)
+  expect_true(all.equal(round(test_het_exp[4], 4), basic_hier$overall["Hs"], check.attributes=FALSE))
+
+  # test Fis by locus
+  test_fis <- test_gt %>% pop_fis(by_locus = TRUE, method= "Nei87")
+  expect_true(all.equal(basic_hier$Fis, round(test_fis, 4), check.attributes=FALSE))
+  # overall (mean of by locus values)
+  test_fis <- test_gt %>% pop_fis(by_locus = FALSE)
+  expect_true(all.equal(round(colMeans(basic_hier$Fis, na.rm=TRUE),4), round(test_fis,4), check.attributes=FALSE))
+  # check the overall value
+  test_fis <- test_gt %>% pop_fis(by_locus = TRUE, include_global = TRUE)
+  expect_true(all.equal(round(test_fis[,4], 4), basic_hier$perloc$Fis, check.attributes=FALSE))
+  test_fis <- test_gt %>% pop_fis(by_locus = FALSE, include_global = TRUE)
+  expect_true(all.equal(round(test_fis[4], 4), basic_hier$overall["Fis"], check.attributes=FALSE))
+
+  # test global stats by locus
+  test_global_stats <- test_gt %>% pop_global_stats(by_locus = TRUE)
+  expect_true(all.equal(basic_hier$perloc, round(test_global_stats, 4), check.attributes=FALSE))
+  # test global stats overall
+  test_global_stats <- test_gt %>% pop_global_stats(by_locus = FALSE)
+  expect_true(all.equal(basic_hier$overall, round(test_global_stats, 4), check.attributes=FALSE))
+
+  # test Fis errors for incorrect parameters for Nei87
+  expect_error(test_gt %>% pop_fis(by_locus = TRUE, method= "Nei87", allele_sharing_mat = matrix(1, nrow=7, ncol=7)),
+               "allele_sharing_mat not relevant for Nei87")
+  # test Fis errors for incorrect parameters for WG17
+  expect_error(test_gt %>% pop_fis(by_locus = TRUE, method= "WG17", allele_sharing_mat = matrix(1, nrow=7, ncol=7)),
+               "by_locus not implemented for WG17")
+  # test alias for pop_het_exp
+  expect_identical(test_het_exp, test_gt %>% pop_gene_div(by_locus = FALSE, include_global = TRUE))
+
+})
+
+test_that("pop_* basic stats work on a single population",{
+  test_het_obs <- test_gt %>% pop_het_obs(by_locus = TRUE)
+  expect_true(ncol(test_het_obs) == 1) # just one column, as we only have one population
+  test_fis <- test_gt %>% pop_fis(by_locus = TRUE)
+  expect_true(ncol(test_fis) == 1) # just one column, as we only have one population
+  test_het_exp <- test_gt %>% pop_het_exp(by_locus = TRUE)
+  expect_true(ncol(test_het_exp) == 1) # just one column, as we only have one population
+  test_global_stats <- test_gt %>% pop_global_stats(by_locus = TRUE)
+  expect_true(all(is.nan(test_global_stats$Fstp))) # Fstp is not defined for a single population
+
+})
diff --git a/tests/testthat/test_pop_fis_fst.R b/tests/testthat/test_pop_fis_fst.R
index 017dfdfd..48b910f9 100644
--- a/tests/testthat/test_pop_fis_fst.R
+++ b/tests/testthat/test_pop_fis_fst.R
@@ -17,19 +17,25 @@ test_loci <- data.frame(name=paste0("rs",1:6),
 test_gt <- gen_tibble(x = test_genotypes, loci = test_loci, indiv_meta = test_indiv_meta, quiet = TRUE)
 
 
-test_that("pop_fst and pop_fist compute correctly",{
+test_that("pop_fst and pop_fist WG17 compute correctly",{
+  # expect error if the tibble is not grouped
+  expect_error(test_gt %>% pop_fis(method="WG17"),
+               ".x should be a grouped gen_tibble")
+
+  expect_error(test_gt %>% pop_fst(),
+               ".x should be a grouped gen_tibble")
+
+  # group the tibble
   test_gt <- test_gt %>% dplyr::group_by(population)
 
   # compare results against raw hierfstat code
-  fis_by_pop <- test_gt %>% pop_fis(include_global=TRUE)
+  fis_by_pop <- test_gt %>% pop_fis(include_global=TRUE, method = "WG17")
   fis_by_pop_hier <- hierfstat::fis.dosage(test_genotypes,pop=test_indiv_meta$population)
   expect_true(all.equal(fis_by_pop, fis_by_pop_hier,check.attributes = FALSE))
   # now check that we don't get the global
-  fis_by_pop_sub <- test_gt %>% pop_fis()
+  fis_by_pop_sub <- test_gt %>% pop_fis(method = "WG17")
   expect_true (all.equal(fis_by_pop[-length(fis_by_pop)], fis_by_pop_sub))
 
-
-
   # compare results against raw hierfstat code
   fst_by_pop <- test_gt %>% pop_fst(include_global=TRUE)
   fst_by_pop_hier <- hierfstat::fst.dosage(test_genotypes,pop=test_indiv_meta$population)
diff --git a/tests/testthat/test_q_matrix.R b/tests/testthat/test_q_matrix.R
index d6f5c273..3f29dc56 100644
--- a/tests/testthat/test_q_matrix.R
+++ b/tests/testthat/test_q_matrix.R
@@ -15,9 +15,7 @@ test_that("q_matrix reads q_files",{
   expect_true(inherits(anolis_q_k3_mat,"q_matrix"))
 
   #read multiple .Q
-  package_path <- find.package("tidypopgen")
-  folder <- "/extdata/anolis"
-  q_folder <- file.path(package_path, folder)
+  q_folder <- system.file("/extdata/anolis", package = "tidypopgen")
   anolis_q <- q_matrix(q_folder)
   expect_true(inherits(anolis_q,"q_matrix_list"))
   expect_true(inherits(anolis_q[[1]][[1]],"q_matrix"))