-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsnp_ibs.R
91 lines (78 loc) · 3.31 KB
/
snp_ibs.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#' Compute the Identity by State Matrix for a bigSNP object
#'
#' This function computes the IBS matrix.
#'
#' Note that monomorphic sites are currently counted. Should we filter
#' them beforehand? What does plink do?
#' @param X a [bigstatsr::FBM.code256] matrix (as found in the `genotypes`
#' slot of a [bigsnpr::bigSNP] object).
#' @param ind.row An optional vector of the row indices that are used.
#' If not specified, all rows are used. Don't use negative indices.
#' @param ind.col An optional vector of the column indices that are used. If not
#' specified, all columns are used. Don't use negative indices.
#' @param type one of "proportion" (equivalent to "ibs" in PLINK), "adjusted_counts" ("distance" in PLINK),
#' and "raw_counts" (the counts of identical alleles and non-missing alleles, from which the two other quantities are computed)
#' @param block.size maximum number of columns read at once. Note that, to optimise the
#' speed of matrix operations, we have to store in memory 3 times the columns.
#' @returns if as.counts = TRUE function returns a list of two [bigstatsr::FBM] matrices, one of counts of IBS by alleles (i.e. 2*n loci),
#' and one of valid alleles (i.e. 2 * n_loci - 2 * missing_loci). If as.counts = FALSE returns a single matrix of IBS proportions.
#' @export
snp_ibs <- function(
X,
ind.row = bigstatsr::rows_along(X),
ind.col = bigstatsr::cols_along(X),
type = c("proportion","adjusted_counts","raw_counts"),
block.size = bigstatsr::block_size(nrow(X))
) {
type <- match.arg(type)
#check_args()
n <- length(ind.row)
# FBM matrix to count the IBS counts
IBS <- bigstatsr::FBM(n, n, init = 0)
# FBM matrix to store valid number of comparisons (i.e NOT NA)
IBS_valid_loci <- bigstatsr::FBM(n, n, init = 0)
m <- length(ind.col)
intervals <- CutBySize(m, block.size)
# Preassign memory where we will store the slices of genotypes.
# For efficiency, when we read in the slice,
# we will immediately recode it into these3 matrices,
# one per genotype, equivalent to X==0, X==1, and X==2 respectively
X_0_part <- matrix(0, n, max(intervals[, "size"]))
X_1_part <- matrix(0, n, max(intervals[, "size"]))
X_2_part <- matrix(0, n, max(intervals[, "size"]))
for (j in bigstatsr::rows_along(intervals)) {
ind <- seq2(intervals[j, ]) # this iteration indices
ind.col.ind <- ind.col[ind] #subset given indices by the iteration indices
increment_ibs_counts(IBS,
IBS_valid_loci,
X_0_part,
X_1_part,
X_2_part,
X,
ind.row,
ind.col.ind)
}
if(type == "raw_counts"){
return(list(ibs = IBS, valid_n = IBS_valid_loci))
} else{
#IBS proportion
# get the means of each column
divide_sub <- function(X, ind, Y) (X[, ind]/Y[,ind])
ibs_prop <- bigstatsr::big_apply(IBS, a.FUN = divide_sub, Y=IBS_valid_loci, a.combine = 'cbind')
if (type=="proportion"){
return(ibs_prop)
} else { # for adjusted counts
ibs_adj <- ibs_prop* m
return(ibs_adj)
}
}
}
## convenience functions that are not exported by `bigstatsr`
CutBySize <- function (m, block.size, nb = ceiling(m/block.size))
{
bigparallelr::split_len(m, nb_split = nb)
}
seq2<- function (lims)
{
seq(lims[1], lims[2])
}