# DNAreadsToBytes.R
library(stringi)
library(data.table)
library(Biostrings)
library(matrixStats)
options(stringsAsFactors = FALSE)
paths <- commandArgs(trailingOnly = TRUE)
if (length(paths) < 2) stop("usage: Rscript DNAreadsToBytes.R <reads.fasta> <output folder> [huffman code file]")
# path to fasta file
input <- paths[1]
# output directory
outputFolder <- paths[2]
if (length(paths) < 3) {
huffmanCode <- gsub("//", "/", stri_c(outputFolder, "/View_huff3.cd.new.correct"), fixed = T)
} else {
huffmanCode <- paths[3]
}
paths <- c(input, outputFolder, huffmanCode)
missing <- !file.exists(paths)
if (any(missing)) stop(paste("missing file(s):", paste(paths[missing], collapse = ", ")))
# defining "basic" functions used in the script-----------------------------------------
baseTypes <- c("A", "C", "G", "T")
# matrix used to convert successive bases to trits (rows = previous base, columns = next base), used below
conv <- matrix(c(NA, 0:2, 2L, NA, 0:1, 1:2, NA, 0L, 0:2, NA), 4, byrow = T)
colnames(conv) <- baseTypes
rownames(conv) <- baseTypes
# converts a matrix of DNA bases (sequences as rows) to integer trits according to Goldman et al.'s scheme
toTrits <- function(mat) {
# in case mat is a vector, we convert it to a 1-row matrix (otherwise we would have dimension issues)
if (!is.matrix(mat)) mat <- rbind(mat)
vapply(2:ncol(mat), function(col) conv[cbind(mat[, col - 1], mat[, col])], integer(nrow(mat)))
}
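# a quick illustration (values follow from the conv matrix above): each trit is
# read off the transition between adjacent bases, and a repeated base yields NA,
# which never occurs in a valid encoding
#   toTrits(c("A", "C", "G", "T"))  # c(0L, 0L, 0L)
#   toTrits(c("A", "T", "A"))       # c(2L, 0L)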
# splits a string vector into a matrix with one character per cell. Words are
# rows. The number of columns is the length of the longest word. We use the
# substring function stri_sub(), which appears to be faster than splitting
stringToMatrix <- function(vector) {
nchars <- stri_length(vector)
m <- max(nchars)
mat <- matrix(NA, nrow = length(vector), ncol = m, byrow = T)
for (i in 1:m) {
mat[, i] <- stri_sub(vector, i, i)
}
mat
}
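# for example, stringToMatrix(c("ACG", "T")) gives rbind(c("A", "C", "G"),
# c("T", "", "")), since stri_sub() returns "" past the end of a string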
# reverse complements a character vector representing DNA sequences. Ambiguity codes are not handled
revCom <- function(seqs) {
stri_reverse(chartr("acgtACGT", "tgcaTGCA", seqs))
}
# converts numbers of a given base to base 10, assuming digits are in separate
# columns of a matrix, so that rows represent numbers read from left to right
toBase10 <- function(mat, base = 3) {
if (!is.matrix(mat)) mat <- rbind(mat)
res <- 0
ncol <- ncol(mat)
for (col in ncol:1) {
res <- res + mat[, col] * base^(ncol - col)
}
res
}
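# worked example: toBase10(c(1L, 0L, 2L)) reads "102" in base 3, i.e.
# 1*9 + 0*3 + 2 = 11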
# returns, for each position of a vector, how many times the element at that
# position has occurred so far (e.g. 1 if it's the first time it appears, 2 if
# the 2nd time, etc.)
occurrences <- function(vect) {
dt <- data.table(vect, pos = seq_along(vect))
counts <- dt[, .(occ = 1:.N, pos), by = vect]
res <- integer(length(vect))
res[counts$pos] <- counts$occ
res
}
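# e.g. occurrences(c("a", "b", "a", "b", "b")) returns c(1, 1, 2, 2, 3)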
# returns the column of the first match of each element of "values" in the
# corresponding row of matrix "mat" ("values" is recycled, so it may contain a
# single element)
rowMatches <- function(values, mat) {
res <- which(mat == values, arr.ind = T)
res[match(1:nrow(mat), res[, 1]), 2]
}
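# e.g. rowMatches(c(2, 9), rbind(c(1, 2, 2), c(9, 0, 9))) returns c(2, 1): the
# first 2 of row 1 sits in column 2, the first 9 of row 2 in column 1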
# splits a character vector into columns (like a spreadsheet's text-to-columns
# feature). A column range can be specified as an integer vector, and "cells"
# are filled with NAs where needed. By default, it generates as many columns as
# necessary given the character(s) used to split the vector elements.
splitToColumns <- function(vector, split, columns) {
vector <- as.character(vector)
res <- stri_split_fixed(vector, split, omit_empty = NA, simplify = T)
if (missing(columns)) {
columns <- 1:ncol(res)
}
if (any(columns < 1)) {
stop("inappropriate column parameter. Use integers >=1")
}
columns <- round(columns)
res[, columns]
}
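# e.g. splitToColumns(c("a,b", "c,d"), ",") gives rbind(c("a", "b"), c("c", "d")),
# and splitToColumns(c("a,b", "c,d"), ",", 1) keeps only the first column, c("a", "c")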
# STEP ONE. Data import and initial processing ---------------------------------------------------------------
code <- readLines(huffmanCode, skipNul = T)
# We split lines into a table (discarding the last line, as it apparently doesn't correspond to a byte)
code <- data.table(splitToColumns(code[-length(code)], "\t"))
code[, byte := as.hexmode(as.integer(V3))]
cat("importing reads...\n")
# we ingest all read sequences, ignoring names. This takes memory
reads <- readDNAStringSet(input, use.names = F)
# we place them in a data table
reads <- data.table(SEQ = as.character(reads))
# so we can rapidly discard duplicate read sequences, but retain how many reads
# we had per sequence (much faster than table())
reads <- reads[, .N, by = SEQ]
# forward-sequenced reads should start with A or T and end with G or C (the
# opposite holds for reverse-sequenced reads)
cat("checking read orientation...\n")
reads[, c("ATstart", "GCend") := .(stri_sub(SEQ, 1, 1) %chin% c("A", "T"), stri_sub(SEQ, 117, 117) %chin% c("G", "C"))]
# contingency table used to evaluate if there are reads to reverse
nums <- matrix(reads[, sum(N), by = .(ATstart, GCend)]$V1, ncol = 2)
# a significant test indicates more apparently reversed reads than sequencing errors alone would explain
if (chisq.test(nums)$p.value < 0.0001) {
reads[ATstart == F & GCend == F, SEQ := revCom(SEQ)]
}
reads[, c("ATstart", "GCend") := NULL]
# STEP TWO. Extraction of read index information -------------------------------------------
cat("extracting read indexing information...\n")
# We extract indexing information in the last 16 bases of a read (excluding the
# very last). We split them into a matrix of individual bases (rows = reads)
IX <- stringToMatrix(stri_sub(reads$SEQ, 101, 116))
IX <- toTrits(IX)
# We get the parity trit by summing the appropriate columns (odd numbers)
parity <- rowSums(IX[, seq(1, 13, 2)]) %% 3L
# We compare it to the 15th trit as an error check. Also, NAs in trits mean errors
error <- parity != IX[, 15] | rowSums(is.na(IX)) > 0
reads <- reads[!error]
IX <- IX[!error, ]
# We obtain file ID from first 2 trits (integer)
fileID <- IX[, 1] * 3L + IX[, 2]
# We obtain read location indices in this file from following 12 trits
idx <- toBase10(IX[, 3:14])
# We add indexing information to the table
reads[, c("FILE", "LOC") := .(fileID, as.integer(idx))]
# we may reclaim some memory
rm(fileID, idx, error, parity, IX)
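# layout of the 15 index trits t1..t15: t1 t2 = file ID (0..8), t3..t14 =
# location index (0..3^12 - 1 = 531440), and t15 must equal
# (t1 + t3 + t5 + ... + t13) %% 3; e.g. trits starting 0 2 ... give file ID
# 0*3 + 2 = 2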
# to reduce the workload (and free memory), we retain no more than 100 read
# sequences per location index per encoded file. This should be enough to call
# reliable consensuses. We favour sequences without apparent error
cat("sampling best reads...\n")
# values of this column will be TRUE if a read contains identical adjacent bases, which the encoding scheme forbids
reads[, err := grepl("AA|CC|GG|TT", SEQ)]
# we sort the table by placing these sequences at the bottom, and favour those represented by many reads
setorder(reads, FILE, LOC, err, -N)
# we retain no more than the first 100 sequences per location per file
reads <- reads[occurrences(stri_c(FILE, LOC, sep = " ")) <= 100, ]
# STEP THREE. Consensus calling -------------------------------------------------------
# we make a consensus sequence at each location index. We do it before the
# keystream conversion, as a single error may result in a borked read after this
# conversion. We will only convert the (hopefully error-free) consensuses.
cat("generating local consensuses...\n")
# We split the 100 nt that encode file data into individual bases
reads <- reads[, data.table(stringToMatrix(stri_sub(SEQ, 2, 101)), N, FILE, LOC)]
# The function below counts the number of reads for each of the 4 possible bases at the 100
# positions of a location (takes a base matrix and the number of reads per
# sequences). We will do this for each location index
counts <- function(mat, Ns) {
# this does the counting described above for a given type of base
readCounts <- function(base) {
# We generate a matrix in which rows represent sequences and columns positions.
# It contains the number of reads per sequence (identical for all cells of
# the same row)
Nmat <- matrix(Ns, nrow(mat), ncol(mat))
# we remove counts for other bases
Nmat[mat != base] <- 0L
# so we can count reads for the base we're interested in
colSums(Nmat)
}
lapply(baseTypes, readCounts)
}
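# a small illustration: counts(rbind(c("A", "C"), c("A", "G")), Ns = c(3L, 2L))
# returns list(c(5, 0), c(0, 3), c(0, 2), c(0, 0)) for A, C, G and T: all 5
# reads carry an A at position 1, while 3 reads have a C and 2 a G at position 2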
# needed to extract the columns containing the bases from the data table
coln <- names(reads)[1:100]
# We do the base counts by file and location. Note that the ".SD" special symbol
# of data.table would be shorter in code, but it's somehow much slower to run.
# This call creates a table in which each location is represented by 100 rows
# (consensus sequences will appear vertically)
countMat <- reads[, counts(do.call(cbind, mget(coln)), N), by = .(FILE, LOC)]
# We prepare a data.table that will contain consensus sequences
cons <- unique(countMat[, .(FILE, LOC)])
# We turn base counts into a matrix, for operations below
countMat <- as.matrix(countMat[, 3:6, with = F])
# at each position, we get the highest read counts among the 4 bases
maxCounts <- rowMaxs(countMat)
# and corresponding frequency among reads
maxFreqs <- maxCounts / rowSums(countMat)
# we compute a position quality score which takes the base frequency and the
# read count into account. Note that a read count of 1 results in a score of
# 0. This may not be appropriate for all experiments.
maxScores <- maxFreqs * (1 - 1 / maxCounts)
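# e.g. 9 reads of the majority base out of 10 score 0.9 * (1 - 1/9) = 0.8,
# while a position covered by a single read scores 1 * (1 - 1/1) = 0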
# we now select the most frequent base at each position, into a vector
consBases <- baseTypes[rowMatches(maxCounts, countMat)]
# and since there are exactly 100 bases per location, we can easily convert this
# vector to a matrix where one row is a location
consBases <- matrix(consBases, ncol = 100, byrow = T)
# we do the same for the scores
consScores <- matrix(maxScores, ncol = 100, byrow = T)
rm(countMat, maxCounts, maxFreqs, maxScores)
# STEP FOUR. Keystream decoding --------------------------------------------------------
# We do the keystream decoding on consensus sequences. The four different keys
# are used as rows of an integer matrix:
cat("keysteam decoding...\n")
ks <- stringToMatrix(c(
"002000010110102111112122210011122221010102121222022000221201020221002121121000212222021211121122221",
"202020122121210120001200210222112020222022222220220001221012111022121120202022211221112202002121022",
"221221101200221120220011002222100000020200021121021020122100021201010210202002000101020022121100100",
"100122100011112100120210020011102201122122100100120122212000021220022012202201100010212222110222020"))
storage.mode(ks) <- "integer"
# We convert bases to numbers
d <- chartr("ACGT", "0123", consBases)
storage.mode(d) <- "integer"
# We save the first number (base) of each consensus, for later
first <- d[, 1]
# We compute base-4 differences between adjacent numbers, then subtract 1
d <- (d[, 2:100] - d[, 1:99]) %% 4L - 1L
# We determine the key to subtract (row of the ks matrix to use) depending on sequence location
phase <- cons$LOC %% 4L + 1L
# We subtract (base 3) the appropriate key and add 1
d <- (d - ks[phase, ]) %% 3L + 1L
# We sum (base 4) successive trits to obtain base numbers (the reciprocal of the difference step)
d <- rowCumsums(cbind(first, d)) %% 4L
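# sanity sketch of the difference/cumsum inversion (assuming a zero keystream,
# in which case the -1, modulo-3 and +1 steps cancel out): for
# x <- c(0L, 2L, 1L, 3L),
#   dif <- (x[-1] - x[-length(x)]) %% 4L  # c(2, 3, 2), never 0 (no repeats)
#   cumsum(c(x[1], dif)) %% 4L            # recovers c(0, 2, 1, 3)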
# We reverse complement sequences at odd location indices. We use the fact that
# bases are still encoded as numbers, so 3 - base is the complement of base
odd <- cons$LOC %% 2L > 0
d[odd, ] <- 3L - d[odd, ncol(d):1]
# We convert numbers back to bases (chartr() may also work)
d <- matrix(baseTypes[d + 1], nrow = nrow(d))
# We add flattened consensus sequences and the average score at each location
# index to the table (we don't actually use these sequences afterwards, but
# they can serve as a visual check)
cons[, c("SEQ", "SCORE") := .(do.call(stri_c, as.data.frame(d)), rowMeans(consScores))]
# we now estimate the length of each encoded file, based on the location
# indices of high-quality consensuses, to avoid generating unnecessarily long
# or broken output. We start from a table that discards low-score consensuses
temp <- cons[SCORE > 0.75, .(FILE, LOC)]
rows <- 2:nrow(temp)
# We then find "gaps" between successive retained locations (a gap of 6 is arbitrary)
gap <- temp[, c(LOC[rows] - LOC[rows - 1] > 6, F)]
# the last location index with a good consensus before the first gap, for each file
ends <- temp[gap == T, min(LOC), by = FILE]$V1
# and the first
starts <- temp[, min(LOC), by = FILE]$V1
# We retain file numbers with enough "good" locations (20 is arbitrary);
# subtracting 1 turns table row numbers into 0-based file IDs
valid <- which((ends - starts) > 20) - 1
if (length(valid) == 0) stop("no file with reliable sequence could be found. Exiting.")
rm(temp, starts, gap, rows)
cat(stri_c("file", valid, ", estimated size: ", ends[valid + 1] * 25), sep = "\n")
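# note on the * 25 factor: successive location indices shift by 25 bases in the
# file (100 nt segments with 4-fold overlap, see STEP FIVE), so the last
# reliable index times 25 approximates the file length in bases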
# STEP FIVE. File consensus generation -----------------------------------------------------
# We now generate the global consensus sequences for files
cat("generating file DNA sequences...\n")
# for this, we cbind the base and score matrices (even if that turns numbers
# into strings). Not very elegant but practical in the following function
d <- cbind(d, consScores)
# We concatenate and overlap successive local consensuses from a file
overlapConsensuses <- function(file) {
# they will be stored in this matrix, vertically. We have 2*4 columns as we
# save the base and its score for the 4 overlapping consensuses at each
# position
mat <- matrix("", ncol = 8, nrow = ends[file + 1] * 25 + 100)
# We select the data table rows we need (that is, excluding what's after the estimated file end)
f <- cons[, FILE == file & LOC <= ends[file + 1]]
# and corresponding location indices
loc <- cons[f == T, LOC]
# we tolerate that some locations (before the last one) may not be represented, as consensuses overlap
missing <- setdiff(1:ends[file + 1], loc)
loc <- c(loc, missing)
# we extract the relevant rows of the base/score matrix, and add empty rows for missing locations
bases <- rbind(d[f, ], matrix("", length(missing), 200))
# phase = the future column of the consensus in the result matrix
phase <- loc %% 4L
# this loop concatenates consensuses of the same "phase", since they are adjacent and not overlapping
for (p in 0:3) {
# TRUE for the consensuses of the given phase
f <- phase == p
# the first consensus for this phase will start at this offset in the file
start <- p * 25 + 1
# the rows of the "d" matrix corresponding to the location indices of this phase
rows <- which(f)[order(loc[f])]
# the last position of the concatenated consensus in mat
end <- start + length(rows) * 100 - 1
# we can now concatenate consensuses (matrix rows) for each phase and place them in mat
mat[start:end, p + 1] <- t(bases[rows, 1:100])
mat[start:end, p + 5] <- t(bases[rows, 101:200])
}
# we add file ID as the last column
cbind(mat, as.character(file))
}
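# layout sketch: location 0 covers file positions 1..100 (phase 0), location 1
# covers 26..125 (phase 1), location 4 covers 101..200 (phase 0 again), etc.,
# so most file positions end up covered by 4 overlapping consensuses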
# we apply the function to each valid file. We rbind all in a single matrix
overlaps <- do.call(rbind, lapply(valid, overlapConsensuses))
# we extract the base scores and file ID
baseInfo <- overlaps[, 5:9]
storage.mode(baseInfo) <- "numeric"
baseInfo[is.na(baseInfo)] <- 0
# for each of the 4 bases, we sum its scores over the 4 overlapping consensuses
baseScores <- function(base) {
# we store scores in a temporary matrix in the function, as we replace them with zeros for the other bases
scores <- baseInfo[, 1:4]
scores[overlaps[, 1:4] != base] <- 0
rowSums(scores)
}
# applying the function for all bases
scores <- vapply(baseTypes, baseScores, numeric(nrow(overlaps)))
maxScores <- rowMaxs(scores)
# hist(maxScores, breaks = 100)
# we retain the base with highest score at each position
consensus <- baseTypes[rowMatches(maxScores, scores)]
# and we create a data table with relevant info (some actually not used afterwards)
consensus <- data.table(
base = consensus,
score = maxScores / rowSums(overlaps[, 1:4] != ""),
file = baseInfo[, 5],
pos = occurrences(baseInfo[, 5])
)
# hist(consensus$score, breaks = 100, xlim = 0:1, main = "position quality scores")
# STEP SIX. Conversion of DNA bases to bytes ---------------------------------------------
cat("converting to bytes and saving to file(s)...\n")
# this function does not tolerate missing or erroneous data in files (TO IMPROVE)
writeFile <- function(fileID, folder) {
# the DNA sequence for each file, as a base vector. We prepend base "A", as the encoding scheme specifies
bases <- c("A", consensus[file == fileID, base])
trits <- toTrits(bases)
# we compute the length of the file, encoded in the first 25 trits.
len <- toBase10(trits[1:25])
if (is.na(len) || len > length(trits) - 25) {
warning("possible broken file\n")
len <- length(trits) - 25
}
# because "NA" would take 2 chars in the flattened string
trits[is.na(trits)] <- 3L
# we flatten the trit vector in a string
flat <- stri_flatten(trits[26:(len + 25)])
# we extract 5- and 6-trit words from that string. We need to find where the 6-trit words are
# all the possible 5-trit words in the string
word5 <- stri_sub(flat, 1:(len - 4), length = 5)
# same for 6-trit words
word6 <- stri_sub(flat, 1:(len - 5), length = 6)
# we make sure both are in equal numbers (needed to create the matrix below)
word6 <- c(word6, rep("", length(word5) - length(word6)))
# we determine whether words are legal, as a 2-column boolean matrix
allowed <- cbind(word5 %chin% code$V4, word6 %chin% code$V4)
# to iterate over this matrix (see loop below),
col <- 1:2
row <- 1
# cells of this matrix will be TRUE for the words we retain
retained <- matrix(F, length(word5), 2)
# we iterate from row 1
while (row <= nrow(allowed)) {
if (allowed[row, col[1]]) {
# if word is legal we mark its location as TRUE
retained[row, col[1]] <- T
# and move to the row of the next word to check (based on word size)
row <- row + 4 + col[1]
} else {
# else if word of alternate length is illegal as well, we abort
if (!allowed[row, col[2]]) stop(paste("error in file", fileID, "at position", row))
# if not, we go check the word of alternate length at this position, in the other column
col <- rev(col)
}
}
words <- rbind(word5, word6)[t(retained)]
bytes <- code[chmatch(words, V4), byte]
writeBin(as.raw(bytes), con = gsub("//", "/", stri_c(folder, "/file", fileID), fixed = T))
invisible(NULL)
}
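# parsing sketch with hypothetical code words (for illustration only): suppose
# "20001" is a legal 5-trit word while "200012" is absent from the table.
# Reading a stream starting "20001...", the loop marks the 5-trit word at row 1
# as retained and jumps to row 6 (row + 4 + 1); had "20001" been illegal but
# "200012" legal, it would have switched columns and jumped by 6 instead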
for (file in valid) writeFile(file, outputFolder)