Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Check that reads don't exceed the maximal length #44

Merged
merged 5 commits into from
Apr 3, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: mutscan
Title: Preprocessing and Analysis of Deep Mutational Scanning Data
Version: 0.2.35
Version: 0.2.36
Authors@R:
c(person(given = "Charlotte",
family = "Soneson",
Expand Down
5 changes: 5 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
# mutscan 0.2.36

* Check that reads don't exceed the maximal allowed length
* Add `maxReadLength` parameter to `digestFastqs()` to specify the maximal read length

# mutscan 0.2.35

* Add alternative names for variants (including HGVS identifiers)
Expand Down
4 changes: 2 additions & 2 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,8 @@ findClosestRefSeqEarlyStop <- function(varSeq, wtSeq, upperBoundMismatch, sim) {
.Call(`_mutscan_findClosestRefSeqEarlyStop`, varSeq, wtSeq, upperBoundMismatch, sim)
}

digestFastqsCpp <- function(fastqForwardVect, fastqReverseVect, mergeForwardReverse, minOverlap, maxOverlap, minMergedLength, maxMergedLength, maxFracMismatchOverlap, greedyOverlap, revComplForward, revComplReverse, elementsForward, elementLengthsForward, elementsReverse, elementLengthsReverse, adapterForward, adapterReverse, primerForward, primerReverse, wildTypeForward, wildTypeForwardNames, wildTypeReverse, wildTypeReverseNames, constantForward, constantReverse, avePhredMinForward = 20.0, avePhredMinReverse = 20.0, variableNMaxForward = 0L, variableNMaxReverse = 0L, umiNMax = 0L, nbrMutatedCodonsMaxForward = 1L, nbrMutatedCodonsMaxReverse = 1L, nbrMutatedBasesMaxForward = -1L, nbrMutatedBasesMaxReverse = -1L, forbiddenMutatedCodonsForward = "NNW", forbiddenMutatedCodonsReverse = "NNW", useTreeWTmatch = FALSE, collapseToWTForward = FALSE, collapseToWTReverse = FALSE, mutatedPhredMinForward = 0.0, mutatedPhredMinReverse = 0.0, mutNameDelimiter = ".", constantMaxDistForward = -1L, constantMaxDistReverse = -1L, variableCollapseMaxDist = 0.0, variableCollapseMinReads = 0L, variableCollapseMinRatio = 0.0, umiCollapseMaxDist = 0.0, filteredReadsFastqForward = "", filteredReadsFastqReverse = "", maxNReads = -1L, verbose = FALSE, nThreads = 1L, chunkSize = 100000L) {
.Call(`_mutscan_digestFastqsCpp`, fastqForwardVect, fastqReverseVect, mergeForwardReverse, minOverlap, maxOverlap, minMergedLength, maxMergedLength, maxFracMismatchOverlap, greedyOverlap, revComplForward, revComplReverse, elementsForward, elementLengthsForward, elementsReverse, elementLengthsReverse, adapterForward, adapterReverse, primerForward, primerReverse, wildTypeForward, wildTypeForwardNames, wildTypeReverse, wildTypeReverseNames, constantForward, constantReverse, avePhredMinForward, avePhredMinReverse, variableNMaxForward, variableNMaxReverse, umiNMax, nbrMutatedCodonsMaxForward, nbrMutatedCodonsMaxReverse, nbrMutatedBasesMaxForward, nbrMutatedBasesMaxReverse, forbiddenMutatedCodonsForward, forbiddenMutatedCodonsReverse, useTreeWTmatch, collapseToWTForward, collapseToWTReverse, mutatedPhredMinForward, mutatedPhredMinReverse, mutNameDelimiter, constantMaxDistForward, constantMaxDistReverse, variableCollapseMaxDist, variableCollapseMinReads, variableCollapseMinRatio, umiCollapseMaxDist, filteredReadsFastqForward, filteredReadsFastqReverse, maxNReads, verbose, nThreads, chunkSize)
# R-level wrapper around the native routine `_mutscan_digestFastqsCpp`.
# All 55 arguments are forwarded positionally through .Call(), so the order
# here must match the order expected by the compiled routine.
# The first 25 arguments have no defaults; the remaining ones do.
# `maxReadLength` (default 1024L) is the last argument.
# NOTE(review): R/RcppExports.R is normally generated by
# Rcpp::compileAttributes() -- confirm before editing by hand, since manual
# changes would presumably be overwritten on regeneration.
digestFastqsCpp <- function(fastqForwardVect, fastqReverseVect, mergeForwardReverse, minOverlap, maxOverlap, minMergedLength, maxMergedLength, maxFracMismatchOverlap, greedyOverlap, revComplForward, revComplReverse, elementsForward, elementLengthsForward, elementsReverse, elementLengthsReverse, adapterForward, adapterReverse, primerForward, primerReverse, wildTypeForward, wildTypeForwardNames, wildTypeReverse, wildTypeReverseNames, constantForward, constantReverse, avePhredMinForward = 20.0, avePhredMinReverse = 20.0, variableNMaxForward = 0L, variableNMaxReverse = 0L, umiNMax = 0L, nbrMutatedCodonsMaxForward = 1L, nbrMutatedCodonsMaxReverse = 1L, nbrMutatedBasesMaxForward = -1L, nbrMutatedBasesMaxReverse = -1L, forbiddenMutatedCodonsForward = "NNW", forbiddenMutatedCodonsReverse = "NNW", useTreeWTmatch = FALSE, collapseToWTForward = FALSE, collapseToWTReverse = FALSE, mutatedPhredMinForward = 0.0, mutatedPhredMinReverse = 0.0, mutNameDelimiter = ".", constantMaxDistForward = -1L, constantMaxDistReverse = -1L, variableCollapseMaxDist = 0.0, variableCollapseMinReads = 0L, variableCollapseMinRatio = 0.0, umiCollapseMaxDist = 0.0, filteredReadsFastqForward = "", filteredReadsFastqReverse = "", maxNReads = -1L, verbose = FALSE, nThreads = 1L, chunkSize = 100000L, maxReadLength = 1024L) {
# Hand every argument, in declaration order, to the compiled entry point.
.Call(`_mutscan_digestFastqsCpp`, fastqForwardVect, fastqReverseVect, mergeForwardReverse, minOverlap, maxOverlap, minMergedLength, maxMergedLength, maxFracMismatchOverlap, greedyOverlap, revComplForward, revComplReverse, elementsForward, elementLengthsForward, elementsReverse, elementLengthsReverse, adapterForward, adapterReverse, primerForward, primerReverse, wildTypeForward, wildTypeForwardNames, wildTypeReverse, wildTypeReverseNames, constantForward, constantReverse, avePhredMinForward, avePhredMinReverse, variableNMaxForward, variableNMaxReverse, umiNMax, nbrMutatedCodonsMaxForward, nbrMutatedCodonsMaxReverse, nbrMutatedBasesMaxForward, nbrMutatedBasesMaxReverse, forbiddenMutatedCodonsForward, forbiddenMutatedCodonsReverse, useTreeWTmatch, collapseToWTForward, collapseToWTReverse, mutatedPhredMinForward, mutatedPhredMinReverse, mutNameDelimiter, constantMaxDistForward, constantMaxDistReverse, variableCollapseMaxDist, variableCollapseMinReads, variableCollapseMinRatio, umiCollapseMaxDist, filteredReadsFastqForward, filteredReadsFastqReverse, maxNReads, verbose, nThreads, chunkSize, maxReadLength)
}

mergeValues <- function(mutNamesIn, valuesIn, delimiter = ',') {
Expand Down
10 changes: 8 additions & 2 deletions R/digestFastqs.R
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,9 @@
#' @param chunkSize Numeric scalar, the number of reads (or read pairs) to
#' keep in memory for parallel processing. Reduce from the default value if
#' you run out of memory.
#' @param maxReadLength Numeric scalar, the maximum allowed read length.
#' Larger values lead to higher memory allocation, and may require
#' \code{chunkSize} to be decreased.
#'
#' @return A list with four entries:
#' \describe{
Expand Down Expand Up @@ -369,7 +372,8 @@ digestFastqs <- function(fastqForward, fastqReverse = NULL,
filteredReadsFastqForward = "",
filteredReadsFastqReverse = "",
maxNReads = -1, verbose = FALSE,
nThreads = 1, chunkSize = 100000) {
nThreads = 1, chunkSize = 100000,
maxReadLength = 1024) {
## pre-flight checks ---------------------------------------------------------
## fastq files exist
if (length(fastqForward) < 1 || !all(file.exists(fastqForward)) ||
Expand Down Expand Up @@ -429,6 +433,7 @@ digestFastqs <- function(fastqForward, fastqReverse = NULL,
validValues = -1)
.assertScalar(x = nThreads, type = "numeric", rngExcl = c(0, Inf))
.assertScalar(x = chunkSize, type = "numeric", rngExcl = c(0, Inf))
.assertScalar(x = maxReadLength, type = "numeric", rngExcl = c(0, Inf))

## If a wildtype sequence is provided, it must be unambiguous how to identify and name mutants
if (any(wildTypeForward != "")) {
Expand Down Expand Up @@ -717,7 +722,8 @@ digestFastqs <- function(fastqForward, fastqReverse = NULL,
maxNReads = maxNReads,
verbose = verbose,
nThreads = as.integer(nThreads),
chunkSize = as.integer(chunkSize))
chunkSize = as.integer(chunkSize),
maxReadLength = maxReadLength)

## Add package version and processing date -----------------------------------
res$parameters$processingInfo <- paste0(
Expand Down
7 changes: 6 additions & 1 deletion man/digestFastqs.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 4 additions & 1 deletion src/FastqBuffer_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ using namespace Rcpp;
class FastqBuffer {
private:
size_t nentries; // number of fastq entries allocated (BUFFER_SIZE-1 max length)
size_t BUFFER_SIZE; // buffer size
bool paired; // paired read entries?
char *buffer; // single block of memory
// each entry e has 2-times (4-times for paired=true) BUFFER_SIZE bytes available
Expand All @@ -17,8 +18,9 @@ class FastqBuffer {
char *seq1, *qual1, *seq2, *qual2;

// constructor
FastqBuffer(size_t n, bool p = true) {
FastqBuffer(size_t n, size_t b, bool p = true) {
nentries = n;
BUFFER_SIZE = b;
paired = p;
buffer = new char[nentries * BUFFER_SIZE * (paired ? 4 : 2)];
seq1 = buffer;
Expand All @@ -36,6 +38,7 @@ class FastqBuffer {
FastqBuffer(const FastqBuffer& fqb) {
paired = fqb.paired;
nentries = fqb.nentries;
BUFFER_SIZE = fqb.BUFFER_SIZE;
buffer = new char[nentries * BUFFER_SIZE * (paired ? 4 : 2)];
strncpy(buffer, fqb.buffer, nentries * BUFFER_SIZE * (paired ? 4 : 2));
seq1 = buffer;
Expand Down
9 changes: 5 additions & 4 deletions src/RcppExports.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -148,8 +148,8 @@ BEGIN_RCPP
END_RCPP
}
// digestFastqsCpp
List digestFastqsCpp(std::vector<std::string> fastqForwardVect, std::vector<std::string> fastqReverseVect, bool mergeForwardReverse, size_t minOverlap, size_t maxOverlap, size_t minMergedLength, size_t maxMergedLength, double maxFracMismatchOverlap, bool greedyOverlap, bool revComplForward, bool revComplReverse, std::string elementsForward, std::vector<int> elementLengthsForward, std::string elementsReverse, std::vector<int> elementLengthsReverse, std::string adapterForward, std::string adapterReverse, std::vector<std::string> primerForward, std::vector<std::string> primerReverse, std::vector<std::string> wildTypeForward, std::vector<std::string> wildTypeForwardNames, std::vector<std::string> wildTypeReverse, std::vector<std::string> wildTypeReverseNames, std::vector<std::string> constantForward, std::vector<std::string> constantReverse, double avePhredMinForward, double avePhredMinReverse, int variableNMaxForward, int variableNMaxReverse, int umiNMax, int nbrMutatedCodonsMaxForward, int nbrMutatedCodonsMaxReverse, int nbrMutatedBasesMaxForward, int nbrMutatedBasesMaxReverse, CharacterVector forbiddenMutatedCodonsForward, CharacterVector forbiddenMutatedCodonsReverse, bool useTreeWTmatch, bool collapseToWTForward, bool collapseToWTReverse, double mutatedPhredMinForward, double mutatedPhredMinReverse, std::string mutNameDelimiter, int constantMaxDistForward, int constantMaxDistReverse, double variableCollapseMaxDist, int variableCollapseMinReads, double variableCollapseMinRatio, double umiCollapseMaxDist, std::string filteredReadsFastqForward, std::string filteredReadsFastqReverse, int maxNReads, bool verbose, int nThreads, int chunkSize);
RcppExport SEXP _mutscan_digestFastqsCpp(SEXP fastqForwardVectSEXP, SEXP fastqReverseVectSEXP, SEXP mergeForwardReverseSEXP, SEXP minOverlapSEXP, SEXP maxOverlapSEXP, SEXP minMergedLengthSEXP, SEXP maxMergedLengthSEXP, SEXP maxFracMismatchOverlapSEXP, SEXP greedyOverlapSEXP, SEXP revComplForwardSEXP, SEXP revComplReverseSEXP, SEXP elementsForwardSEXP, SEXP elementLengthsForwardSEXP, SEXP elementsReverseSEXP, SEXP elementLengthsReverseSEXP, SEXP adapterForwardSEXP, SEXP adapterReverseSEXP, SEXP primerForwardSEXP, SEXP primerReverseSEXP, SEXP wildTypeForwardSEXP, SEXP wildTypeForwardNamesSEXP, SEXP wildTypeReverseSEXP, SEXP wildTypeReverseNamesSEXP, SEXP constantForwardSEXP, SEXP constantReverseSEXP, SEXP avePhredMinForwardSEXP, SEXP avePhredMinReverseSEXP, SEXP variableNMaxForwardSEXP, SEXP variableNMaxReverseSEXP, SEXP umiNMaxSEXP, SEXP nbrMutatedCodonsMaxForwardSEXP, SEXP nbrMutatedCodonsMaxReverseSEXP, SEXP nbrMutatedBasesMaxForwardSEXP, SEXP nbrMutatedBasesMaxReverseSEXP, SEXP forbiddenMutatedCodonsForwardSEXP, SEXP forbiddenMutatedCodonsReverseSEXP, SEXP useTreeWTmatchSEXP, SEXP collapseToWTForwardSEXP, SEXP collapseToWTReverseSEXP, SEXP mutatedPhredMinForwardSEXP, SEXP mutatedPhredMinReverseSEXP, SEXP mutNameDelimiterSEXP, SEXP constantMaxDistForwardSEXP, SEXP constantMaxDistReverseSEXP, SEXP variableCollapseMaxDistSEXP, SEXP variableCollapseMinReadsSEXP, SEXP variableCollapseMinRatioSEXP, SEXP umiCollapseMaxDistSEXP, SEXP filteredReadsFastqForwardSEXP, SEXP filteredReadsFastqReverseSEXP, SEXP maxNReadsSEXP, SEXP verboseSEXP, SEXP nThreadsSEXP, SEXP chunkSizeSEXP) {
List digestFastqsCpp(std::vector<std::string> fastqForwardVect, std::vector<std::string> fastqReverseVect, bool mergeForwardReverse, size_t minOverlap, size_t maxOverlap, size_t minMergedLength, size_t maxMergedLength, double maxFracMismatchOverlap, bool greedyOverlap, bool revComplForward, bool revComplReverse, std::string elementsForward, std::vector<int> elementLengthsForward, std::string elementsReverse, std::vector<int> elementLengthsReverse, std::string adapterForward, std::string adapterReverse, std::vector<std::string> primerForward, std::vector<std::string> primerReverse, std::vector<std::string> wildTypeForward, std::vector<std::string> wildTypeForwardNames, std::vector<std::string> wildTypeReverse, std::vector<std::string> wildTypeReverseNames, std::vector<std::string> constantForward, std::vector<std::string> constantReverse, double avePhredMinForward, double avePhredMinReverse, int variableNMaxForward, int variableNMaxReverse, int umiNMax, int nbrMutatedCodonsMaxForward, int nbrMutatedCodonsMaxReverse, int nbrMutatedBasesMaxForward, int nbrMutatedBasesMaxReverse, CharacterVector forbiddenMutatedCodonsForward, CharacterVector forbiddenMutatedCodonsReverse, bool useTreeWTmatch, bool collapseToWTForward, bool collapseToWTReverse, double mutatedPhredMinForward, double mutatedPhredMinReverse, std::string mutNameDelimiter, int constantMaxDistForward, int constantMaxDistReverse, double variableCollapseMaxDist, int variableCollapseMinReads, double variableCollapseMinRatio, double umiCollapseMaxDist, std::string filteredReadsFastqForward, std::string filteredReadsFastqReverse, int maxNReads, bool verbose, int nThreads, int chunkSize, size_t maxReadLength);
RcppExport SEXP _mutscan_digestFastqsCpp(SEXP fastqForwardVectSEXP, SEXP fastqReverseVectSEXP, SEXP mergeForwardReverseSEXP, SEXP minOverlapSEXP, SEXP maxOverlapSEXP, SEXP minMergedLengthSEXP, SEXP maxMergedLengthSEXP, SEXP maxFracMismatchOverlapSEXP, SEXP greedyOverlapSEXP, SEXP revComplForwardSEXP, SEXP revComplReverseSEXP, SEXP elementsForwardSEXP, SEXP elementLengthsForwardSEXP, SEXP elementsReverseSEXP, SEXP elementLengthsReverseSEXP, SEXP adapterForwardSEXP, SEXP adapterReverseSEXP, SEXP primerForwardSEXP, SEXP primerReverseSEXP, SEXP wildTypeForwardSEXP, SEXP wildTypeForwardNamesSEXP, SEXP wildTypeReverseSEXP, SEXP wildTypeReverseNamesSEXP, SEXP constantForwardSEXP, SEXP constantReverseSEXP, SEXP avePhredMinForwardSEXP, SEXP avePhredMinReverseSEXP, SEXP variableNMaxForwardSEXP, SEXP variableNMaxReverseSEXP, SEXP umiNMaxSEXP, SEXP nbrMutatedCodonsMaxForwardSEXP, SEXP nbrMutatedCodonsMaxReverseSEXP, SEXP nbrMutatedBasesMaxForwardSEXP, SEXP nbrMutatedBasesMaxReverseSEXP, SEXP forbiddenMutatedCodonsForwardSEXP, SEXP forbiddenMutatedCodonsReverseSEXP, SEXP useTreeWTmatchSEXP, SEXP collapseToWTForwardSEXP, SEXP collapseToWTReverseSEXP, SEXP mutatedPhredMinForwardSEXP, SEXP mutatedPhredMinReverseSEXP, SEXP mutNameDelimiterSEXP, SEXP constantMaxDistForwardSEXP, SEXP constantMaxDistReverseSEXP, SEXP variableCollapseMaxDistSEXP, SEXP variableCollapseMinReadsSEXP, SEXP variableCollapseMinRatioSEXP, SEXP umiCollapseMaxDistSEXP, SEXP filteredReadsFastqForwardSEXP, SEXP filteredReadsFastqReverseSEXP, SEXP maxNReadsSEXP, SEXP verboseSEXP, SEXP nThreadsSEXP, SEXP chunkSizeSEXP, SEXP maxReadLengthSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Expand Down Expand Up @@ -207,7 +207,8 @@ BEGIN_RCPP
Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP);
Rcpp::traits::input_parameter< int >::type nThreads(nThreadsSEXP);
Rcpp::traits::input_parameter< int >::type chunkSize(chunkSizeSEXP);
rcpp_result_gen = Rcpp::wrap(digestFastqsCpp(fastqForwardVect, fastqReverseVect, mergeForwardReverse, minOverlap, maxOverlap, minMergedLength, maxMergedLength, maxFracMismatchOverlap, greedyOverlap, revComplForward, revComplReverse, elementsForward, elementLengthsForward, elementsReverse, elementLengthsReverse, adapterForward, adapterReverse, primerForward, primerReverse, wildTypeForward, wildTypeForwardNames, wildTypeReverse, wildTypeReverseNames, constantForward, constantReverse, avePhredMinForward, avePhredMinReverse, variableNMaxForward, variableNMaxReverse, umiNMax, nbrMutatedCodonsMaxForward, nbrMutatedCodonsMaxReverse, nbrMutatedBasesMaxForward, nbrMutatedBasesMaxReverse, forbiddenMutatedCodonsForward, forbiddenMutatedCodonsReverse, useTreeWTmatch, collapseToWTForward, collapseToWTReverse, mutatedPhredMinForward, mutatedPhredMinReverse, mutNameDelimiter, constantMaxDistForward, constantMaxDistReverse, variableCollapseMaxDist, variableCollapseMinReads, variableCollapseMinRatio, umiCollapseMaxDist, filteredReadsFastqForward, filteredReadsFastqReverse, maxNReads, verbose, nThreads, chunkSize));
Rcpp::traits::input_parameter< size_t >::type maxReadLength(maxReadLengthSEXP);
rcpp_result_gen = Rcpp::wrap(digestFastqsCpp(fastqForwardVect, fastqReverseVect, mergeForwardReverse, minOverlap, maxOverlap, minMergedLength, maxMergedLength, maxFracMismatchOverlap, greedyOverlap, revComplForward, revComplReverse, elementsForward, elementLengthsForward, elementsReverse, elementLengthsReverse, adapterForward, adapterReverse, primerForward, primerReverse, wildTypeForward, wildTypeForwardNames, wildTypeReverse, wildTypeReverseNames, constantForward, constantReverse, avePhredMinForward, avePhredMinReverse, variableNMaxForward, variableNMaxReverse, umiNMax, nbrMutatedCodonsMaxForward, nbrMutatedCodonsMaxReverse, nbrMutatedBasesMaxForward, nbrMutatedBasesMaxReverse, forbiddenMutatedCodonsForward, forbiddenMutatedCodonsReverse, useTreeWTmatch, collapseToWTForward, collapseToWTReverse, mutatedPhredMinForward, mutatedPhredMinReverse, mutNameDelimiter, constantMaxDistForward, constantMaxDistReverse, variableCollapseMaxDist, variableCollapseMinReads, variableCollapseMinRatio, umiCollapseMaxDist, filteredReadsFastqForward, filteredReadsFastqReverse, maxNReads, verbose, nThreads, chunkSize, maxReadLength));
return rcpp_result_gen;
END_RCPP
}
Expand Down Expand Up @@ -276,7 +277,7 @@ static const R_CallMethodDef CallEntries[] = {
{"_mutscan_test_mergeReadPairPartial", (DL_FUNC) &_mutscan_test_mergeReadPairPartial, 12},
{"_mutscan_findClosestRefSeq", (DL_FUNC) &_mutscan_findClosestRefSeq, 4},
{"_mutscan_findClosestRefSeqEarlyStop", (DL_FUNC) &_mutscan_findClosestRefSeqEarlyStop, 4},
{"_mutscan_digestFastqsCpp", (DL_FUNC) &_mutscan_digestFastqsCpp, 54},
{"_mutscan_digestFastqsCpp", (DL_FUNC) &_mutscan_digestFastqsCpp, 55},
{"_mutscan_mergeValues", (DL_FUNC) &_mutscan_mergeValues, 3},
{"_mutscan_levenshtein_distance", (DL_FUNC) &_mutscan_levenshtein_distance, 3},
{"_mutscan_hamming_distance", (DL_FUNC) &_mutscan_hamming_distance, 3},
Expand Down
Loading