schatzlab · gf777 · Apr 25, 2017 · Apr 25, 2017 · Apr 25, 2017 · Apr 25, 2017
diff --git a/.gitignore b/.gitignore
@@ -5,3 +5,4 @@ user_data/*
 user_uploads/*
 !user_uploads/index.html
 
+.Rproj.user
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -0,0 +1,12 @@
+Package: genomescope
+Title: Reference-free profiling of genomes
+Version: 2.0
+Authors@R: person("Timothy", "Ranallo-Benavidez", email = "tbenavi1@jhu.edu",
+                  role = c("aut", "cre"))
+Description: GenomeScope analyzes the k-mer histogram to output estimates for genome size, heterozygosity, and repetitiveness, without requiring a reference genome. GenomeScope employs a polyploid-aware mixture model that, within seconds, accurately infers genome properties from unassembled sequencing data. GenomeScope produces a report and several informative plots describing the genome properties.
+Depends: R (>= 3.1.0)
+Imports: argparse, minpack.lm
+License: file LICENSE
+LazyData: true
+RoxygenNote: 6.1.1
+Encoding: UTF-8
diff --git a/NAMESPACE b/NAMESPACE
@@ -0,0 +1,71 @@
+# Generated by roxygen2: do not edit by hand
+
+export(estimate_Genome_peakp)
+export(eval_model)
+export(nls_peak)
+export(predict1_0)
+export(predict1_0_unique)
+export(predict1_1)
+export(predict1_1_unique)
+export(predict2_0)
+export(predict2_0_unique)
+export(predict2_1)
+export(predict2_1_unique)
+export(predict3_0)
+export(predict3_0_unique)
+export(predict3_1)
+export(predict3_1_unique)
+export(predict4_0)
+export(predict4_0_unique)
+export(predict4_1)
+export(predict4_1_unique)
+export(predict4_2)
+export(predict4_2_unique)
+export(predict5_0)
+export(predict5_0_unique)
+export(predict5_1)
+export(predict5_1_unique)
+export(predict5_2)
+export(predict5_2_unique)
+export(predict5_3)
+export(predict5_3_unique)
+export(predict5_4)
+export(predict5_4_unique)
+export(predict5_5)
+export(predict5_5_unique)
+export(predict6_0)
+export(predict6_0_unique)
+export(predict6_1)
+export(predict6_10)
+export(predict6_10_unique)
+export(predict6_11)
+export(predict6_11_unique)
+export(predict6_12)
+export(predict6_12_unique)
+export(predict6_13)
+export(predict6_13_unique)
+export(predict6_14)
+export(predict6_14_unique)
+export(predict6_15)
+export(predict6_15_unique)
+export(predict6_16)
+export(predict6_16_unique)
+export(predict6_1_unique)
+export(predict6_2)
+export(predict6_2_unique)
+export(predict6_3)
+export(predict6_3_unique)
+export(predict6_4)
+export(predict6_4_unique)
+export(predict6_5)
+export(predict6_5_unique)
+export(predict6_6)
+export(predict6_6_unique)
+export(predict6_7)
+export(predict6_7_unique)
+export(predict6_8)
+export(predict6_8_unique)
+export(predict6_9)
+export(predict6_9_unique)
+export(report_results)
+export(score_model)
diff --git a/R/estimate_genome_peak.R b/R/estimate_genome_peak.R
@@ -0,0 +1,68 @@
+#' Fit the ploidy-p peak model over candidate kmer coverages and topologies
+#'
+#' For every candidate kmer coverage, and for every candidate topology, a model is fit with
+#' \code{nls_peak}; \code{eval_model} is then used to keep the preferred fit.
+#' \code{VERBOSE}, \code{transform_exp} and \code{MAX_ITERATIONS} are not parameters of this
+#' function; they are looked up outside of it.
+#'
+#' @param kmer_hist_orig A data frame of the original histogram data (starting at 1 and with last position removed).
+#' @param x An integer vector of the x-coordinates of the histogram (after filtering out low coverage errors and high coverage kmers).
+#' @param y A numeric vector of the y-coordinates of the histogram (after filtering out low coverage errors and high coverage kmers).
+#' @param k An integer corresponding to the kmer length.
+#' @param p An integer corresponding to the ploidy.
+#' @param topology An integer corresponding to the topology to use.
+#' Set to -1 to try every topology index available for ploidy \code{p}.
+#' @param estKmercov An integer corresponding to the estimated kmer coverage of the polyploid genome.
+#' Set to -1 if not specified by user.
+#' @param round An integer corresponding to the iteration number (0, 1, 2, 3) for the fitting process.
+#' @param foldername A character vector corresponding to the name of the output directory.
+#' @param arguments A data frame of the user-specified inputs.
+#' @return A list (nls, nlsscore) where nls is the nlsLM model object (with some additional components)
+#' and nlsscore is the score (model RSSE) corresponding to the best fit (of the p forms).
+#' @export
+estimate_Genome_peakp <- function(kmer_hist_orig, x, y, k, p, topology, estKmercov, round, foldername, arguments) {
+  # Topologies to try: the requested one, or indices 1..N for ploidy p when topology is -1.
+  if (topology != -1) {
+    candidate_topologies <- c(topology)
+  } else {
+    topology_counts <- c(1, 1, 1, 2, 5, 16)
+    candidate_topologies <- 1:topology_counts[p]
+  }
+
+  total_kmers <- sum(as.numeric(x) * as.numeric(y))
+
+  if (estKmercov != -1) {
+    # The user supplied the kmer coverage, so there is a single possibility to evaluate.
+    n_candidates <- 1
+    base_kmercov <- estKmercov
+  } else {
+    # With low heterozygosity the highest-amplitude peak is typically the homozygous (p-th) peak,
+    # but with increasing heterozygosity an earlier peak may be the highest one.
+    # We therefore try p possibilities: the x-coordinate of the highest (transformed) peak divided by 1..p.
+    n_candidates <- p
+    weighted_y <- as.numeric(x)^transform_exp * as.numeric(y)
+    base_kmercov <- x[which(weighted_y == max(weighted_y))][1]
+  }
+
+  # Shorthand for "compare two fits and get back list(preferred fit, its score)".
+  keep_better <- function(current, challenger) {
+    eval_model(kmer_hist_orig, current, challenger, p, round, foldername, arguments)
+  }
+
+  best_overall <- NULL
+  for (i in 1:n_candidates) {
+    best_for_cov <- NULL
+    # Candidate coverage is 1/i times the starting coverage
+    # (1 <= i <= p when the coverage was not given by the user, otherwise only i = 1).
+    kmercov_i <- base_kmercov / i
+    length_i <- total_kmers / kmercov_i
+
+    if (VERBOSE) {cat(paste("trying with kmercov: ", kmercov_i, "\n"))}
+
+    for (top in candidate_topologies) {
+      if (VERBOSE) {cat(paste("trying with topology: ", top, "\n"))}
+      fit <- nls_peak(x, y, k, p, top, kmercov_i, length_i, MAX_ITERATIONS)
+      best_for_cov <- keep_better(best_for_cov, fit)[[1]]
+    }
+
+    # The last candidate is folded in by the final comparison below, so skip it here.
+    if (i < n_candidates) {
+      best_overall <- keep_better(best_overall, best_for_cov)[[1]]
+    }
+  }
+
+  return(keep_better(best_overall, best_for_cov))
+}
diff --git a/R/eval_model.R b/R/eval_model.R
@@ -0,0 +1,86 @@
+#' Evaluate distinct model forms, in order to resolve ambiguity of which peak is the homozygous peak
+#'
+#' Each non-NULL fit is scored with \code{score_model} and one of the two is returned together
+#' with its score:
+#' \itemize{
+#'   \item If only one fit is non-NULL, that fit is returned; if both are NULL, \code{list(NULL, -1)} is returned.
+#'   \item If both are non-NULL and the relative difference of their \code{$all[[1]]} scores is below
+#'   \code{SCORE_CLOSE}, the fit whose \code{$ahet} is lower by more than 0.01 is returned.
+#'   \item Otherwise the fit with the strictly lower \code{$all[[1]]} score is returned (ties go to \code{nls1}).
+#' }
+#' \code{VERBOSE}, \code{SCORE_CLOSE} and (only when \code{VERBOSE} is TRUE) \code{k} are not
+#' parameters of this function; they are looked up outside of it. When \code{VERBOSE} is TRUE a
+#' per-model report is also written under \code{foldername} via \code{report_results}.
+#'
+#' @param kmer_hist_orig A data frame of the original histogram data (starting at 1 and with last position removed).
+#' @param nls0,nls1 The nlsLM model objects to evaluate and compare. Either may be NULL.
+#' @param p An integer corresponding to the ploidy.
+#' @param round An integer corresponding to the iteration number (0, 1, 2, 3) for the fitting process.
+#' @param foldername A character vector corresponding to the name of the output directory.
+#' @param arguments A data frame of the user-specified inputs.
+#' @return A list (nls, nlsscore) where nls is the nlsLM model object (with some additional components)
+#' and nlsscore is the score (model RSSE) corresponding to the best fit (of the p forms).
+#' @export
+eval_model<-function(kmer_hist_orig, nls0, nls1, p, round, foldername, arguments) {
+  # -1 marks "no score available" for a fit that is NULL.
+  nls0score <- -1
+  nls1score <- -1
+
+  ## Evaluate the score of nls0
+  if (!is.null(nls0)) {
+    nls0score <- score_model(kmer_hist_orig, nls0, round+0.1, foldername)
+
+    #if(VERBOSE) {cat(paste("nls0score$all:\t", nls0score$all[[1]], "\n"))}
+
+    if (VERBOSE) {
+      mdir <- paste0(foldername, "/round", round, ".1")
+      dir.create(mdir, showWarnings=FALSE)
+      # Report on the histogram passed in as kmer_hist_orig rather than relying on an
+      # identically-valued variable defined outside this function.
+      report_results(kmer_hist_orig, kmer_hist_orig, k, p, (list(nls0, nls0score)) , mdir, arguments, TRUE)
+    }
+  }
+  else {
+    if (VERBOSE) {cat("nls0score failed to converge\n")}
+  }
+
+  ## Evaluate the score of nls1
+  if (!is.null(nls1)) {
+    nls1score <- score_model(kmer_hist_orig, nls1, round+0.2, foldername)
+
+    if(VERBOSE) {cat(paste("nls1score$all:\t", nls1score$all[[1]], "\n"))}
+
+    if (VERBOSE) {
+      mdir <- paste0(foldername, "/round", round, ".2")
+      dir.create(mdir, showWarnings=FALSE)
+      report_results(kmer_hist_orig, kmer_hist_orig, k, p, (list(nls1, nls1score)) , mdir, arguments, FALSE)
+    }
+  }
+  else {
+    if (VERBOSE) {cat("nls1score failed to converge\n")}
+  }
+
+  ## Return the better of the two fits
+
+  # Without nls0 there is nothing to compare: hand back nls1 (possibly NULL, with score -1).
+  if (is.null(nls0)) {
+    if (VERBOSE) {cat("returning nls1 by default\n")}
+    return (list(nls1, nls1score))
+  }
+
+  if (is.null(nls1)) {
+    if (VERBOSE) {cat("returning nls0, nls1 fail\n")}
+    return (list(nls0, nls0score))
+  }
+
+  score0 <- nls0score$all[[1]]
+  score1 <- nls1score$all[[1]]
+  pdiff <- abs(score0 - score1) / max(score0, score1)
+
+  # When the scores are close, prefer the fit with the clearly lower heterozygosity.
+  # NOTE(review): the previous log messages said "higher het" and named the opposite model
+  # from the one returned; the messages now describe what is actually returned. Confirm that
+  # preferring the lower-het fit is the intended rule.
+  if (pdiff < SCORE_CLOSE) {
+    het0 <- nls0$ahet
+    het1 <- nls1$ahet
+
+    #if (het1 * SCORE_HET_FOLD_DIFFERENCE < het0) {
+    if (het1 + 0.01 < het0) {
+      if (VERBOSE) {cat("returning nls1, similar score, lower het\n")}
+      return (list(nls1, nls1score))
+    }
+    #else if (het0 * SCORE_HET_FOLD_DIFFERENCE < het1) {
+    else if (het0 + 0.01 < het1) {
+      if (VERBOSE) {cat("returning nls0, similar score, lower het\n")}
+      return (list(nls0, nls0score))
+    }
+  }
+
+  if (score0 < score1) {
+    if (VERBOSE) {cat("returning nls0, better score\n")}
+    return (list(nls0, nls0score))
+  }
+
+  if (VERBOSE) {cat("returning nls1, better score\n")}
+  return (list(nls1, nls1score))
+}
Original file line number	Diff line number	Diff line change
Expand Up		@@ -5,3 +5,4 @@ user_data/*
		user_uploads/*
		!user_uploads/index.html

		.Rproj.user