Implement pre-classification for quality
- Close #157
- Close #160
lmrodriguezr committed Apr 25, 2023
1 parent 3b876d9 commit 81efea3
Showing 10 changed files with 135 additions and 78 deletions.
2 changes: 1 addition & 1 deletion lib/miga/cli/action/db.rb
@@ -9,7 +9,7 @@ def parse_cli
cli.defaults = {
database: :recommended,
version: :latest,
local: File.expand_path('.miga_db', ENV['MIGA_HOME']),
local: File.join(ENV['MIGA_HOME'], '.miga_db'),
host: MiGA::MiGA.known_hosts(:miga_db),
pb: true,
reuse_archive: false,
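
The default local database location is now built with File.join instead of File.expand_path. The two calls differ in how they treat the base directory; a minimal Ruby comparison (illustrative only, the commit itself does not state the motivation):

ENV['MIGA_HOME'] = '~'                          # hypothetical value
File.expand_path('.miga_db', ENV['MIGA_HOME'])  # => "/home/alice/.miga_db" (tilde expanded, path absolutized)
File.join(ENV['MIGA_HOME'], '.miga_db')         # => "~/.miga_db" (base kept verbatim)
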
15 changes: 10 additions & 5 deletions lib/miga/cli/action/rm.rb
@@ -7,18 +7,23 @@ class MiGA::Cli::Action::Rm < MiGA::Cli::Action
def parse_cli
cli.defaults = { remove: false }
cli.parse do |opt|
cli.opt_object(opt)
cli.opt_object(opt, %i[project dataset_opt result_opt])
opt.on(
'-r', '--remove',
'-R', '--remove',
'Also remove all associated files',
'By default, only unlinks from metadata'
) { |v| cli[:remove] = v }
end
end

def perform
d = cli.load_dataset
cli.load_project.unlink_dataset(d.name)
d.remove! if cli[:remove]
if r = cli.load_result
cli[:remove] ? r.remove! : r.unlink
elsif d = cli.load_dataset
cli.load_project.unlink_dataset(d.name)
d.remove! if cli[:remove]
else
raise "You must define one of --result or --dataset"
end
end
end
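
The short flag for --remove changes from -r to -R (presumably freeing -r for the new result selector), and rm can now target a single result as well as a dataset, with the result taking precedence. A condensed Ruby sketch of the new behavior (simplified from the diff above, not the full CLI action):

def remove_target(cli)
  if (r = cli.load_result)         # a result was requested: unlink it, or delete its files with -R
    cli[:remove] ? r.remove! : r.unlink
  elsif (d = cli.load_dataset)     # otherwise fall back to the whole dataset
    cli.load_project.unlink_dataset(d.name)
    d.remove! if cli[:remove]
  else
    raise 'You must define one of --result or --dataset'
  end
end
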
6 changes: 6 additions & 0 deletions lib/miga/taxonomy.rb
@@ -137,6 +137,12 @@ def namespace
self[:ns]
end

##
# Domain of the taxonomy (a String) or +nil+
def domain
self[:d]
end

##
# Get the most general rank as a two-entry Array (rank and value).
# If +force_ranks+ is true, it always returns the value for domain (d)
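
A short usage sketch of the new accessor; the constructor form mirrors the array of "rank:value" strings assembled in utils/distance/pipeline.rb below (values are hypothetical):

tax = MiGA::Taxonomy.new(['d:Bacteria', 'p:Bacteroidota'])
tax.domain     # => "Bacteria"
tax.namespace  # => nil (no ns entry was set)
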
40 changes: 24 additions & 16 deletions scripts/aai_distances.bash
@@ -10,23 +10,31 @@ DIR="$PROJECT/data/09.distances/02.aai"
miga_start_project_step "$DIR"

# Extract values
rm -f miga-project.txt
SQL="SELECT seq1, seq2, aai, sd, n, omega from aai;"
DS=$(miga ls -P "$PROJECT" --ref --no-multi --active)
(
echo "a b value sd n omega" | tr " " "\\t"
for i in $DS ; do
echo "$SQL" | sqlite3 "$DIR/$i.db" | tr "\\|" "\\t"
function foreach_database_aai {
local SQL="SELECT seq1, seq2, aai, sd, n, omega from aai;"
local k=0
while [[ -n ${DS[$k]} ]] ; do
echo "$SQL" | sqlite3 "$DIR/${DS[$k]}.db" | tr "\\|" "\\t"
let k=$k+1
done
# The following block pipes the retrieved data from all databases, reorders the
# names into canonical order, and removes repeats from the first two columns,
# so that only one result per pair is kept. It is not included in production,
# but the code may be useful for extremely large databases.
# | tee \
# | awk -F"\t" \
# 'BEGIN { OFS="\t" } { if($1 > $2) { a=$1; $1=$2; $2=a; } } { print $0 }' \
# | sort -k 1,2 -u
) | gzip -9c > miga-project.txt.gz
}

function aai_tsv {
DS=($(miga ls -P "$PROJECT" --ref --no-multi --active))
echo "a b value sd n omega" | tr " " "\\t"
if [[ ${#DS[@]} -gt 40000 ]] ; then
# Use comparisons in strictly one direction only for huge projects
foreach_database_aai \
| awk -F"\t" 'BEGIN { OFS="\t" }
{ if ($1 > $2) { a=$1; $1=$2; $2=a; } } { print $0 }' \
| sort -k 1,2 -u
else
foreach_database_aai
fi
}

rm -f "miga-project.txt"
aai_tsv | gzip -9c > "miga-project.txt.gz"

# R-ify
cat <<R | R --vanilla
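
For projects above 40,000 datasets the awk/sort step keeps a single row per unordered pair of genomes: each pair is rewritten into canonical order and repeats are dropped on the first two columns. The same idea as a small Ruby sketch (illustrative only, not part of the pipeline):

# rows: [[a, b, value, sd, n, omega], ...] as produced by foreach_database_aai
def canonical_pairs(rows)
  rows
    .map { |a, b, *rest| a > b ? [b, a, *rest] : [a, b, *rest] } # put each pair in canonical order
    .uniq { |a, b, _| [a, b] }                                   # keep one row per pair
end
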
2 changes: 1 addition & 1 deletion scripts/distances.bash
@@ -9,7 +9,7 @@ cd "$PROJECT/data/09.distances"
# Initialize
miga date > "$DATASET.start"

# Check quality first
# Check quality
miga stats -P "$PROJECT" -D "$DATASET" -r essential_genes --compute-and-save
inactive=$(miga ls -P "$PROJECT" -D "$DATASET" -m inactive | cut -f 2)
[[ "$inactive" == "true" ]] && exit
24 changes: 21 additions & 3 deletions scripts/essential_genes.bash
@@ -12,7 +12,7 @@ FAA="../../../06.cds/${DATASET}.faa"
[[ -s "$FAA" ]] || FAA="${FAA}.gz"

# Check if there are any proteins
if [[ ! -s $FAA ]] ; then
if [[ ! -s "$FAA" ]] ; then
echo Empty protein set, bypassing essential genes
rm "${DATASET}.start"
miga edit -P "$PROJECT" -D "$DATASET" -m run_essential_genes=false
@@ -35,13 +35,31 @@ HMM.essential.rb \
-t "$CORES" -r "$DATASET" --collection "$COLL" $FLAGS \
> "${DATASET}.ess/log"

# Index for FastAAI
# Index for FastAAI and classify (if needed and possible)
NOMULTI=$(miga ls -P "$PROJECT" -D "$DATASET" --no-multi \
| wc -l | awk '{print $1}')
[[ "$NOMULTI" -eq "1" ]] && \
if [[ "$NOMULTI" -eq "1" ]] ; then
python3 "$MIGA/utils/FastAAI/fastaai/fastaai_miga_preproc.py" \
--protein "$FAA" --output_crystal "${DATASET}.crystal" \
--compress

# Classify
DOMAIN=$(miga ls -P "$PROJECT" -D "$DATASET" -m tax:d | cut -f 2)
if [[ "$DOMAIN" == "?" ]] ; then
REF_PROJ=$(miga db --list-local -n Phyla_Lite --tab | tail -n +2 | cut -f 5)
echo "Phylum-level classification against $REF_PROJ"
if [[ -n "$REF_PROJ" ]] ; then
cp "${DATASET}.start" "${DATASET}.start.bak"
miga date > "${DATASET}.done"
miga add_result -P "$PROJECT" -D "$DATASET" -r "$SCRIPT" -f
ruby -I "$MIGA/lib" \
"$MIGA/utils/distances.rb" "$PROJECT" "$DATASET" \
run_taxonomy=1 only_domain=1 "ref_project=$REF_PROJ"
mv "${DATASET}.start.bak" "${DATASET}.start"
rm "${DATASET}.done" "${DATASET}.json"
fi
fi
fi

# Reduce files
if exists "$DATASET".ess/*.faa ; then
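
The new block is the pre-classification itself: when the dataset has no domain assigned (tax:d reported as "?"), the script temporarily backs up the .start marker, registers the essential-genes result as done, runs utils/distances.rb against the locally installed Phyla_Lite reference project with run_taxonomy=1 and only_domain=1, and then restores the marker and removes the temporary .done and .json files. A hedged Ruby sketch of the gating test (the domain accessor comes from lib/miga/taxonomy.rb above; the exact dataset API shape is an assumption):

def needs_domain_preclassification?(dataset)
  tax = dataset.metadata[:tax]   # MiGA::Taxonomy or nil
  tax.nil? || tax.domain.nil?    # no taxonomy at all, or no d: entry yet
end
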
31 changes: 24 additions & 7 deletions utils/distance/database.rb
@@ -1,4 +1,4 @@
require 'sqlite3'
require 'miga/sqlite'

module MiGA::DistanceRunner::Database
##
@@ -11,6 +11,8 @@ def initialize_dbs!(for_ref)
{ haai: :aai, aai: :aai, ani: :ani }.each do |m, t|
@db_counts[m] = 0
@dbs[m] = for_ref ? ref_db(m) : query_db(m)
@tmp_dbs[m] = tmp_file("#{m}.db")

# Remove if corrupt
if File.size?(dbs[m])
begin
@@ -21,20 +23,21 @@ def initialize_dbs!(for_ref)
FileUtils.rm dbs[m]
end
end
# Initialize if it doesn't exist
unless File.size? dbs[m]
SQLite3::Database.new(dbs[m]) do |conn|

# Initialize if it doesn't exist, copy otherwise
if File.size? dbs[m]
FileUtils.cp(dbs[m], tmp_dbs[m])
else
SQLite3::Database.new(tmp_dbs[m]) do |conn|
conn.execute <<~SQL
create table if not exists #{t}(
seq1 varchar(256), seq2 varchar(256),
#{t} float, sd float, n int, omega int
)
SQL
end
FileUtils.cp(tmp_dbs[m], dbs[m]) unless opts[:only_domain]
end
# Copy over to (local) temporals
@tmp_dbs[m] = tmp_file("#{m}.db")
FileUtils.cp(dbs[m], tmp_dbs[m])
end
end

@@ -157,6 +160,20 @@ def batch_data_from_db(metric)
conn.execute(sql).each { |row| data[row.shift] = row }
end
data
rescue => e
$stderr.puts "Database file: #{db}" if db ||= nil
raise e
end

##
# Retrieve the name and AAI of the closest relative from the AAI database
def closest_relative
db = tmp_dbs[:aai]
sql = 'select seq2, aai from aai order by aai desc limit 1'
MiGA::SQLite.new(db).run(sql).first
rescue => e
$stderr.puts "Database file: #{db}" if db ||= nil
raise e
end

##
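
closest_relative reads straight from the temporal AAI database, so it also works in only_domain mode, where the project-level databases are never updated. A usage sketch of the same query through MiGA::SQLite (path and values hypothetical):

require 'miga/sqlite'

db  = '/tmp/example/aai.db'  # temporal database created by the runner
sql = 'select seq2, aai from aai order by aai desc limit 1'
name, aai = MiGA::SQLite.new(db).run(sql).first
# => e.g. ["GCA_000123456", 82.5] (best hit and its AAI)
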
30 changes: 12 additions & 18 deletions utils/distance/pipeline.rb
@@ -72,35 +72,28 @@ def tax_test
$stderr.puts "Testing taxonomy | opts = #{opts}"
# Get taxonomy of closest relative
from_ref_project = (project != ref_project)
res_dir =
from_ref_project ?
File.expand_path('data/09.distances/05.taxonomy', project.path) :
home
Dir.mkdir res_dir unless Dir.exist? res_dir
File.open(File.expand_path("#{dataset.name}.done", res_dir), 'w') do |fh|
fh.puts Time.now.to_s
end
dataset.add_result(from_ref_project ? :taxonomy : :distances, true)
cr = dataset.closest_relatives(1, from_ref_project)
cr = closest_relative
return if cr.nil? or cr.empty?

tax = ref_project.dataset(cr[0][0]).metadata[:tax] || {}
tax = ref_project.dataset(cr[0]).metadata[:tax] || {}

# Run the test for each rank
tax_test = MiGA::TaxDist.aai_pvalues(cr[0][1], :intax, engine: opts[:aai_p])
tax_test = MiGA::TaxDist.aai_pvalues(cr[1], :intax, engine: opts[:aai_p])
r = tax_test.map do |k, v|
sig = ''
[0.5, 0.1, 0.05, 0.01].each { |i| sig << '*' if v < i }
[MiGA::Taxonomy.LONG_RANKS[k], (tax[k] || '?'), v, sig]
end

# Save test
File.open(File.expand_path("#{dataset.name}.intax.txt", home), 'w') do |fh|
fh.puts "Closest relative: #{cr[0][0]} with AAI: #{cr[0][1]}."
fh.puts ''
fh.puts MiGA::MiGA.tabulate(%w[Rank Taxonomy P-value Signif.], r)
fh.puts ''
fh.puts 'Significance at p-value below: *0.5, **0.1, ***0.05, ****0.01.'
unless opts[:only_domain]
File.open(File.join(home, "#{dataset.name}.intax.txt"), 'w') do |fh|
fh.puts "Closest relative: #{cr[0]} with AAI: #{cr[1]}."
fh.puts ''
fh.puts MiGA::MiGA.tabulate(%w[Rank Taxonomy P-value Signif.], r)
fh.puts ''
fh.puts 'Significance at p-value below: *0.5, **0.1, ***0.05, ****0.01.'
end
end
return r
end
@@ -115,6 +108,7 @@ def transfer_taxonomy(tax)
.select { |i| i[1] != '?' && i[2] <= pval }
.map { |i| i[0, 2].join(':') }
dataset.metadata[:tax] = MiGA::Taxonomy.new(tax_a)
$stderr.puts " > #{dataset.metadata[:tax]}"
dataset.save
end
end
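
The significance column in the intax report counts how many of the fixed p-value cutoffs the test clears. The same mapping as a standalone sketch, with exactly the cutoffs used in tax_test above:

def significance_stars(p)
  sig = ''
  [0.5, 0.1, 0.05, 0.01].each { |cutoff| sig << '*' if p < cutoff }
  sig
end

significance_stars(0.03) # => "***" (below 0.5, 0.1, and 0.05, but not 0.01)
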
62 changes: 35 additions & 27 deletions utils/distance/runner.rb
@@ -12,8 +12,9 @@ def initialize(project_path, dataset_name, opts_hash = {})
@home = File.expand_path('data/09.distances', project.path)

# Default opts
if opts[:run_taxonomy] && project.option(:ref_project)
ref_path = project.option(:ref_project)
if opts[:run_taxonomy] &&
(opts[:ref_project] || project.option(:ref_project))
ref_path = opts[:ref_project] || project.option(:ref_project)
@home = File.expand_path('05.taxonomy', @home)
@ref_project = MiGA::Project.load(ref_path)
raise "Cannot load reference project: #{ref_path}" if @ref_project.nil?
@@ -73,48 +74,55 @@ def go_query!
# Initialize the databases
initialize_dbs! false
distances_by_request(tsk[1])

# Calculate the classification-informed AAI/ANI traverse
results = File.expand_path("#{dataset.name}.#{tsk[1]}-medoids.tsv", home)
fh = File.open(results, 'w')
tmp_results = tmp_file("#{tsk[1]}-medoids.tsv")
fh = File.open(tmp_results, 'w')
classif, val_cls = *classify(res.dir, '.', tsk[1], fh)
fh.close

# Calculate all the AAIs/ANIs against the lowest subclade (if classified)
par_dir = File.dirname(File.expand_path(classif, res.dir))
par = File.expand_path('miga-project.classif', par_dir)
closest = { dataset: nil, ani: 0.0 }
sbj_datasets = []
if File.size? par
File.open(par, 'r') do |fh|
fh.each_line do |ln|
r = ln.chomp.split("\t")
sbj_datasets << ref_project.dataset(r[0]) if r[1].to_i == val_cls
unless opts[:only_domain]
results = File.join(home, "#{dataset.name}.#{tsk[1]}-medoids.tsv")
FileUtils.move(tmp_results, results)

# Calculate all the AAIs/ANIs against the lowest subclade (if classified)
par_dir = File.dirname(File.expand_path(classif, res.dir))
par = File.expand_path('miga-project.classif', par_dir)
closest = { dataset: nil, ani: 0.0 }
sbj_datasets = []
if File.size? par
File.open(par, 'r') do |fh|
fh.each_line do |ln|
r = ln.chomp.split("\t")
sbj_datasets << ref_project.dataset(r[0]) if r[1].to_i == val_cls
end
end
ani = ani_after_aai(sbj_datasets, 80.0)
ani_max = ani.map(&:to_f).each_with_index.max
closest = { ds: sbj_datasets[ani_max[1]].name, ani: ani_max[0] }
end
ani = ani_after_aai(sbj_datasets, 80.0)
ani_max = ani.map(&:to_f).each_with_index.max
closest = { ds: sbj_datasets[ani_max[1]].name, ani: ani_max[0] }
end

# Calculate all the AAIs/ANIs against the closest ANI95-clade (if AAI > 80%)
cl_path = res.file_path :clades_ani95
if !cl_path.nil? && File.size?(cl_path) && tsk[0] == :clade_finding
clades = File.foreach(cl_path).map { |i| i.chomp.split(',') }
sbj_dataset_names = clades.find { |i| i.include?(closest[:ds]) }
sbj_datasets = sbj_dataset_names&.map { |i| ref_project.dataset(i) }
ani_after_aai(sbj_datasets, 80.0) if sbj_datasets
# Calculate all the AAIs/ANIs against the closest ANI95-clade
# (if AAI > 80%)
cl_path = res.file_path :clades_ani95
if !cl_path.nil? && File.size?(cl_path) && tsk[0] == :clade_finding
clades = File.foreach(cl_path).map { |i| i.chomp.split(',') }
sbj_dataset_names = clades.find { |i| i.include?(closest[:ds]) }
sbj_datasets = sbj_dataset_names&.map { |i| ref_project.dataset(i) }
ani_after_aai(sbj_datasets, 80.0) if sbj_datasets
end
end

# Finalize
[:haai, :aai, :ani].each { |m| checkpoint! m if db_counts[m] > 0 }
build_medoids_tree(tsk[1])
build_medoids_tree(tsk[1]) unless opts[:only_domain]
transfer_taxonomy(tax_test)
end

# Launch analysis for taxonomy jobs
def go_taxonomy!
$stderr.puts 'Launching taxonomy analysis'
return unless project.option(:ref_project)
return unless opts[:ref_project] || project.option(:ref_project)

go_query! # <- yeah, it's actually the same, just different ref_project
end
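
The runner now accepts ref_project and only_domain directly in its options, so the pre-classification call from essential_genes.bash no longer depends on the project-level ref_project setting. A sketch of the options that call effectively sets (paths and option types are assumptions, not shown in this diff):

opts = {
  run_taxonomy: true,
  only_domain: true,                  # skip medoid trees, the intax report, and checkpoints
  ref_project: '/path/to/Phyla_Lite'  # hypothetical local path to the reference project
}
runner = MiGA::DistanceRunner.new('/path/to/project', 'my_dataset', opts)
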
1 change: 1 addition & 0 deletions utils/distance/temporal.rb
@@ -41,6 +41,7 @@ def checkpoint(metric)

# Copies temporal databases back to the MiGA Project
def checkpoint!(metric)
return if opts[:only_domain]
$stderr.puts "Checkpoint (metric = #{metric})"

# This is simply to test database consistency before overwriting the
