diff --git a/lib/miga/cli/action/db.rb b/lib/miga/cli/action/db.rb
index 7e2aa70..16e470e 100644
--- a/lib/miga/cli/action/db.rb
+++ b/lib/miga/cli/action/db.rb
@@ -9,7 +9,7 @@ def parse_cli
     cli.defaults = {
       database: :recommended,
       version: :latest,
-      local: File.expand_path('.miga_db', ENV['MIGA_HOME']),
+      local: File.join(ENV['MIGA_HOME'], '.miga_db'),
       host: MiGA::MiGA.known_hosts(:miga_db),
       pb: true,
       reuse_archive: false,
diff --git a/lib/miga/cli/action/rm.rb b/lib/miga/cli/action/rm.rb
index db75b25..3c945b1 100644
--- a/lib/miga/cli/action/rm.rb
+++ b/lib/miga/cli/action/rm.rb
@@ -7,9 +7,9 @@ class MiGA::Cli::Action::Rm < MiGA::Cli::Action
   def parse_cli
     cli.defaults = { remove: false }
     cli.parse do |opt|
-      cli.opt_object(opt)
+      cli.opt_object(opt, %i[project dataset_opt result_opt])
       opt.on(
-        '-r', '--remove',
+        '-R', '--remove',
         'Also remove all associated files',
         'By default, only unlinks from metadata'
       ) { |v| cli[:remove] = v }
@@ -17,8 +17,13 @@ def parse_cli
   end
 
   def perform
-    d = cli.load_dataset
-    cli.load_project.unlink_dataset(d.name)
-    d.remove! if cli[:remove]
+    if r = cli.load_result
+      cli[:remove] ? r.remove! : r.unlink
+    elsif d = cli.load_dataset
+      cli.load_project.unlink_dataset(d.name)
+      d.remove! if cli[:remove]
+    else
+      raise "You must define one of --result or --dataset"
+    end
   end
 end
diff --git a/lib/miga/taxonomy.rb b/lib/miga/taxonomy.rb
index 648aa21..d8a45e9 100644
--- a/lib/miga/taxonomy.rb
+++ b/lib/miga/taxonomy.rb
@@ -137,6 +137,12 @@ def namespace
     self[:ns]
   end
 
+  ##
+  # Domain of the taxonomy (a String) or +nil+
+  def domain
+    self[:d]
+  end
+
   ##
   # Get the most general rank as a two-entry Array (rank and value).
   # If +force_ranks+ is true, it always returns the value for domain (d)
diff --git a/scripts/aai_distances.bash b/scripts/aai_distances.bash
index 57e1a13..a114b42 100755
--- a/scripts/aai_distances.bash
+++ b/scripts/aai_distances.bash
@@ -10,23 +10,31 @@ DIR="$PROJECT/data/09.distances/02.aai"
 miga_start_project_step "$DIR"
 
 # Extract values
-rm -f miga-project.txt
-SQL="SELECT seq1, seq2, aai, sd, n, omega from aai;"
-DS=$(miga ls -P "$PROJECT" --ref --no-multi --active)
-(
-  echo "a b value sd n omega" | tr " " "\\t"
-  for i in $DS ; do
-    echo "$SQL" | sqlite3 "$DIR/$i.db" | tr "\\|" "\\t"
+function foreach_database_aai {
+  local SQL="SELECT seq1, seq2, aai, sd, n, omega from aai;"
+  local k=0
+  while [[ -n ${DS[$k]} ]] ; do
+    echo "$SQL" | sqlite3 "$DIR/${DS[$k]}.db" | tr "\\|" "\\t"
+    let k=$k+1
   done
-  # The following block pipes retrieved data from all databases, reorganizes the
-  # names in cannonical order, and removes repeats from the first two columns,
-  # in order to keep only one result per pair. This is not being included into
-  # production, but the code may be useful for extremely large databases.
-  # | tee \
-  # | awk -F"\t" \
-  #   'BEGIN { OFS="\t" } { if($1 > $2) { a=$1; $1=$2; $2=a; } } { print $0 }' \
-  # | sort -k 1,2 -u
-) | gzip -9c > miga-project.txt.gz
+}
+
+function aai_tsv {
+  DS=($(miga ls -P "$PROJECT" --ref --no-multi --active))
+  echo "a b value sd n omega" | tr " " "\\t"
+  if [[ ${#DS[@]} -gt 40000 ]] ; then
+    # Use comparisons in strictly one direction only for huge projects
+    foreach_database_aai \
+      | awk -F"\t" 'BEGIN { OFS="\t" }
+          { if ($1 > $2) { a=$1; $1=$2; $2=a; } } { print $0 }' \
+      | sort -k 1,2 -u
+  else
+    foreach_database_aai
+  fi
+}
+
+rm -f "miga-project.txt"
+aai_tsv | gzip -9c > "miga-project.txt.gz"
 
 # R-ify
 cat < "$DATASET.start"
 
-# Check quality first
+# Check quality
 miga stats -P "$PROJECT" -D "$DATASET" -r essential_genes --compute-and-save
 inactive=$(miga ls -P "$PROJECT" -D "$DATASET" -m inactive | cut -f 2)
 [[ "$inactive" == "true" ]] && exit
diff --git a/scripts/essential_genes.bash b/scripts/essential_genes.bash
index 8d6d9c3..6e36f9e 100755
--- a/scripts/essential_genes.bash
+++ b/scripts/essential_genes.bash
@@ -12,7 +12,7 @@ FAA="../../../06.cds/${DATASET}.faa"
 [[ -s "$FAA" ]] || FAA="${FAA}.gz"
 
 # Check if there are any proteins
-if [[ ! -s $FAA ]] ; then
+if [[ ! -s "$FAA" ]] ; then
   echo Empty protein set, bypassing essential genes
   rm "${DATASET}.start"
   miga edit -P "$PROJECT" -D "$DATASET" -m run_essential_genes=false
@@ -35,13 +35,31 @@ HMM.essential.rb \
   -t "$CORES" -r "$DATASET" --collection "$COLL" $FLAGS \
   > "${DATASET}.ess/log"
 
-# Index for FastAAI
+# Index for FastAAI and classify (if needed and possible)
 NOMULTI=$(miga ls -P "$PROJECT" -D "$DATASET" --no-multi \
   | wc -l | awk '{print $1}')
-[[ "$NOMULTI" -eq "1" ]] && \
+if [[ "$NOMULTI" -eq "1" ]] ; then
   python3 "$MIGA/utils/FastAAI/fastaai/fastaai_miga_preproc.py" \
     --protein "$FAA" --output_crystal "${DATASET}.crystal" \
     --compress
+
+  # Classify
+  DOMAIN=$(miga ls -P "$PROJECT" -D "$DATASET" -m tax:d | cut -f 2)
+  if [[ "$DOMAIN" == "?" ]] ; then
+    REF_PROJ=$(miga db --list-local -n Phyla_Lite --tab | tail -n +2 | cut -f 5)
+    echo "Phylum-level classification against $REF_PROJ"
+    if [[ -n "$REF_PROJ" ]] ; then
+      cp "${DATASET}.start" "${DATASET}.start.bak"
+      miga date > "${DATASET}.done"
+      miga add_result -P "$PROJECT" -D "$DATASET" -r "$SCRIPT" -f
+      ruby -I "$MIGA/lib" \
+        "$MIGA/utils/distances.rb" "$PROJECT" "$DATASET" \
+        run_taxonomy=1 only_domain=1 "ref_project=$REF_PROJ"
+      mv "${DATASET}.start.bak" "${DATASET}.start"
+      rm "${DATASET}.done" "${DATASET}.json"
+    fi
+  fi
+fi
 
 # Reduce files
 if exists "$DATASET".ess/*.faa ; then
diff --git a/utils/distance/database.rb b/utils/distance/database.rb
index 8341c36..d73bd93 100644
--- a/utils/distance/database.rb
+++ b/utils/distance/database.rb
@@ -1,4 +1,4 @@
-require 'sqlite3'
+require 'miga/sqlite'
 
 module MiGA::DistanceRunner::Database
   ##
@@ -11,6 +11,8 @@ def initialize_dbs!(for_ref)
     { haai: :aai, aai: :aai, ani: :ani }.each do |m, t|
       @db_counts[m] = 0
       @dbs[m] = for_ref ? ref_db(m) : query_db(m)
+      @tmp_dbs[m] = tmp_file("#{m}.db")
+
       # Remove if corrupt
       if File.size?(dbs[m])
         begin
@@ -21,9 +23,12 @@ def initialize_dbs!(for_ref)
           FileUtils.rm dbs[m]
         end
       end
-      # Initialize if it doesn't exist
-      unless File.size? dbs[m]
-        SQLite3::Database.new(dbs[m]) do |conn|
+
+      # Initialize if it doesn't exist, copy otherwise
+      if File.size? dbs[m]
+        FileUtils.cp(dbs[m], tmp_dbs[m])
+      else
+        SQLite3::Database.new(tmp_dbs[m]) do |conn|
           conn.execute <<~SQL
             create table if not exists #{t}(
               seq1 varchar(256), seq2 varchar(256),
@@ -31,10 +36,8 @@ def initialize_dbs!(for_ref)
             )
           SQL
         end
+        FileUtils.cp(tmp_dbs[m], dbs[m]) unless opts[:only_domain]
       end
-      # Copy over to (local) temporals
-      @tmp_dbs[m] = tmp_file("#{m}.db")
-      FileUtils.cp(dbs[m], tmp_dbs[m])
     end
   end
 
@@ -157,6 +160,20 @@ def batch_data_from_db(metric)
       conn.execute(sql).each { |row| data[row.shift] = row }
     end
     data
+  rescue => e
+    $stderr.puts "Database file: #{db}" if db ||= nil
+    raise e
+  end
+
+  ##
+  # Retrieve the name and AAI of the closest relative from the AAI database
+  def closest_relative
+    db = tmp_dbs[:aai]
+    sql = 'select seq2, aai from aai order by aai desc limit 1'
+    MiGA::SQLite.new(db).run(sql).first
+  rescue => e
+    $stderr.puts "Database file: #{db}" if db ||= nil
+    raise e
   end
 
   ##
diff --git a/utils/distance/pipeline.rb b/utils/distance/pipeline.rb
index a8dec0e..7fb1ea0 100644
--- a/utils/distance/pipeline.rb
+++ b/utils/distance/pipeline.rb
@@ -72,22 +72,13 @@ def tax_test
     $stderr.puts "Testing taxonomy | opts = #{opts}"
     # Get taxonomy of closest relative
     from_ref_project = (project != ref_project)
-    res_dir =
-      from_ref_project ?
-        File.expand_path('data/09.distances/05.taxonomy', project.path) :
-        home
-    Dir.mkdir res_dir unless Dir.exist? res_dir
-    File.open(File.expand_path("#{dataset.name}.done", res_dir), 'w') do |fh|
-      fh.puts Time.now.to_s
-    end
-    dataset.add_result(from_ref_project ? :taxonomy : :distances, true)
-    cr = dataset.closest_relatives(1, from_ref_project)
+    cr = closest_relative
     return if cr.nil? or cr.empty?
-    tax = ref_project.dataset(cr[0][0]).metadata[:tax] || {}
+    tax = ref_project.dataset(cr[0]).metadata[:tax] || {}
 
     # Run the test for each rank
-    tax_test = MiGA::TaxDist.aai_pvalues(cr[0][1], :intax, engine: opts[:aai_p])
+    tax_test = MiGA::TaxDist.aai_pvalues(cr[1], :intax, engine: opts[:aai_p])
     r = tax_test.map do |k, v|
       sig = ''
       [0.5, 0.1, 0.05, 0.01].each { |i| sig << '*' if v < i }
@@ -95,12 +86,14 @@ def tax_test
     end
 
     # Save test
-    File.open(File.expand_path("#{dataset.name}.intax.txt", home), 'w') do |fh|
-      fh.puts "Closest relative: #{cr[0][0]} with AAI: #{cr[0][1]}."
-      fh.puts ''
-      fh.puts MiGA::MiGA.tabulate(%w[Rank Taxonomy P-value Signif.], r)
-      fh.puts ''
-      fh.puts 'Significance at p-value below: *0.5, **0.1, ***0.05, ****0.01.'
+    unless opts[:only_domain]
+      File.open(File.join(home, "#{dataset.name}.intax.txt"), 'w') do |fh|
+        fh.puts "Closest relative: #{cr[0]} with AAI: #{cr[1]}."
+        fh.puts ''
+        fh.puts MiGA::MiGA.tabulate(%w[Rank Taxonomy P-value Signif.], r)
+        fh.puts ''
+        fh.puts 'Significance at p-value below: *0.5, **0.1, ***0.05, ****0.01.'
+      end
     end
     return r
   end
@@ -115,6 +108,7 @@ def transfer_taxonomy(tax)
       .select { |i| i[1] != '?' && i[2] <= pval }
      .map { |i| i[0, 2].join(':') }
     dataset.metadata[:tax] = MiGA::Taxonomy.new(tax_a)
+    $stderr.puts " > #{dataset.metadata[:tax]}"
     dataset.save
   end
 end
diff --git a/utils/distance/runner.rb b/utils/distance/runner.rb
index f518047..292c4ec 100644
--- a/utils/distance/runner.rb
+++ b/utils/distance/runner.rb
@@ -12,8 +12,9 @@ def initialize(project_path, dataset_name, opts_hash = {})
     @home = File.expand_path('data/09.distances', project.path)
 
     # Default opts
-    if opts[:run_taxonomy] && project.option(:ref_project)
-      ref_path = project.option(:ref_project)
+    if opts[:run_taxonomy] &&
+       (opts[:ref_project] || project.option(:ref_project))
+      ref_path = opts[:ref_project] || project.option(:ref_project)
       @home = File.expand_path('05.taxonomy', @home)
       @ref_project = MiGA::Project.load(ref_path)
       raise "Cannot load reference project: #{ref_path}" if @ref_project.nil?
@@ -73,48 +74,55 @@ def go_query!
     # Initialize the databases
     initialize_dbs! false
     distances_by_request(tsk[1])
+
     # Calculate the classification-informed AAI/ANI traverse
-    results = File.expand_path("#{dataset.name}.#{tsk[1]}-medoids.tsv", home)
-    fh = File.open(results, 'w')
+    tmp_results = tmp_file("#{tsk[1]}-medoids.tsv")
+    fh = File.open(tmp_results, 'w')
     classif, val_cls = *classify(res.dir, '.', tsk[1], fh)
     fh.close
 
-    # Calculate all the AAIs/ANIs against the lowest subclade (if classified)
-    par_dir = File.dirname(File.expand_path(classif, res.dir))
-    par = File.expand_path('miga-project.classif', par_dir)
-    closest = { dataset: nil, ani: 0.0 }
-    sbj_datasets = []
-    if File.size? par
-      File.open(par, 'r') do |fh|
-        fh.each_line do |ln|
-          r = ln.chomp.split("\t")
-          sbj_datasets << ref_project.dataset(r[0]) if r[1].to_i == val_cls
+    unless opts[:only_domain]
+      results = File.join(home, "#{dataset.name}.#{tsk[1]}-medoids.tsv")
+      FileUtils.move(tmp_results, results)
+
+      # Calculate all the AAIs/ANIs against the lowest subclade (if classified)
+      par_dir = File.dirname(File.expand_path(classif, res.dir))
+      par = File.expand_path('miga-project.classif', par_dir)
+      closest = { dataset: nil, ani: 0.0 }
+      sbj_datasets = []
+      if File.size? par
+        File.open(par, 'r') do |fh|
+          fh.each_line do |ln|
+            r = ln.chomp.split("\t")
+            sbj_datasets << ref_project.dataset(r[0]) if r[1].to_i == val_cls
+          end
         end
+        ani = ani_after_aai(sbj_datasets, 80.0)
+        ani_max = ani.map(&:to_f).each_with_index.max
+        closest = { ds: sbj_datasets[ani_max[1]].name, ani: ani_max[0] }
       end
-      ani = ani_after_aai(sbj_datasets, 80.0)
-      ani_max = ani.map(&:to_f).each_with_index.max
-      closest = { ds: sbj_datasets[ani_max[1]].name, ani: ani_max[0] }
-    end
 
-    # Calculate all the AAIs/ANIs against the closest ANI95-clade (if AAI > 80%)
-    cl_path = res.file_path :clades_ani95
-    if !cl_path.nil? && File.size?(cl_path) && tsk[0] == :clade_finding
-      clades = File.foreach(cl_path).map { |i| i.chomp.split(',') }
-      sbj_dataset_names = clades.find { |i| i.include?(closest[:ds]) }
-      sbj_datasets = sbj_dataset_names&.map { |i| ref_project.dataset(i) }
-      ani_after_aai(sbj_datasets, 80.0) if sbj_datasets
+      # Calculate all the AAIs/ANIs against the closest ANI95-clade
+      # (if AAI > 80%)
+      cl_path = res.file_path :clades_ani95
+      if !cl_path.nil? && File.size?(cl_path) && tsk[0] == :clade_finding
+        clades = File.foreach(cl_path).map { |i| i.chomp.split(',') }
+        sbj_dataset_names = clades.find { |i| i.include?(closest[:ds]) }
+        sbj_datasets = sbj_dataset_names&.map { |i| ref_project.dataset(i) }
+        ani_after_aai(sbj_datasets, 80.0) if sbj_datasets
+      end
     end
 
     # Finalize
     [:haai, :aai, :ani].each { |m| checkpoint! m if db_counts[m] > 0 }
-    build_medoids_tree(tsk[1])
+    build_medoids_tree(tsk[1]) unless opts[:only_domain]
     transfer_taxonomy(tax_test)
   end
 
   # Launch analysis for taxonomy jobs
   def go_taxonomy!
     $stderr.puts 'Launching taxonomy analysis'
-    return unless project.option(:ref_project)
+    return unless opts[:ref_project] || project.option(:ref_project)
 
     go_query! # <- yeah, it's actually the same, just different ref_project
   end
diff --git a/utils/distance/temporal.rb b/utils/distance/temporal.rb
index 21d9b12..d918a21 100644
--- a/utils/distance/temporal.rb
+++ b/utils/distance/temporal.rb
@@ -41,6 +41,7 @@ def checkpoint(metric)
   # Copies temporal databases back to the MiGA Project
   def checkpoint!(metric)
+    return if opts[:only_domain]
     $stderr.puts "Checkpoint (metric = #{metric})"
     # This is simply to test database consistency before overwriting the