Implement pre-classification for quality
- Close #157
- Close #160
lmrodriguezr committed Apr 25, 2023
1 parent 3b876d9 commit 81efea3
Showing 10 changed files with 135 additions and 78 deletions.
2 changes: 1 addition & 1 deletion lib/miga/cli/action/db.rb
@@ -9,7 +9,7 @@ def parse_cli
cli.defaults = {
database: :recommended,
version: :latest,
local: File.expand_path('.miga_db', ENV['MIGA_HOME']),
local: File.join(ENV['MIGA_HOME'], '.miga_db'),
host: MiGA::MiGA.known_hosts(:miga_db),
pb: true,
reuse_archive: false,
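
The default local database location is now built with File.join instead of File.expand_path. The two calls differ in how they treat the base directory; a minimal Ruby comparison (illustrative only, the commit itself does not state the motivation):

ENV['MIGA_HOME'] = '~'                          # hypothetical value
File.expand_path('.miga_db', ENV['MIGA_HOME'])  # => "/home/alice/.miga_db" (tilde expanded, path absolutized)
File.join(ENV['MIGA_HOME'], '.miga_db')         # => "~/.miga_db" (base kept verbatim)
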
15 changes: 10 additions & 5 deletions lib/miga/cli/action/rm.rb
@@ -7,18 +7,23 @@ class MiGA::Cli::Action::Rm < MiGA::Cli::Action
def parse_cli
cli.defaults = { remove: false }
cli.parse do |opt|
cli.opt_object(opt)
cli.opt_object(opt, %i[project dataset_opt result_opt])
opt.on(
'-r', '--remove',
'-R', '--remove',
'Also remove all associated files',
'By default, only unlinks from metadata'
) { |v| cli[:remove] = v }
end
end

def perform
d = cli.load_dataset
cli.load_project.unlink_dataset(d.name)
d.remove! if cli[:remove]
if r = cli.load_result
cli[:remove] ? r.remove! : r.unlink
elsif d = cli.load_dataset
cli.load_project.unlink_dataset(d.name)
d.remove! if cli[:remove]
else
raise "You must define one of --result or --dataset"
end
end
end
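
The short flag for --remove changes from -r to -R (presumably freeing -r for the new result selector), and rm can now target a single result as well as a dataset, with the result taking precedence. A condensed Ruby sketch of the new behavior (simplified from the diff above, not the full CLI action):

def remove_target(cli)
  if (r = cli.load_result)         # a result was requested: unlink it, or delete its files with -R
    cli[:remove] ? r.remove! : r.unlink
  elsif (d = cli.load_dataset)     # otherwise fall back to the whole dataset
    cli.load_project.unlink_dataset(d.name)
    d.remove! if cli[:remove]
  else
    raise 'You must define one of --result or --dataset'
  end
end
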
6 changes: 6 additions & 0 deletions lib/miga/taxonomy.rb
@@ -137,6 +137,12 @@ def namespace
self[:ns]
end

##
# Domain of the taxonomy (a String) or +nil+
def domain
self[:d]
end

##
# Get the most general rank as a two-entry Array (rank and value).
# If +force_ranks+ is true, it always returns the value for domain (d)
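
A short usage sketch of the new accessor; the constructor form mirrors the array of "rank:value" strings assembled in utils/distance/pipeline.rb below (values are hypothetical):

tax = MiGA::Taxonomy.new(['d:Bacteria', 'p:Bacteroidota'])
tax.domain     # => "Bacteria"
tax.namespace  # => nil (no ns entry was set)
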
40 changes: 24 additions & 16 deletions scripts/aai_distances.bash
@@ -10,23 +10,31 @@ DIR="$PROJECT/data/09.distances/02.aai"
miga_start_project_step "$DIR"

# Extract values
rm -f miga-project.txt
SQL="SELECT seq1, seq2, aai, sd, n, omega from aai;"
DS=$(miga ls -P "$PROJECT" --ref --no-multi --active)
(
echo "a b value sd n omega" | tr " " "\\t"
for i in $DS ; do
echo "$SQL" | sqlite3 "$DIR/$i.db" | tr "\\|" "\\t"
function foreach_database_aai {
local SQL="SELECT seq1, seq2, aai, sd, n, omega from aai;"
local k=0
while [[ -n ${DS[$k]} ]] ; do
echo "$SQL" | sqlite3 "$DIR/${DS[$k]}.db" | tr "\\|" "\\t"
let k=$k+1
done
# The following block pipes the retrieved data from all databases, reorders the
# names into canonical order, and removes repeats from the first two columns,
# so that only one result per pair is kept. It is not included in production,
# but the code may be useful for extremely large databases.
# | tee \
# | awk -F"\t" \
# 'BEGIN { OFS="\t" } { if($1 > $2) { a=$1; $1=$2; $2=a; } } { print $0 }' \
# | sort -k 1,2 -u
) | gzip -9c > miga-project.txt.gz
}

function aai_tsv {
DS=($(miga ls -P "$PROJECT" --ref --no-multi --active))
echo "a b value sd n omega" | tr " " "\\t"
if [[ ${#DS[@]} -gt 40000 ]] ; then
# Use comparisons in strictly one direction only for huge projects
foreach_database_aai \
| awk -F"\t" 'BEGIN { OFS="\t" }
{ if ($1 > $2) { a=$1; $1=$2; $2=a; } } { print $0 }' \
| sort -k 1,2 -u
else
foreach_database_aai
fi
}

rm -f "miga-project.txt"
aai_tsv | gzip -9c > "miga-project.txt.gz"

# R-ify
cat <<R | R --vanilla
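
For projects above 40,000 datasets the awk/sort step keeps a single row per unordered pair of genomes: each pair is rewritten into canonical order and repeats are dropped on the first two columns. The same idea as a small Ruby sketch (illustrative only, not part of the pipeline):

# rows: [[a, b, value, sd, n, omega], ...] as produced by foreach_database_aai
def canonical_pairs(rows)
  rows
    .map { |a, b, *rest| a > b ? [b, a, *rest] : [a, b, *rest] } # put each pair in canonical order
    .uniq { |a, b, _| [a, b] }                                   # keep one row per pair
end
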
2 changes: 1 addition & 1 deletion scripts/distances.bash
@@ -9,7 +9,7 @@ cd "$PROJECT/data/09.distances"
# Initialize
miga date > "$DATASET.start"

# Check quality first
# Check quality
miga stats -P "$PROJECT" -D "$DATASET" -r essential_genes --compute-and-save
inactive=$(miga ls -P "$PROJECT" -D "$DATASET" -m inactive | cut -f 2)
[[ "$inactive" == "true" ]] && exit
24 changes: 21 additions & 3 deletions scripts/essential_genes.bash
@@ -12,7 +12,7 @@ FAA="../../../06.cds/${DATASET}.faa"
[[ -s "$FAA" ]] || FAA="${FAA}.gz"

# Check if there are any proteins
if [[ ! -s $FAA ]] ; then
if [[ ! -s "$FAA" ]] ; then
echo Empty protein set, bypassing essential genes
rm "${DATASET}.start"
miga edit -P "$PROJECT" -D "$DATASET" -m run_essential_genes=false
@@ -35,13 +35,31 @@ HMM.essential.rb \
-t "$CORES" -r "$DATASET" --collection "$COLL" $FLAGS \
> "${DATASET}.ess/log"

# Index for FastAAI
# Index for FastAAI and classify (if needed and possible)
NOMULTI=$(miga ls -P "$PROJECT" -D "$DATASET" --no-multi \
| wc -l | awk '{print $1}')
[[ "$NOMULTI" -eq "1" ]] && \
if [[ "$NOMULTI" -eq "1" ]] ; then
python3 "$MIGA/utils/FastAAI/fastaai/fastaai_miga_preproc.py" \
--protein "$FAA" --output_crystal "${DATASET}.crystal" \
--compress

# Classify
DOMAIN=$(miga ls -P "$PROJECT" -D "$DATASET" -m tax:d | cut -f 2)
if [[ "$DOMAIN" == "?" ]] ; then
REF_PROJ=$(miga db --list-local -n Phyla_Lite --tab | tail -n +2 | cut -f 5)
echo "Phylum-level classification against $REF_PROJ"
if [[ -n "$REF_PROJ" ]] ; then
cp "${DATASET}.start" "${DATASET}.start.bak"
miga date > "${DATASET}.done"
miga add_result -P "$PROJECT" -D "$DATASET" -r "$SCRIPT" -f
ruby -I "$MIGA/lib" \
"$MIGA/utils/distances.rb" "$PROJECT" "$DATASET" \
run_taxonomy=1 only_domain=1 "ref_project=$REF_PROJ"
mv "${DATASET}.start.bak" "${DATASET}.start"
rm "${DATASET}.done" "${DATASET}.json"
fi
fi
fi

# Reduce files
if exists "$DATASET".ess/*.faa ; then
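
The new block is the pre-classification itself: when the dataset has no domain assigned (tax:d reported as "?"), the script temporarily backs up the .start marker, registers the essential-genes result as done, runs utils/distances.rb against the locally installed Phyla_Lite reference project with run_taxonomy=1 and only_domain=1, and then restores the marker and removes the temporary .done and .json files. A hedged Ruby sketch of the gating test (the domain accessor comes from lib/miga/taxonomy.rb above; the exact dataset API shape is an assumption):

def needs_domain_preclassification?(dataset)
  tax = dataset.metadata[:tax]   # MiGA::Taxonomy or nil
  tax.nil? || tax.domain.nil?    # no taxonomy at all, or no d: entry yet
end
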
31 changes: 24 additions & 7 deletions utils/distance/database.rb
@@ -1,4 +1,4 @@
require 'sqlite3'
require 'miga/sqlite'

module MiGA::DistanceRunner::Database
##
@@ -11,6 +11,8 @@ def initialize_dbs!(for_ref)
{ haai: :aai, aai: :aai, ani: :ani }.each do |m, t|
@db_counts[m] = 0
@dbs[m] = for_ref ? ref_db(m) : query_db(m)
@tmp_dbs[m] = tmp_file("#{m}.db")

# Remove if corrupt
if File.size?(dbs[m])
begin
@@ -21,20 +23,21 @@ def initialize_dbs!(for_ref)
FileUtils.rm dbs[m]
end
end
# Initialize if it doesn't exist
unless File.size? dbs[m]
SQLite3::Database.new(dbs[m]) do |conn|

# Initialize if it doesn't exist, copy otherwise
if File.size? dbs[m]
FileUtils.cp(dbs[m], tmp_dbs[m])
else
SQLite3::Database.new(tmp_dbs[m]) do |conn|
conn.execute <<~SQL
create table if not exists #{t}(
seq1 varchar(256), seq2 varchar(256),
#{t} float, sd float, n int, omega int
)
SQL
end
FileUtils.cp(tmp_dbs[m], dbs[m]) unless opts[:only_domain]
end
# Copy over to (local) temporals
@tmp_dbs[m] = tmp_file("#{m}.db")
FileUtils.cp(dbs[m], tmp_dbs[m])
end
end

@@ -157,6 +160,20 @@ def batch_data_from_db(metric)
conn.execute(sql).each { |row| data[row.shift] = row }
end
data
rescue => e
$stderr.puts "Database file: #{db}" if db ||= nil
raise e
end

##
# Retrieve the name and AAI of the closest relative from the AAI database
def closest_relative
db = tmp_dbs[:aai]
sql = 'select seq2, aai from aai order by aai desc limit 1'
MiGA::SQLite.new(db).run(sql).first
rescue => e
$stderr.puts "Database file: #{db}" if db ||= nil
raise e
end

##
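
closest_relative reads straight from the temporal AAI database, so it also works in only_domain mode, where the project-level databases are never updated. A usage sketch of the same query through MiGA::SQLite (path and values hypothetical):

require 'miga/sqlite'

db  = '/tmp/example/aai.db'  # temporal database created by the runner
sql = 'select seq2, aai from aai order by aai desc limit 1'
name, aai = MiGA::SQLite.new(db).run(sql).first
# => e.g. ["GCA_000123456", 82.5] (best hit and its AAI)
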
30 changes: 12 additions & 18 deletions utils/distance/pipeline.rb
@@ -72,35 +72,28 @@ def tax_test
$stderr.puts "Testing taxonomy | opts = #{opts}"
# Get taxonomy of closest relative
from_ref_project = (project != ref_project)
res_dir =
from_ref_project ?
File.expand_path('data/09.distances/05.taxonomy', project.path) :
home
Dir.mkdir res_dir unless Dir.exist? res_dir
File.open(File.expand_path("#{dataset.name}.done", res_dir), 'w') do |fh|
fh.puts Time.now.to_s
end
dataset.add_result(from_ref_project ? :taxonomy : :distances, true)
cr = dataset.closest_relatives(1, from_ref_project)
cr = closest_relative
return if cr.nil? or cr.empty?

tax = ref_project.dataset(cr[0][0]).metadata[:tax] || {}
tax = ref_project.dataset(cr[0]).metadata[:tax] || {}

# Run the test for each rank
tax_test = MiGA::TaxDist.aai_pvalues(cr[0][1], :intax, engine: opts[:aai_p])
tax_test = MiGA::TaxDist.aai_pvalues(cr[1], :intax, engine: opts[:aai_p])
r = tax_test.map do |k, v|
sig = ''
[0.5, 0.1, 0.05, 0.01].each { |i| sig << '*' if v < i }
[MiGA::Taxonomy.LONG_RANKS[k], (tax[k] || '?'), v, sig]
end

# Save test
File.open(File.expand_path("#{dataset.name}.intax.txt", home), 'w') do |fh|
fh.puts "Closest relative: #{cr[0][0]} with AAI: #{cr[0][1]}."
fh.puts ''
fh.puts MiGA::MiGA.tabulate(%w[Rank Taxonomy P-value Signif.], r)
fh.puts ''
fh.puts 'Significance at p-value below: *0.5, **0.1, ***0.05, ****0.01.'
unless opts[:only_domain]
File.open(File.join(home, "#{dataset.name}.intax.txt"), 'w') do |fh|
fh.puts "Closest relative: #{cr[0]} with AAI: #{cr[1]}."
fh.puts ''
fh.puts MiGA::MiGA.tabulate(%w[Rank Taxonomy P-value Signif.], r)
fh.puts ''
fh.puts 'Significance at p-value below: *0.5, **0.1, ***0.05, ****0.01.'
end
end
return r
end
@@ -115,6 +108,7 @@ def transfer_taxonomy(tax)
.select { |i| i[1] != '?' && i[2] <= pval }
.map { |i| i[0, 2].join(':') }
dataset.metadata[:tax] = MiGA::Taxonomy.new(tax_a)
$stderr.puts " > #{dataset.metadata[:tax]}"
dataset.save
end
end
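
The significance column in the intax report counts how many of the fixed p-value cutoffs the test clears. The same mapping as a standalone sketch, with exactly the cutoffs used in tax_test above:

def significance_stars(p)
  sig = ''
  [0.5, 0.1, 0.05, 0.01].each { |cutoff| sig << '*' if p < cutoff }
  sig
end

significance_stars(0.03) # => "***" (below 0.5, 0.1, and 0.05, but not 0.01)
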
62 changes: 35 additions & 27 deletions utils/distance/runner.rb
@@ -12,8 +12,9 @@ def initialize(project_path, dataset_name, opts_hash = {})
@home = File.expand_path('data/09.distances', project.path)

# Default opts
if opts[:run_taxonomy] && project.option(:ref_project)
ref_path = project.option(:ref_project)
if opts[:run_taxonomy] &&
(opts[:ref_project] || project.option(:ref_project))
ref_path = opts[:ref_project] || project.option(:ref_project)
@home = File.expand_path('05.taxonomy', @home)
@ref_project = MiGA::Project.load(ref_path)
raise "Cannot load reference project: #{ref_path}" if @ref_project.nil?
@@ -73,48 +74,55 @@ def go_query!
# Initialize the databases
initialize_dbs! false
distances_by_request(tsk[1])

# Calculate the classification-informed AAI/ANI traverse
results = File.expand_path("#{dataset.name}.#{tsk[1]}-medoids.tsv", home)
fh = File.open(results, 'w')
tmp_results = tmp_file("#{tsk[1]}-medoids.tsv")
fh = File.open(tmp_results, 'w')
classif, val_cls = *classify(res.dir, '.', tsk[1], fh)
fh.close

# Calculate all the AAIs/ANIs against the lowest subclade (if classified)
par_dir = File.dirname(File.expand_path(classif, res.dir))
par = File.expand_path('miga-project.classif', par_dir)
closest = { dataset: nil, ani: 0.0 }
sbj_datasets = []
if File.size? par
File.open(par, 'r') do |fh|
fh.each_line do |ln|
r = ln.chomp.split("\t")
sbj_datasets << ref_project.dataset(r[0]) if r[1].to_i == val_cls
unless opts[:only_domain]
results = File.join(home, "#{dataset.name}.#{tsk[1]}-medoids.tsv")
FileUtils.move(tmp_results, results)

# Calculate all the AAIs/ANIs against the lowest subclade (if classified)
par_dir = File.dirname(File.expand_path(classif, res.dir))
par = File.expand_path('miga-project.classif', par_dir)
closest = { dataset: nil, ani: 0.0 }
sbj_datasets = []
if File.size? par
File.open(par, 'r') do |fh|
fh.each_line do |ln|
r = ln.chomp.split("\t")
sbj_datasets << ref_project.dataset(r[0]) if r[1].to_i == val_cls
end
end
ani = ani_after_aai(sbj_datasets, 80.0)
ani_max = ani.map(&:to_f).each_with_index.max
closest = { ds: sbj_datasets[ani_max[1]].name, ani: ani_max[0] }
end
ani = ani_after_aai(sbj_datasets, 80.0)
ani_max = ani.map(&:to_f).each_with_index.max
closest = { ds: sbj_datasets[ani_max[1]].name, ani: ani_max[0] }
end

# Calculate all the AAIs/ANIs against the closest ANI95-clade (if AAI > 80%)
cl_path = res.file_path :clades_ani95
if !cl_path.nil? && File.size?(cl_path) && tsk[0] == :clade_finding
clades = File.foreach(cl_path).map { |i| i.chomp.split(',') }
sbj_dataset_names = clades.find { |i| i.include?(closest[:ds]) }
sbj_datasets = sbj_dataset_names&.map { |i| ref_project.dataset(i) }
ani_after_aai(sbj_datasets, 80.0) if sbj_datasets
# Calculate all the AAIs/ANIs against the closest ANI95-clade
# (if AAI > 80%)
cl_path = res.file_path :clades_ani95
if !cl_path.nil? && File.size?(cl_path) && tsk[0] == :clade_finding
clades = File.foreach(cl_path).map { |i| i.chomp.split(',') }
sbj_dataset_names = clades.find { |i| i.include?(closest[:ds]) }
sbj_datasets = sbj_dataset_names&.map { |i| ref_project.dataset(i) }
ani_after_aai(sbj_datasets, 80.0) if sbj_datasets
end
end

# Finalize
[:haai, :aai, :ani].each { |m| checkpoint! m if db_counts[m] > 0 }
build_medoids_tree(tsk[1])
build_medoids_tree(tsk[1]) unless opts[:only_domain]
transfer_taxonomy(tax_test)
end

# Launch analysis for taxonomy jobs
def go_taxonomy!
$stderr.puts 'Launching taxonomy analysis'
return unless project.option(:ref_project)
return unless opts[:ref_project] || project.option(:ref_project)

go_query! # <- yeah, it's actually the same, just different ref_project
end
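
The runner now accepts ref_project and only_domain directly in its options, so the pre-classification call from essential_genes.bash no longer depends on the project-level ref_project setting. A sketch of the options that call effectively sets (paths and option types are assumptions, not shown in this diff):

opts = {
  run_taxonomy: true,
  only_domain: true,                  # skip medoid trees, the intax report, and checkpoints
  ref_project: '/path/to/Phyla_Lite'  # hypothetical local path to the reference project
}
runner = MiGA::DistanceRunner.new('/path/to/project', 'my_dataset', opts)
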
1 change: 1 addition & 0 deletions utils/distance/temporal.rb
@@ -41,6 +41,7 @@ def checkpoint(metric)

# Copies temporal databases back to the MiGA Project
def checkpoint!(metric)
return if opts[:only_domain]
$stderr.puts "Checkpoint (metric = #{metric})"

# This is simply to test database consistency before overwriting the
