From b0b55f78164de49889467dd7ef097e3f65fe5de7 Mon Sep 17 00:00:00 2001
From: "A. Murat Eren" <a.murat.eren@gmail.com>
Date: Tue, 11 Apr 2017 12:07:18 -0500
Subject: [PATCH] anvi-export-pc-alignments. closes #494.

this is in case the user does not want to summarize thier pan database
and/or want to recover aligned gene sequences in protein clusters in an
ad hoc manner.
---
 bin/anvi-export-pc-alignments | 132 ++++++++++++++++++++++++++++++++++
 tests/run_pangenome_tests.sh  |   3 +
 2 files changed, 135 insertions(+)
 create mode 100755 bin/anvi-export-pc-alignments

diff --git a/bin/anvi-export-pc-alignments b/bin/anvi-export-pc-alignments
new file mode 100755
index 0000000000..88f678b6b4
--- /dev/null
+++ b/bin/anvi-export-pc-alignments
@@ -0,0 +1,132 @@
+#!/usr/bin/env python
+# -*- coding: utf-8
+"""Export aligned sequences from anvi'o pan genomes"""
+
+import os
+import sys
+import argparse
+
+import anvio
+import anvio.dbops as dbops
+import anvio.utils as utils
+import anvio.terminal as terminal
+import anvio.filesnpaths as filesnpaths
+import anvio.summarizer as summarizer
+
+from anvio.errors import ConfigError, FilesNPathsError, DictIOError, SamplesError, HDF5Error
+
+
+__author__ = "A. Murat Eren"
+__copyright__ = "Copyright 2016, The anvio Project"
+__credits__ = []
+__license__ = "GPL 3.0"
+__version__ = anvio.__version__
+__maintainer__ = "A. Murat Eren"
+__email__ = "a.murat.eren@gmail.com"
+
+
+run = terminal.Run()
+progress = terminal.Progress()
+
+
+def main(args):
+    if args.pc_id and args.pc_ids_file:
+        raise ConfigError('You should either declare a single PC name, or PC names in a file')
+
+    if (args.pc_id or args.pc_ids_file) and args.collection_name:
+        raise ConfigError('You can either declare specific list of PCs to work with (through `--pc-id` or `--pc-ids-file`) or\
+                           go the collection way using parameters `--collection-name` and `--bin-name`. Those are not to be\
+                           mixed. If you need to know what collections are available in the pan database, use the flag \
+                           `--list-collections`.')
+
+    if not args.output_file:
+        args.output_file = os.path.join(os.path.dirname(os.path.abspath(args.pan_db)), 'protein-cluster-alignments-output-with-an-ugly-name.fa')
+
+    filesnpaths.is_output_file_writable(args.output_file)
+
+    pc_ids = set([])
+    if (args.pc_id or args.pc_ids_file):
+        if args.pc_id:
+            pc_ids = set([args.pc_id])
+            run.info('Mode', 'Reporting alignments for a single protein cluster.')
+        else:
+            columns = utils.get_columns_of_TAB_delim_file(args.pc_ids_file, include_first_column=True)
+            if len(columns) != 1:
+                raise ConfigError("The input file for PC IDs must contain a single column. It seems yours has %d :/" % len(columns))
+
+            pc_ids = set([p.strip('\n') for p in open(args.pc_ids_file, 'rU').readlines()])
+            run.info('Mode', 'Reporting alignments for a list of protein clusters from an input file.')
+    elif args.collection_name:
+        pan = summarizer.PanSummarizer(args, r=terminal.Run(verbose=False), p=terminal.Progress(verbose=False))
+
+        if not args.bin_id:
+            raise ConfigError("When you use a collection name, you must also declare a bin id :/ You don't know what bin you want? Use the flag\
+                               `--list-bins`.")
+
+        pan.collections.is_bin_in_collection(collection_name=args.collection_name, bin_name=args.bin_id)
+        collection_dict = pan.collections.get_collection_dict(args.collection_name)
+        pc_ids = set(collection_dict[args.bin_id])
+
+        run.info('Mode', 'Reporting alignments for a protein clusters from in the collection %s and bin %s.' % (args.collection_name, args.bin_id))
+    else:
+        run.info('Mode', 'Reporting alignments for all protein clusters.')
+
+    pan = dbops.PanSuperclass(args)
+    pan.init_protein_clusters()
+
+    if not pc_ids:
+        run.warning('By not specifying any criteria for protein cluster names to be reported, you elected to report everything.')
+
+        pc_ids = pan.protein_cluster_names
+
+        if len(pc_ids) > 2500:
+            run.warning('Congratulations. You have like a lot of PCs in this database. Maybe it is a good time to get a coffee or something.')
+
+    run.info('Number of protein clusters to report', len(pc_ids))
+
+    pan.get_AA_sequences_for_PCs(pc_names=pc_ids, output_file_path=args.output_file)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description="Export aligned sequences from anvi'o pan genomes")
+
+    groupA = parser.add_argument_group('INPUT FILES', "Input files from the pangenome analysis.")
+    groupA.add_argument(*anvio.A('pan-db'), **anvio.K('pan-db', {'required': False}))
+    groupA.add_argument(*anvio.A('genomes-storage'), **anvio.K('genomes-storage', {'required': False}))
+
+    groupB = parser.add_argument_group('OUTPUT FILE', "You get to chose an output file name to report things. The default will be\
+                                       an ugly name. So, be explicit.")
+    groupB.add_argument(*anvio.A('output-file'), **anvio.K('output-file'))
+
+    groupC = parser.add_argument_group('SELECTION', "Which protein clusters should be exported. You can ask for a single PC,\
+                                       or multiple ones listed in a file, or you can use a collection and bin name to list PCs\
+                                       of interest. If you give nothing, this program will export alignments for every single PC\
+                                       found in the profile database (and this is called 'customer service').")
+    groupC.add_argument(*anvio.A('pc-id'), **anvio.K('pc-id'))
+    groupC.add_argument(*anvio.A('pc-ids-file'), **anvio.K('pc-ids-file'))
+    groupC.add_argument(*anvio.A('collection-name'), **anvio.K('collection-name'))
+    groupC.add_argument(*anvio.A('bin-id'), **anvio.K('bin-id'))
+
+    groupD = parser.add_argument_group('OTHER STUFF', "Yes. Stuff that are not like the ones above.")
+    groupD.add_argument(*anvio.A('list-collections'), **anvio.K('list-collections'))
+    groupD.add_argument(*anvio.A('list-bins'), **anvio.K('list-bins'))
+
+    args = parser.parse_args()
+
+    try:
+        main(args)
+    except ConfigError as e:
+        print(e)
+        sys.exit(-1)
+    except FilesNPathsError as e:
+        print(e)
+        sys.exit(-2)
+    except DictIOError as e:
+        print(e)
+        sys.exit(-3)
+    except SamplesError as e:
+        print(e)
+        sys.exit(-4)
+    except HDF5Error as e:
+        print(e)
+        sys.exit(-5)
diff --git a/tests/run_pangenome_tests.sh b/tests/run_pangenome_tests.sh
index 38b13485cb..0c03d734e4 100755
--- a/tests/run_pangenome_tests.sh
+++ b/tests/run_pangenome_tests.sh
@@ -53,5 +53,8 @@ anvi-summarize -p TEST/TEST-PAN.db -g TEST-GENOMES.h5 -C test_collection -o TEST
 INFO "Displaying the initial pangenome analysis results"
 anvi-display-pan -p TEST/TEST-PAN.db -s TEST/TEST-SAMPLES.db -g TEST-GENOMES.h5 --title "A mock pangenome analysis"
 
+INFO "Exporting aligned seqeunces for some protein clusters"
+anvi-export-pc-alignments -p TEST/TEST-PAN.db -g TEST-GENOMES.h5 -C test_collection -b PCB_1_CORE -o aligned_gene_sequences_in_PCB_1_CORE.fa
+
 INFO "Displaying the second pangenome analysis results"
 anvi-display-pan -p TEST/ANOTHER_TEST-PAN.db -s TEST/ANOTHER_TEST-SAMPLES.db -g TEST-GENOMES.h5 --title "A mock pangenome analysis (with --min-occurrence 2)"