Merge pull request #31 from genotoul-bioinfo/add_protein_input

Provide Precomputed Protein Sequences
genotoul-bioinfo · Nov 27, 2024 · 6a9cf4a · 6a9cf4a
2 parents 44a4bac + a04a6c0
commit 6a9cf4a
Show file tree

Hide file tree

Showing 7 changed files with 239 additions and 41 deletions.
diff --git a/.github/workflows/binette_ci.yml b/.github/workflows/binette_ci.yml
@@ -96,3 +96,16 @@ jobs:
         python scripts/compare_results.py expected_results/final_bins_quality_reports.tsv test_results_from_dirs/final_bins_quality_reports.tsv
 
 
+    - name: Run simple test case from bin dirs and with proteins input
+      run: |
+        cd test_data
+        binette -d binning_results/A/ binning_results/B/ binning_results/C/ \
+                --contigs all_contigs.fna --checkm2_db checkm2_tiny_db/checkm2_tiny_db.dmnd  -v -o test_results_from_dirs_and_prot_input --proteins proteins.faa
+
+    - name: Compare results from bin dirs with expectation
+      run: |
+        cd test_data
+        head  expected_results/final_bins_quality_reports.tsv test_results_from_dirs_and_prot_input/final_bins_quality_reports.tsv
+        python scripts/compare_results.py expected_results/final_bins_quality_reports.tsv test_results_from_dirs_and_prot_input/final_bins_quality_reports.tsv
+
+
diff --git a/binette/cds.py b/binette/cds.py
@@ -71,19 +71,59 @@ def write_faa(outfaa: str, contig_to_genes: List[Tuple[str, pyrodigal.Genes]]) -
         for contig_id, genes in contig_to_genes:
             genes.write_translations(fl, contig_id)
 
+
+def is_nucleic_acid(sequence: str) -> bool:
+    """
+    Determines whether the given sequence is a DNA or RNA sequence.
+
+    :param sequence: The sequence to check.
+    :return: True if the sequence is a DNA or RNA sequence, False otherwise.
+    """
+    # Define nucleotidic bases (DNA and RNA)
+    nucleotidic_bases = set('ATCGNUatcgnu')
+
+    # Check if all characters in the sequence are valid nucleotidic bases (DNA or RNA)
+    if all(base in nucleotidic_bases for base in sequence):
+        return True
+
+    # If any character is invalid, return False
+    return False
+
+
+
 def parse_faa_file(faa_file: str) -> Dict[str, List[str]]:
     """
     Parse a FASTA file containing protein sequences and organize them by contig.
 
     :param faa_file: Path to the input FASTA file.
     :return: A dictionary mapping contig names to lists of protein sequences.
+    :raises ValueError: If the file contains nucleotidic sequences instead of protein sequences.
     """
     contig_to_genes = defaultdict(list)
+    checked_sequences = []
+
+    # Iterate through the FASTA file and parse sequences
     for name, seq in pyfastx.Fastx(faa_file):
         contig = get_contig_from_cds_name(name)
         contig_to_genes[contig].append(seq)
+
+        # Concatenate up to the first 20 sequences for validation
+        if len(checked_sequences) < 20:
+            checked_sequences.append(seq)
+
+    # Concatenate all checked sequences for a more reliable nucleic acid check
+    concatenated_seq = "".join(checked_sequences)
+
+    # Check if the concatenated sequence appears to be nucleic acid
+    if is_nucleic_acid(concatenated_seq):
+        raise ValueError(
+            f"The file '{faa_file}' appears to contain nucleotide sequences. "
+            "Ensure that the file contains valid protein sequences in FASTA format."
+        )
 
     return dict(contig_to_genes)
+
+
 
 def get_aa_composition(genes: List[str]) -> Counter:
     """

diff --git a/binette/io_manager.py b/binette/io_manager.py
@@ -167,8 +167,8 @@ def check_contig_consistency(contigs_from_assembly: Iterable[str],
 
     issue_countigs = len(set(contigs_from_elsewhere) - set(contigs_from_assembly))
 
-    message = f"{issue_countigs} contigs found in file {elsewhere_file} \
-                were not found in assembly_file ({assembly_file})."
+    message = (f"{issue_countigs} contigs found in file '{elsewhere_file}' "
+    f"were not found in assembly_file '{assembly_file}'")
     assert are_contigs_consistent, message
 
 

diff --git a/binette/main.py b/binette/main.py
@@ -68,6 +68,21 @@ def __call__(
         setattr(namespace, self.dest, values)
 
 
+def is_valid_file(parser: ArgumentParser, arg: str) -> Path:
+    """
+    Validates that the provided input file exists.
+
+    :param parser: The ArgumentParser instance handling command-line arguments.
+    :param arg: The path to the file provided as an argument.
+    :return: A Path object representing the valid file.
+    """
+    path_arg = Path(arg)
+
+    # Check if the file exists at the provided path
+    if not path_arg.exists():
+        parser.error(f"Error: The specified file '{arg}' does not exist.")
+
+    return path_arg
 
 def parse_arguments(args):
     """Parse script arguments."""
@@ -85,7 +100,7 @@ def parse_arguments(args):
         "-d",
         "--bin_dirs",
         nargs="+",
-        type=Path,
+        type=lambda x: is_valid_file(parser, x),
         action=UniqueStore,
         help="List of bin folders containing each bin in a fasta file.",
     )
@@ -95,12 +110,22 @@ def parse_arguments(args):
         "--contig2bin_tables",
         nargs="+",
         action=UniqueStore,
-        type=Path,
+        type=lambda x: is_valid_file(parser, x),
         help="List of contig2bin table with two columns separated\
             with a tabulation: contig, bin",
     )
 
-    input_group.add_argument("-c", "--contigs", required=True, type=Path, help="Contigs in fasta format.")
+    input_group.add_argument("-c", "--contigs", required=True, 
+                             type=lambda x: is_valid_file(parser, x),
+                               help="Contigs in fasta format.")
+
+    input_group.add_argument(
+        "-p", "--proteins",
+        type=lambda x: is_valid_file(parser, x),
+        help="FASTA file of predicted proteins in Prodigal format (>contigID_geneID). "
+            "Skips the gene prediction step if provided."
+    )
+
 
     # Other parameters category
     other_group = parser.add_argument_group('Other Arguments')
@@ -211,7 +236,9 @@ def parse_input_files(bin_dirs: List[Path],
 
 def manage_protein_alignement(faa_file: Path, contigs_fasta: Path, contig_to_length: Dict[str, int],
                                 contigs_in_bins: Set[str], diamond_result_file: Path,
-                                checkm2_db: Optional[Path], threads: int, resume: bool, low_mem: bool) -> Tuple[Dict[str, int], Dict[str, List[str]]]:
+                                checkm2_db: Optional[Path], threads: int, use_existing_protein_file: bool, 
+                                resume_diamond:bool,
+                                low_mem: bool) -> Tuple[Dict[str, int], Dict[str, List[str]]]:
     """
     Predicts or reuses proteins prediction and runs diamond on them.
     
@@ -222,14 +249,15 @@ def manage_protein_alignement(faa_file: Path, contigs_fasta: Path, contig_to_len
     :param diamond_result_file: The path to the diamond result file.
     :param checkm2_db: The path to the CheckM2 database.
     :param threads: Number of threads for parallel processing.
-    :param resume: Boolean indicating whether to resume the process.
+    :param use_existing_protein_file: Boolean indicating whether to use an existing protein file.
+    :param resume_diamond: Boolean indicating whether to resume diamond alignement.
     :param low_mem: Boolean indicating whether to use low memory mode.
 
     :return: A tuple containing dictionaries - contig_to_kegg_counter and contig_to_genes.
     """
 
     # Predict or reuse proteins prediction and run diamond on them
-    if resume:
+    if use_existing_protein_file:
         logging.info(f"Parsing faa file: {faa_file}.")
         contig_to_genes = cds.parse_faa_file(faa_file.as_posix())
         io.check_contig_consistency(contig_to_length, contig_to_genes, contigs_fasta.as_posix(), faa_file.as_posix())
@@ -238,6 +266,7 @@ def manage_protein_alignement(faa_file: Path, contigs_fasta: Path, contig_to_len
         contigs_iterator = (s for s in contig_manager.parse_fasta_file(contigs_fasta.as_posix()) if s.name in contigs_in_bins)
         contig_to_genes = cds.predict(contigs_iterator, faa_file.as_posix(), threads)
 
+    if not resume_diamond:
         if checkm2_db is None:
             # get checkm2 db stored in checkm2 install
             diamond_db_path = diamond.get_checkm2_db()
@@ -360,8 +389,17 @@ def main():
     # Temporary files #
     out_tmp_dir:Path = args.outdir / "temporary_files"
     os.makedirs(out_tmp_dir, exist_ok=True)
+
+    use_existing_protein_file = False
+
+    if args.proteins:
+        logging.info(f"Using the provided protein sequences file: {args.proteins}")
+        faa_file = args.proteins
+        use_existing_protein_file = True
+    else:
+        faa_file = out_tmp_dir / "assembly_proteins.faa"
+
 
-    faa_file = out_tmp_dir / "assembly_proteins.faa"
     diamond_result_file = out_tmp_dir / "diamond_result.tsv"
 
     # Output files #
@@ -370,13 +408,15 @@ def main():
 
     if args.resume:
         io.check_resume_file(faa_file, diamond_result_file)
+        use_existing_protein_file = True
 
     bin_set_name_to_bins, original_bins, contigs_in_bins, contig_to_length = parse_input_files(args.bin_dirs, args.contig2bin_tables, args.contigs, fasta_extensions=set(args.fasta_extensions))
 
     contig_to_kegg_counter, contig_to_genes = manage_protein_alignement(faa_file=faa_file, contigs_fasta=args.contigs, contig_to_length=contig_to_length,
                                                                         contigs_in_bins=contigs_in_bins,
                                                                         diamond_result_file=diamond_result_file, checkm2_db=args.checkm2_db,
-                                                                        threads=args.threads, resume=args.resume, low_mem=args.low_mem)
+                                                                        threads=args.threads, use_existing_protein_file=use_existing_protein_file, 
+                                                                        resume_diamond=args.resume, low_mem=args.low_mem)
 
     # Use contig index instead of contig name to save memory
     contig_to_index, index_to_contig = contig_manager.make_contig_index(contigs_in_bins)

diff --git a/docs/usage.md b/docs/usage.md
@@ -58,6 +58,20 @@ For example, consider the following two `contig2bin_tables`:
 
 In both formats, the `--contigs` argument should specify a FASTA file containing all the contigs found in the bins. Typically, this file would be the assembly FASTA file used to generate the bins. In these exemple the `assembly.fasta` file should contain at least the five contigs mentioned in the `contig2bin_tables` files or in the bin fasta files: `contig_1`, `contig_8`, `contig_15`, `contig_9`, and `contig_10`.
 
+
+## Providing Precomputed Protein Sequences
+
+You can provide protein sequences in FASTA format to Binette using the `--proteins` argument. The sequence identifiers must follow the Prodigal convention: `<contigID>_<GeneID>`. This naming format ensures proper mapping of each gene to its contig.  
+
+By using this option, the gene prediction step is skipped.  
+
+### Example  
+If your contig is named `contig_A`, the gene identifiers should follow this pattern:  
+- `contig_A_1`  
+- `contig_A_2`  
+- `contig_A_3`  
+
+
 ## Outputs
 
 Binette results are stored in the `results` directory. You can specify a different directory using the `--outdir` option.

diff --git a/tests/cds_test.py b/tests/cds_test.py
@@ -103,9 +103,6 @@ def test_extract_contig_name_from_cds_name():
     assert result == "contig1"
 
 
-# Import the functions write_faa and parse_faa_file here
-
-
 def test_write_faa(contig1, orf_finder):
 
     predicted_genes = orf_finder.find_genes(contig1.seq)
@@ -122,10 +119,11 @@ def test_write_faa(contig1, orf_finder):
 
 
 def test_parse_faa_file(tmp_path):
-    # Mock a FASTA file
+    # Mock a FASTA file of protein sequences
+    # at least one protein sequence to not triger the error
     fasta_content = (
         ">contig1_gene1\n"
-        "AAAAAAAAAAA\n"
+        "MPPPAOSKNSKSS\n" 
         ">contig1_gene2\n"
         "CCCCCCCCCCC\n"
         ">contig2_gene1\n"
@@ -139,11 +137,32 @@ def test_parse_faa_file(tmp_path):
 
     # Check if the output matches the expected dictionary
     expected_result = {
-        'contig1': ['AAAAAAAAAAA', 'CCCCCCCCCCC'],
+        'contig1': ['MPPPAOSKNSKSS', 'CCCCCCCCCCC'],
         'contig2': ['TTTTTTTTTTTT']
     }
     assert result == expected_result
 
+
+def test_parse_faa_file_raises_error_for_dna(tmp_path):
+    # Mock a DNA FASTA file
+    fasta_content = (
+        ">contig1_gene1\n"
+        "AAAAAAAAAAA\n"
+        ">contig1_gene2\n"
+        "CCCCCCCCCCC\n"
+        ">contig2_gene1\n"
+        "TTTTTTTTTTTT\n"
+    )
+    fna_file = tmp_path / "mock_file.fna"
+    fna_file.write_text(fasta_content)
+
+
+    # Check that ValueError is raised when DNA sequences are encountered
+    with pytest.raises(ValueError):
+        cds.parse_faa_file(fna_file)
+
+
+
 def test_get_aa_composition():
 
     genes = ['AAAA',
@@ -175,4 +194,24 @@ def test_get_contig_cds_metadata():
 
     assert contig_metadata['contig_to_cds_count'] == {"c1":3, "c2":2}
     assert contig_metadata['contig_to_aa_counter'] == {"c1": {'A': 4, 'G': 4, "C":4} , "c2":{'C': 4, 'T': 4}}
-    assert contig_metadata['contig_to_aa_length'] == {"c1":12, "c2":8}
+    assert contig_metadata['contig_to_aa_length'] == {"c1":12, "c2":8}
+
+
+
+# Test function
+def test_is_nucleic_acid():
+    # Valid DNA sequence
+    assert cds.is_nucleic_acid("ATCG") is True
+    assert cds.is_nucleic_acid("ATCNNNNNG") is True # N can be found in DNA seq
+    # Valid RNA sequence
+    assert cds.is_nucleic_acid("AUGCAUGC") is True
+
+    # Mixed case
+    assert cds.is_nucleic_acid("AtCg") is True
+
+    # Invalid sequence (contains characters not part of DNA or RNA)
+    assert cds.is_nucleic_acid("ATCX") is False  # 'X' is not a valid base
+    assert cds.is_nucleic_acid("AUG#C") is False  # '#' is not a valid base
+
+    # Amino acid sequence
+    assert cds.is_nucleic_acid("MSIRGVGGNGNSR") is False  # Numbers are invalid