Skip to content

Commit

Permalink
Merge pull request #31 from genotoul-bioinfo/add_protein_input
Browse files Browse the repository at this point in the history
Provide Precomputed Protein Sequences
  • Loading branch information
JeanMainguy authored Nov 27, 2024
2 parents 44a4bac + a04a6c0 commit 6a9cf4a
Show file tree
Hide file tree
Showing 7 changed files with 239 additions and 41 deletions.
13 changes: 13 additions & 0 deletions .github/workflows/binette_ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -96,3 +96,16 @@ jobs:
python scripts/compare_results.py expected_results/final_bins_quality_reports.tsv test_results_from_dirs/final_bins_quality_reports.tsv
- name: Run simple test case from bin dirs and with proteins input
run: |
cd test_data
binette -d binning_results/A/ binning_results/B/ binning_results/C/ \
--contigs all_contigs.fna --checkm2_db checkm2_tiny_db/checkm2_tiny_db.dmnd -v -o test_results_from_dirs_and_prot_input --proteins proteins.faa
- name: Compare results from bin dirs with expectation
run: |
cd test_data
head expected_results/final_bins_quality_reports.tsv test_results_from_dirs_and_prot_input/final_bins_quality_reports.tsv
python scripts/compare_results.py expected_results/final_bins_quality_reports.tsv test_results_from_dirs_and_prot_input/final_bins_quality_reports.tsv
40 changes: 40 additions & 0 deletions binette/cds.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,19 +71,59 @@ def write_faa(outfaa: str, contig_to_genes: List[Tuple[str, pyrodigal.Genes]]) -
for contig_id, genes in contig_to_genes:
genes.write_translations(fl, contig_id)


def is_nucleic_acid(sequence: str) -> bool:
"""
Determines whether the given sequence is a DNA or RNA sequence.
:param sequence: The sequence to check.
:return: True if the sequence is a DNA or RNA sequence, False otherwise.
"""
# Define nucleotidic bases (DNA and RNA)
nucleotidic_bases = set('ATCGNUatcgnu')

# Check if all characters in the sequence are valid nucleotidic bases (DNA or RNA)
if all(base in nucleotidic_bases for base in sequence):
return True

# If any character is invalid, return False
return False



def parse_faa_file(faa_file: str) -> Dict[str, List[str]]:
"""
Parse a FASTA file containing protein sequences and organize them by contig.
:param faa_file: Path to the input FASTA file.
:return: A dictionary mapping contig names to lists of protein sequences.
:raises ValueError: If the file contains nucleotidic sequences instead of protein sequences.
"""
contig_to_genes = defaultdict(list)
checked_sequences = []

# Iterate through the FASTA file and parse sequences
for name, seq in pyfastx.Fastx(faa_file):
contig = get_contig_from_cds_name(name)
contig_to_genes[contig].append(seq)

# Concatenate up to the first 20 sequences for validation
if len(checked_sequences) < 20:
checked_sequences.append(seq)

# Concatenate all checked sequences for a more reliable nucleic acid check
concatenated_seq = "".join(checked_sequences)

# Check if the concatenated sequence appears to be nucleic acid
if is_nucleic_acid(concatenated_seq):
raise ValueError(
f"The file '{faa_file}' appears to contain nucleotide sequences. "
"Ensure that the file contains valid protein sequences in FASTA format."
)

return dict(contig_to_genes)



def get_aa_composition(genes: List[str]) -> Counter:
"""
Expand Down
4 changes: 2 additions & 2 deletions binette/io_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,8 +167,8 @@ def check_contig_consistency(contigs_from_assembly: Iterable[str],

issue_countigs = len(set(contigs_from_elsewhere) - set(contigs_from_assembly))

message = f"{issue_countigs} contigs found in file {elsewhere_file} \
were not found in assembly_file ({assembly_file})."
message = (f"{issue_countigs} contigs found in file '{elsewhere_file}' "
f"were not found in assembly_file '{assembly_file}'")
assert are_contigs_consistent, message


Expand Down
56 changes: 48 additions & 8 deletions binette/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,21 @@ def __call__(
setattr(namespace, self.dest, values)


def is_valid_file(parser: ArgumentParser, arg: str) -> Path:
"""
Validates that the provided input file exists.
:param parser: The ArgumentParser instance handling command-line arguments.
:param arg: The path to the file provided as an argument.
:return: A Path object representing the valid file.
"""
path_arg = Path(arg)

# Check if the file exists at the provided path
if not path_arg.exists():
parser.error(f"Error: The specified file '{arg}' does not exist.")

return path_arg

def parse_arguments(args):
"""Parse script arguments."""
Expand All @@ -85,7 +100,7 @@ def parse_arguments(args):
"-d",
"--bin_dirs",
nargs="+",
type=Path,
type=lambda x: is_valid_file(parser, x),
action=UniqueStore,
help="List of bin folders containing each bin in a fasta file.",
)
Expand All @@ -95,12 +110,22 @@ def parse_arguments(args):
"--contig2bin_tables",
nargs="+",
action=UniqueStore,
type=Path,
type=lambda x: is_valid_file(parser, x),
help="List of contig2bin table with two columns separated\
with a tabulation: contig, bin",
)

input_group.add_argument("-c", "--contigs", required=True, type=Path, help="Contigs in fasta format.")
input_group.add_argument("-c", "--contigs", required=True,
type=lambda x: is_valid_file(parser, x),
help="Contigs in fasta format.")

input_group.add_argument(
"-p", "--proteins",
type=lambda x: is_valid_file(parser, x),
help="FASTA file of predicted proteins in Prodigal format (>contigID_geneID). "
"Skips the gene prediction step if provided."
)


# Other parameters category
other_group = parser.add_argument_group('Other Arguments')
Expand Down Expand Up @@ -211,7 +236,9 @@ def parse_input_files(bin_dirs: List[Path],

def manage_protein_alignement(faa_file: Path, contigs_fasta: Path, contig_to_length: Dict[str, int],
contigs_in_bins: Set[str], diamond_result_file: Path,
checkm2_db: Optional[Path], threads: int, resume: bool, low_mem: bool) -> Tuple[Dict[str, int], Dict[str, List[str]]]:
checkm2_db: Optional[Path], threads: int, use_existing_protein_file: bool,
resume_diamond:bool,
low_mem: bool) -> Tuple[Dict[str, int], Dict[str, List[str]]]:
"""
Predicts or reuses proteins prediction and runs diamond on them.
Expand All @@ -222,14 +249,15 @@ def manage_protein_alignement(faa_file: Path, contigs_fasta: Path, contig_to_len
:param diamond_result_file: The path to the diamond result file.
:param checkm2_db: The path to the CheckM2 database.
:param threads: Number of threads for parallel processing.
:param resume: Boolean indicating whether to resume the process.
:param use_existing_protein_file: Boolean indicating whether to use an existing protein file.
:param resume_diamond: Boolean indicating whether to resume diamond alignement.
:param low_mem: Boolean indicating whether to use low memory mode.
:return: A tuple containing dictionaries - contig_to_kegg_counter and contig_to_genes.
"""

# Predict or reuse proteins prediction and run diamond on them
if resume:
if use_existing_protein_file:
logging.info(f"Parsing faa file: {faa_file}.")
contig_to_genes = cds.parse_faa_file(faa_file.as_posix())
io.check_contig_consistency(contig_to_length, contig_to_genes, contigs_fasta.as_posix(), faa_file.as_posix())
Expand All @@ -238,6 +266,7 @@ def manage_protein_alignement(faa_file: Path, contigs_fasta: Path, contig_to_len
contigs_iterator = (s for s in contig_manager.parse_fasta_file(contigs_fasta.as_posix()) if s.name in contigs_in_bins)
contig_to_genes = cds.predict(contigs_iterator, faa_file.as_posix(), threads)

if not resume_diamond:
if checkm2_db is None:
# get checkm2 db stored in checkm2 install
diamond_db_path = diamond.get_checkm2_db()
Expand Down Expand Up @@ -360,8 +389,17 @@ def main():
# Temporary files #
out_tmp_dir:Path = args.outdir / "temporary_files"
os.makedirs(out_tmp_dir, exist_ok=True)

use_existing_protein_file = False

if args.proteins:
logging.info(f"Using the provided protein sequences file: {args.proteins}")
faa_file = args.proteins
use_existing_protein_file = True
else:
faa_file = out_tmp_dir / "assembly_proteins.faa"


faa_file = out_tmp_dir / "assembly_proteins.faa"
diamond_result_file = out_tmp_dir / "diamond_result.tsv"

# Output files #
Expand All @@ -370,13 +408,15 @@ def main():

if args.resume:
io.check_resume_file(faa_file, diamond_result_file)
use_existing_protein_file = True

bin_set_name_to_bins, original_bins, contigs_in_bins, contig_to_length = parse_input_files(args.bin_dirs, args.contig2bin_tables, args.contigs, fasta_extensions=set(args.fasta_extensions))

contig_to_kegg_counter, contig_to_genes = manage_protein_alignement(faa_file=faa_file, contigs_fasta=args.contigs, contig_to_length=contig_to_length,
contigs_in_bins=contigs_in_bins,
diamond_result_file=diamond_result_file, checkm2_db=args.checkm2_db,
threads=args.threads, resume=args.resume, low_mem=args.low_mem)
threads=args.threads, use_existing_protein_file=use_existing_protein_file,
resume_diamond=args.resume, low_mem=args.low_mem)

# Use contig index instead of contig name to save memory
contig_to_index, index_to_contig = contig_manager.make_contig_index(contigs_in_bins)
Expand Down
14 changes: 14 additions & 0 deletions docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,20 @@ For example, consider the following two `contig2bin_tables`:
In both formats, the `--contigs` argument should specify a FASTA file containing all the contigs found in the bins. Typically, this file would be the assembly FASTA file used to generate the bins. In these exemple the `assembly.fasta` file should contain at least the five contigs mentioned in the `contig2bin_tables` files or in the bin fasta files: `contig_1`, `contig_8`, `contig_15`, `contig_9`, and `contig_10`.
## Providing Precomputed Protein Sequences
You can provide protein sequences in FASTA format to Binette using the `--proteins` argument. The sequence identifiers must follow the Prodigal convention: `<contigID>_<GeneID>`. This naming format ensures proper mapping of each gene to its contig.
By using this option, the gene prediction step is skipped.
### Example
If your contig is named `contig_A`, the gene identifiers should follow this pattern:
- `contig_A_1`
- `contig_A_2`
- `contig_A_3`
## Outputs
Binette results are stored in the `results` directory. You can specify a different directory using the `--outdir` option.
Expand Down
53 changes: 46 additions & 7 deletions tests/cds_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,9 +103,6 @@ def test_extract_contig_name_from_cds_name():
assert result == "contig1"


# Import the functions write_faa and parse_faa_file here


def test_write_faa(contig1, orf_finder):

predicted_genes = orf_finder.find_genes(contig1.seq)
Expand All @@ -122,10 +119,11 @@ def test_write_faa(contig1, orf_finder):


def test_parse_faa_file(tmp_path):
# Mock a FASTA file
# Mock a FASTA file of protein sequences
# at least one protein sequence to not triger the error
fasta_content = (
">contig1_gene1\n"
"AAAAAAAAAAA\n"
"MPPPAOSKNSKSS\n"
">contig1_gene2\n"
"CCCCCCCCCCC\n"
">contig2_gene1\n"
Expand All @@ -139,11 +137,32 @@ def test_parse_faa_file(tmp_path):

# Check if the output matches the expected dictionary
expected_result = {
'contig1': ['AAAAAAAAAAA', 'CCCCCCCCCCC'],
'contig1': ['MPPPAOSKNSKSS', 'CCCCCCCCCCC'],
'contig2': ['TTTTTTTTTTTT']
}
assert result == expected_result


def test_parse_faa_file_raises_error_for_dna(tmp_path):
# Mock a DNA FASTA file
fasta_content = (
">contig1_gene1\n"
"AAAAAAAAAAA\n"
">contig1_gene2\n"
"CCCCCCCCCCC\n"
">contig2_gene1\n"
"TTTTTTTTTTTT\n"
)
fna_file = tmp_path / "mock_file.fna"
fna_file.write_text(fasta_content)


# Check that ValueError is raised when DNA sequences are encountered
with pytest.raises(ValueError):
cds.parse_faa_file(fna_file)



def test_get_aa_composition():

genes = ['AAAA',
Expand Down Expand Up @@ -175,4 +194,24 @@ def test_get_contig_cds_metadata():

assert contig_metadata['contig_to_cds_count'] == {"c1":3, "c2":2}
assert contig_metadata['contig_to_aa_counter'] == {"c1": {'A': 4, 'G': 4, "C":4} , "c2":{'C': 4, 'T': 4}}
assert contig_metadata['contig_to_aa_length'] == {"c1":12, "c2":8}
assert contig_metadata['contig_to_aa_length'] == {"c1":12, "c2":8}



# Test function
def test_is_nucleic_acid():
# Valid DNA sequence
assert cds.is_nucleic_acid("ATCG") is True
assert cds.is_nucleic_acid("ATCNNNNNG") is True # N can be found in DNA seq
# Valid RNA sequence
assert cds.is_nucleic_acid("AUGCAUGC") is True

# Mixed case
assert cds.is_nucleic_acid("AtCg") is True

# Invalid sequence (contains characters not part of DNA or RNA)
assert cds.is_nucleic_acid("ATCX") is False # 'X' is not a valid base
assert cds.is_nucleic_acid("AUG#C") is False # '#' is not a valid base

# Amino acid sequence
assert cds.is_nucleic_acid("MSIRGVGGNGNSR") is False # Numbers are invalid
Loading

0 comments on commit 6a9cf4a

Please sign in to comment.