Skip to content

Commit

Permalink
Merge pull request #305 from labgem/dev
Browse files Browse the repository at this point in the history
 Merge dev branch into master to release version 2.2.1
  • Loading branch information
JeanMainguy authored Dec 3, 2024
2 parents 8cacb1b + f1e9a8a commit f49f497
Show file tree
Hide file tree
Showing 16 changed files with 201 additions and 83 deletions.
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
2.2.0
2.2.1
1 change: 1 addition & 0 deletions docs/user/RGP/rgpOutputs.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ The file has the following format :
| stop | The stop position of the RGP in the contig. |
| length | The length of the RGP in nucleotide |
| coordinates | The coordinates of the region. If the region overlap the contig edges will be right with join coordinates syntax (*i.e* 1523..1758,1..57) |
| score | Score of the RGP. |
| contigBorder | This is a boolean column. If the RGP is on a contig border it will be True, otherwise, it will be False. This often can indicate that, if an RGP is on a contig border it is probably not complete. |
| wholeContig | This is a boolean column. If the RGP is an entire contig, it will be True, and False otherwise. If a RGP is an entire contig it can possibly be a plasmid, a region flanked with repeat sequences or a contaminant. |

Expand Down
2 changes: 1 addition & 1 deletion ppanggolin/RGP/genomicIsland.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import logging
import argparse
from pathlib import Path
from typing import Set, Iterable, List, Tuple
from typing import Set, Iterable

# installed libraries
from tqdm import tqdm
Expand Down
55 changes: 31 additions & 24 deletions ppanggolin/annotate/annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
# local libraries
from ppanggolin.annotate.synta import (
annotate_organism,
read_fasta,
get_contigs_from_fasta_file,
get_dna_sequence,
init_contig_counter,
contig_counter,
Expand Down Expand Up @@ -1034,17 +1034,22 @@ def check_chevrons_in_start_and_stop(
dbxref_metadata
)

if fields_gff[gff_seqname] in circular_contigs or (
if (
"IS_CIRCULAR" in attributes
and attributes["IS_CIRCULAR"] == "true"
):
# WARNING: In case we have prodigal gff with is_circular attributes.
# This would fail as contig is not defined. However, is_circular should not be found in prodigal gff
logging.getLogger("PPanGGOLiN").debug(
f"Contig {contig.name} is circular."
)
contig.is_circular = True
assert contig.name == fields_gff[gff_seqname]
contig_name = fields_gff[gff_seqname]

if contig is not None:
logging.getLogger("PPanGGOLiN").debug(
f"Contig {contig.name} is circular."
)
contig.is_circular = True
assert contig.name == contig_name
else:
# contig object has not been initialized yet.
# let's keep the circularity info in the circular_contigs list
circular_contigs.append(contig_name)

elif fields_gff[gff_type] == "CDS" or "RNA" in fields_gff[gff_type]:

Expand Down Expand Up @@ -1120,9 +1125,9 @@ def check_chevrons_in_start_and_stop(
gene_frame = int(fields_gff[frame])

if (
gene_id in id_attr_to_gene_id
id_attribute in id_attr_to_gene_id
): # the ID has already been seen at least once in this genome
existing_gene = id_attr_to_gene_id[gene_id]
existing_gene = id_attr_to_gene_id[id_attribute]
new_gene_info = {
"strand": fields_gff[gff_strand],
"type": fields_gff[gff_type],
Expand All @@ -1141,7 +1146,7 @@ def check_chevrons_in_start_and_stop(

gene = Gene(org.name + "_CDS_" + str(gene_counter).zfill(4))

id_attr_to_gene_id[gene_id] = gene
id_attr_to_gene_id[id_attribute] = gene

# here contig is filled in order, so position is the number of genes already stored in the contig.
gene.fill_annotations(
Expand Down Expand Up @@ -1182,9 +1187,6 @@ def check_chevrons_in_start_and_stop(
rna_counter += 1
contig.add_rna(rna)

# Correct coordinates of genes that overlap the edge of circulars contig
correct_putative_overlaps(org.contigs)

# Fix partial genes coordinates
for contig in org.contigs:
for gene in contig.genes:
Expand All @@ -1201,14 +1203,11 @@ def check_chevrons_in_start_and_stop(
has_fasta = False

if has_fasta:
contig_sequences = read_fasta(
org, fasta_string.split("\n")
) # _ is total contig length
contig_sequences = get_contigs_from_fasta_file(org, fasta_string.split("\n"))

correct_putative_overlaps(org.contigs)

for contig in org.contigs:
if contig.length != len(contig_sequences[contig.name]):
raise ValueError(
"The contig length defined is different than the sequence length"
)

for gene in contig.genes:
gene.add_sequence(get_dna_sequence(contig_sequences[contig.name], gene))
Expand All @@ -1220,7 +1219,7 @@ def check_chevrons_in_start_and_stop(
add_metadata_from_gff_file(contig_name_to_region_info, org, gff_file_path)

if used_transl_table_arg:
logging.getLogger("PPanGGOLiN").info(
logging.getLogger("PPanGGOLiN").debug(
f"transl_table tag was not found for {used_transl_table_arg} CDS "
f"in {gff_file_path}. Provided translation_table argument value was used instead: {translation_table}."
)
Expand Down Expand Up @@ -1616,7 +1615,15 @@ def get_gene_sequences_from_fastas(
f"your fasta file are different."
)
with read_compressed_or_not(Path(elements[1])) as currFastaFile:
fasta_dict[org] = read_fasta(org, currFastaFile)
fasta_dict[org] = get_contigs_from_fasta_file(org, currFastaFile)

# When dealing with GFF files, some genes may have coordinates extending beyond the actual
# length of contigs, especially when they overlap the edges. This usually needs to be split
# into two parts to handle the circular genome wrapping.
# If the GFF file lacks associated FASTA sequences and it was not possible to determine the
# contig length from the GFF file, we must apply this correction while parsing the external FASTA file.

correct_putative_overlaps(org.contigs)

if set(pangenome.organisms) > set(fasta_dict.keys()):
missing = pangenome.number_of_organisms - len(
Expand Down
136 changes: 95 additions & 41 deletions ppanggolin/annotate/synta.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from subprocess import Popen, PIPE
import ast
from collections import defaultdict
from typing import Dict, List, Optional, Union
from typing import Dict, List, Optional, Union, Generator, Tuple
from pathlib import Path

# install libraries
Expand Down Expand Up @@ -245,49 +245,103 @@ def launch_infernal(
return gene_objs


def read_fasta(org: Organism, fna_file: Union[TextIOWrapper, list]) -> Dict[str, str]:
"""Reads a fna file (or stream, or string) and stores it in a dictionary with contigs as key and sequence as value.
def check_sequence_tuple(name: str, sequence: str):
"""
Checks and validates a sequence name and its corresponding sequence.
:param name: The name (header) of the sequence, typically extracted from the FASTA file header.
:param sequence: The sequence string corresponding to the name, containing the nucleotide or protein sequence.
:param org: Organism corresponding to fasta file
:param fna_file: Input fasta file with sequences or list of each line as sequence
:return: A tuple containing the validated name and sequence.
:return: Dictionary with contig_name as keys and contig sequence in values
:raises ValueError:
- If the sequence is empty, a ValueError is raised with a message containing the header name.
- If the name is empty, a ValueError is raised with a message containing a preview of the sequence.
"""
global contig_counter
try:
contigs = {}
contig_seq = ""
contig = None
for line in fna_file:
if line.startswith(">"):
if len(contig_seq) >= 1: # contig filter = 1
contigs[contig.name] = contig_seq.upper()
contig.length = len(contig_seq)
contig_seq = ""
try:
contig = org.get(line.split()[0][1:])
except KeyError:
with contig_counter.get_lock():
contig = Contig(contig_counter.value, line.split()[0][1:])
contig_counter.value += 1
org.add(contig)
else:
contig_seq += line.strip()
if len(contig_seq) >= 1: # processing the last contig
contigs[contig.name] = contig_seq.upper()
contig.length = len(contig_seq)

except AttributeError as e:
raise AttributeError(
f"{e}\nAn error was raised when reading file: '{fna_file.name}'. "
f"One possibility for this error is that the file did not start with a '>' "
f"as it would be expected from a fna file."
)
except Exception as err: # To manage other exception which can occur
raise Exception(
f"{err}: Please check your input file and if everything looks fine, "
"please post an issue on our github"
if not sequence:
raise ValueError(f"Found an empty sequence with header '{name}'")

if not name:
raise ValueError(
f"Found a sequence with empty name (sequence starts as '{sequence[:60]}')"
)

return name, sequence


def parse_fasta(
fna_file: Union[TextIOWrapper, list]
) -> Generator[Tuple[str, str], None, None]:
"""Yields each sequence name and sequence from a FASTA file or stream as a tuple.
:param fna_file: Input FASTA file or list of lines as sequences.
:yield: Tuple with contig header (without '>') and sequence.
:raises ValueError: If the file does not contain valid FASTA format.
"""
name = None
sequence = ""

for line in fna_file:
line = line.strip()

if line.startswith(">"): # New header
if name: # Yield previous header and sequence if available
yield check_sequence_tuple(name, sequence)

name = line[1:].split()[
0
] # Strip '>' and extract the first word as the name
sequence = ""

elif line: # Only append non-empty lines
sequence += line

else:
# You can skip or handle empty lines here if required
pass

# Yield the final contig if exists
if name:
yield check_sequence_tuple(name, sequence)

# Check if there was any valid data (at least one header and sequence)
if not name:
raise ValueError("The file does not contain any valid FASTA content.")


def get_contigs_from_fasta_file(
org: Organism, fna_file: Union[TextIOWrapper, list]
) -> Dict[str, str]:
"""Processes contigs from a parsed FASTA generator and stores in a dictionary.
:param org: Organism instance to update with contig info.
:param fna_file: Input FASTA file or list of lines as sequences.
:return: Dictionary with contig names as keys and sequences as values.
"""

global contig_counter
contigs = {}

for contig_name, sequence in parse_fasta(fna_file):

# Retrieve or create the contig
try:
contig = org.get(contig_name)
except KeyError:
with contig_counter.get_lock():
contig = Contig(contig_counter.value, contig_name)
contig_counter.value += 1
org.add(contig)

# Update contig information
if contig.length is not None and contig.length != len(sequence):
raise ValueError(
f"Length mismatch for contig {contig_name}: expected {contig.length}, found {len(sequence)} from the fasta sequence."
)

contig.length = len(sequence)
contigs[contig_name] = sequence.upper()

return contigs


Expand Down Expand Up @@ -464,7 +518,7 @@ def annotate_organism(

fasta_file = read_compressed_or_not(file_name)

contig_sequences = read_fasta(org, fasta_file)
contig_sequences = get_contigs_from_fasta_file(org, fasta_file)
if is_compressed(file_name): # TODO simply copy file with shutil.copyfileobj
fasta_file = write_tmp_fasta(contig_sequences, tmpdir)
if procedure is None: # prodigal procedure is not force by user
Expand Down
22 changes: 15 additions & 7 deletions ppanggolin/formats/readBinaries.py
Original file line number Diff line number Diff line change
Expand Up @@ -971,36 +971,45 @@ def read_rgp(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False):
region = pangenome.get_region(row["RGP"].decode())
except KeyError:
region = Region(row["RGP"].decode())

# starting from v2.2.1 score is part of RGP table in h5.
if "score" in row.dtype.names:
region.score = row["score"]

pangenome.add_region(region)

gene = pangenome.get_gene(row["gene"].decode())
region.add(gene)
pangenome.status["predictedRGP"] = "Loaded"


def read_spots(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False):
"""
Read hotspot in pangenome hdf5 file to add in pangenome object
Read hotspots in the pangenome HDF5 file and add them to the pangenome object.
:param pangenome: Pangenome object without spot
:param h5f: Pangenome HDF5 file with spot computed
:param disable_bar: Disable the progress bar
"""
table = h5f.root.spots
spots = {}
curr_spot_id = None
for row in tqdm(
read_chunks(table, chunk=20000),
total=table.nrows,
unit="spot",
disable=disable_bar,
):
curr_spot = spots.get(int(row["spot"]))
if curr_spot is None:
curr_spot = Spot(int(row["spot"]))
spots[row["spot"]] = curr_spot
if curr_spot_id != int(row["spot"]):
curr_spot_id = int(row["spot"])
curr_spot = spots.get(curr_spot_id)
if curr_spot is None:
curr_spot = Spot(int(row["spot"]))
spots[row["spot"]] = curr_spot
region = pangenome.get_region(row["RGP"].decode())
curr_spot.add(region)
curr_spot.spot_2_families()
for spot in spots.values():
spot.spot_2_families()
pangenome.add_spot(spot)
pangenome.status["spots"] = "Loaded"

Expand Down Expand Up @@ -1548,7 +1557,6 @@ def read_pangenome(
f"The pangenome in file '{filename}' does not have spots information, "
f"or has been improperly filled"
)

if modules:
if h5f.root.status._v_attrs.modules:
logging.getLogger("PPanGGOLiN").info("Reading the modules...")
Expand Down
2 changes: 2 additions & 0 deletions ppanggolin/formats/writeBinaries.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,6 +299,7 @@ def rgp_desc(max_rgp_len, max_gene_len):
return {
"RGP": tables.StringCol(itemsize=max_rgp_len),
"gene": tables.StringCol(itemsize=max_gene_len),
"score": tables.UInt32Col(),
}


Expand Down Expand Up @@ -355,6 +356,7 @@ def write_rgp(
for gene in region.genes:
rgp_row["RGP"] = region.name
rgp_row["gene"] = gene.ID
rgp_row["score"] = region.score
rgp_row.append()
rgp_table.flush()

Expand Down
2 changes: 2 additions & 0 deletions ppanggolin/formats/writeFlatPangenome.py
Original file line number Diff line number Diff line change
Expand Up @@ -1096,6 +1096,7 @@ def write_rgp_table(regions: Set[Region], output: Path, compress: bool = False):
"stop",
"length",
"coordinates",
"score",
"contigBorder",
"wholeContig",
]
Expand All @@ -1117,6 +1118,7 @@ def write_rgp_table(regions: Set[Region], output: Path, compress: bool = False):
"stop": region.stop,
"length": region.length,
"coordinates": region.string_coordinates(),
"score": region.score,
"contigBorder": region.is_contig_border,
"wholeContig": region.is_whole_contig,
}
Expand Down
2 changes: 0 additions & 2 deletions ppanggolin/genome.py
Original file line number Diff line number Diff line change
Expand Up @@ -553,8 +553,6 @@ def __setitem__(self, coordinate: Tuple[int, int, str], gene: Gene):
@property
def length(self) -> Union[int, None]:
"""Get the length of the contig"""
if self._length is None:
logging.getLogger("PPanGGOLiN").warning("Contig length is unknown")
return self._length

@length.setter
Expand Down
Loading

0 comments on commit f49f497

Please sign in to comment.