Skip to content

Commit f49f497

Browse files
authored
Merge pull request #305 from labgem/dev
Merge dev branch into master to release version 2.2.1
2 parents 8cacb1b + f1e9a8a commit f49f497

File tree

16 files changed

+201
-83
lines changed

16 files changed

+201
-83
lines changed

VERSION

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
2.2.0
1+
2.2.1

docs/user/RGP/rgpOutputs.md

+1
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ The file has the following format :
2525
| stop | The stop position of the RGP in the contig. |
2626
| length | The length of the RGP in nucleotide |
2727
| coordinates | The coordinates of the region. If the region overlap the contig edges will be right with join coordinates syntax (*i.e* 1523..1758,1..57) |
28+
| score | Score of the RGP. |
2829
| contigBorder | This is a boolean column. If the RGP is on a contig border it will be True, otherwise, it will be False. This often can indicate that, if an RGP is on a contig border it is probably not complete. |
2930
| wholeContig | This is a boolean column. If the RGP is an entire contig, it will be True, and False otherwise. If a RGP is an entire contig it can possibly be a plasmid, a region flanked with repeat sequences or a contaminant. |
3031

ppanggolin/RGP/genomicIsland.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import logging
55
import argparse
66
from pathlib import Path
7-
from typing import Set, Iterable, List, Tuple
7+
from typing import Set, Iterable
88

99
# installed libraries
1010
from tqdm import tqdm

ppanggolin/annotate/annotate.py

+31-24
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
# local libraries
2323
from ppanggolin.annotate.synta import (
2424
annotate_organism,
25-
read_fasta,
25+
get_contigs_from_fasta_file,
2626
get_dna_sequence,
2727
init_contig_counter,
2828
contig_counter,
@@ -1034,17 +1034,22 @@ def check_chevrons_in_start_and_stop(
10341034
dbxref_metadata
10351035
)
10361036

1037-
if fields_gff[gff_seqname] in circular_contigs or (
1037+
if (
10381038
"IS_CIRCULAR" in attributes
10391039
and attributes["IS_CIRCULAR"] == "true"
10401040
):
1041-
# WARNING: In case we have prodigal gff with is_circular attributes.
1042-
# This would fail as contig is not defined. However, is_circular should not be found in prodigal gff
1043-
logging.getLogger("PPanGGOLiN").debug(
1044-
f"Contig {contig.name} is circular."
1045-
)
1046-
contig.is_circular = True
1047-
assert contig.name == fields_gff[gff_seqname]
1041+
contig_name = fields_gff[gff_seqname]
1042+
1043+
if contig is not None:
1044+
logging.getLogger("PPanGGOLiN").debug(
1045+
f"Contig {contig.name} is circular."
1046+
)
1047+
contig.is_circular = True
1048+
assert contig.name == contig_name
1049+
else:
1050+
# contig object has not been initialized yet.
1051+
# let's keep the circularity info in the circular_contigs list
1052+
circular_contigs.append(contig_name)
10481053

10491054
elif fields_gff[gff_type] == "CDS" or "RNA" in fields_gff[gff_type]:
10501055

@@ -1120,9 +1125,9 @@ def check_chevrons_in_start_and_stop(
11201125
gene_frame = int(fields_gff[frame])
11211126

11221127
if (
1123-
gene_id in id_attr_to_gene_id
1128+
id_attribute in id_attr_to_gene_id
11241129
): # the ID has already been seen at least once in this genome
1125-
existing_gene = id_attr_to_gene_id[gene_id]
1130+
existing_gene = id_attr_to_gene_id[id_attribute]
11261131
new_gene_info = {
11271132
"strand": fields_gff[gff_strand],
11281133
"type": fields_gff[gff_type],
@@ -1141,7 +1146,7 @@ def check_chevrons_in_start_and_stop(
11411146

11421147
gene = Gene(org.name + "_CDS_" + str(gene_counter).zfill(4))
11431148

1144-
id_attr_to_gene_id[gene_id] = gene
1149+
id_attr_to_gene_id[id_attribute] = gene
11451150

11461151
# here contig is filled in order, so position is the number of genes already stored in the contig.
11471152
gene.fill_annotations(
@@ -1182,9 +1187,6 @@ def check_chevrons_in_start_and_stop(
11821187
rna_counter += 1
11831188
contig.add_rna(rna)
11841189

1185-
# Correct coordinates of genes that overlap the edge of circulars contig
1186-
correct_putative_overlaps(org.contigs)
1187-
11881190
# Fix partial genes coordinates
11891191
for contig in org.contigs:
11901192
for gene in contig.genes:
@@ -1201,14 +1203,11 @@ def check_chevrons_in_start_and_stop(
12011203
has_fasta = False
12021204

12031205
if has_fasta:
1204-
contig_sequences = read_fasta(
1205-
org, fasta_string.split("\n")
1206-
) # _ is total contig length
1206+
contig_sequences = get_contigs_from_fasta_file(org, fasta_string.split("\n"))
1207+
1208+
correct_putative_overlaps(org.contigs)
1209+
12071210
for contig in org.contigs:
1208-
if contig.length != len(contig_sequences[contig.name]):
1209-
raise ValueError(
1210-
"The contig length defined is different than the sequence length"
1211-
)
12121211

12131212
for gene in contig.genes:
12141213
gene.add_sequence(get_dna_sequence(contig_sequences[contig.name], gene))
@@ -1220,7 +1219,7 @@ def check_chevrons_in_start_and_stop(
12201219
add_metadata_from_gff_file(contig_name_to_region_info, org, gff_file_path)
12211220

12221221
if used_transl_table_arg:
1223-
logging.getLogger("PPanGGOLiN").info(
1222+
logging.getLogger("PPanGGOLiN").debug(
12241223
f"transl_table tag was not found for {used_transl_table_arg} CDS "
12251224
f"in {gff_file_path}. Provided translation_table argument value was used instead: {translation_table}."
12261225
)
@@ -1616,7 +1615,15 @@ def get_gene_sequences_from_fastas(
16161615
f"your fasta file are different."
16171616
)
16181617
with read_compressed_or_not(Path(elements[1])) as currFastaFile:
1619-
fasta_dict[org] = read_fasta(org, currFastaFile)
1618+
fasta_dict[org] = get_contigs_from_fasta_file(org, currFastaFile)
1619+
1620+
# When dealing with GFF files, some genes may have coordinates extending beyond the actual
1621+
# length of contigs, especially when they overlap the edges. This usually needs to be split
1622+
# into two parts to handle the circular genome wrapping.
1623+
# If the GFF file lacks associated FASTA sequences and it was not possible to determine the
1624+
# contig length from the GFF file, we must apply this correction while parsing the external FASTA file.
1625+
1626+
correct_putative_overlaps(org.contigs)
16201627

16211628
if set(pangenome.organisms) > set(fasta_dict.keys()):
16221629
missing = pangenome.number_of_organisms - len(

ppanggolin/annotate/synta.py

+95-41
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from subprocess import Popen, PIPE
1010
import ast
1111
from collections import defaultdict
12-
from typing import Dict, List, Optional, Union
12+
from typing import Dict, List, Optional, Union, Generator, Tuple
1313
from pathlib import Path
1414

1515
# install libraries
@@ -245,49 +245,103 @@ def launch_infernal(
245245
return gene_objs
246246

247247

248-
def read_fasta(org: Organism, fna_file: Union[TextIOWrapper, list]) -> Dict[str, str]:
249-
"""Reads a fna file (or stream, or string) and stores it in a dictionary with contigs as key and sequence as value.
248+
def check_sequence_tuple(name: str, sequence: str):
249+
"""
250+
Checks and validates a sequence name and its corresponding sequence.
251+
252+
:param name: The name (header) of the sequence, typically extracted from the FASTA file header.
253+
:param sequence: The sequence string corresponding to the name, containing the nucleotide or protein sequence.
250254
251-
:param org: Organism corresponding to fasta file
252-
:param fna_file: Input fasta file with sequences or list of each line as sequence
255+
:return: A tuple containing the validated name and sequence.
253256
254-
:return: Dictionary with contig_name as keys and contig sequence in values
257+
:raises ValueError:
258+
- If the sequence is empty, a ValueError is raised with a message containing the header name.
259+
- If the name is empty, a ValueError is raised with a message containing a preview of the sequence.
255260
"""
256-
global contig_counter
257-
try:
258-
contigs = {}
259-
contig_seq = ""
260-
contig = None
261-
for line in fna_file:
262-
if line.startswith(">"):
263-
if len(contig_seq) >= 1: # contig filter = 1
264-
contigs[contig.name] = contig_seq.upper()
265-
contig.length = len(contig_seq)
266-
contig_seq = ""
267-
try:
268-
contig = org.get(line.split()[0][1:])
269-
except KeyError:
270-
with contig_counter.get_lock():
271-
contig = Contig(contig_counter.value, line.split()[0][1:])
272-
contig_counter.value += 1
273-
org.add(contig)
274-
else:
275-
contig_seq += line.strip()
276-
if len(contig_seq) >= 1: # processing the last contig
277-
contigs[contig.name] = contig_seq.upper()
278-
contig.length = len(contig_seq)
279-
280-
except AttributeError as e:
281-
raise AttributeError(
282-
f"{e}\nAn error was raised when reading file: '{fna_file.name}'. "
283-
f"One possibility for this error is that the file did not start with a '>' "
284-
f"as it would be expected from a fna file."
285-
)
286-
except Exception as err: # To manage other exception which can occur
287-
raise Exception(
288-
f"{err}: Please check your input file and if everything looks fine, "
289-
"please post an issue on our github"
261+
if not sequence:
262+
raise ValueError(f"Found an empty sequence with header '{name}'")
263+
264+
if not name:
265+
raise ValueError(
266+
f"Found a sequence with empty name (sequence starts as '{sequence[:60]}')"
290267
)
268+
269+
return name, sequence
270+
271+
272+
def parse_fasta(
273+
fna_file: Union[TextIOWrapper, list]
274+
) -> Generator[Tuple[str, str], None, None]:
275+
"""Yields each sequence name and sequence from a FASTA file or stream as a tuple.
276+
277+
:param fna_file: Input FASTA file or list of lines as sequences.
278+
:yield: Tuple with contig header (without '>') and sequence.
279+
:raises ValueError: If the file does not contain valid FASTA format.
280+
"""
281+
name = None
282+
sequence = ""
283+
284+
for line in fna_file:
285+
line = line.strip()
286+
287+
if line.startswith(">"): # New header
288+
if name: # Yield previous header and sequence if available
289+
yield check_sequence_tuple(name, sequence)
290+
291+
name = line[1:].split()[
292+
0
293+
] # Strip '>' and extract the first word as the name
294+
sequence = ""
295+
296+
elif line: # Only append non-empty lines
297+
sequence += line
298+
299+
else:
300+
# You can skip or handle empty lines here if required
301+
pass
302+
303+
# Yield the final contig if exists
304+
if name:
305+
yield check_sequence_tuple(name, sequence)
306+
307+
# Check if there was any valid data (at least one header and sequence)
308+
if not name:
309+
raise ValueError("The file does not contain any valid FASTA content.")
310+
311+
312+
def get_contigs_from_fasta_file(
313+
org: Organism, fna_file: Union[TextIOWrapper, list]
314+
) -> Dict[str, str]:
315+
"""Processes contigs from a parsed FASTA generator and stores in a dictionary.
316+
317+
:param org: Organism instance to update with contig info.
318+
:param fna_file: Input FASTA file or list of lines as sequences.
319+
:return: Dictionary with contig names as keys and sequences as values.
320+
"""
321+
322+
global contig_counter
323+
contigs = {}
324+
325+
for contig_name, sequence in parse_fasta(fna_file):
326+
327+
# Retrieve or create the contig
328+
try:
329+
contig = org.get(contig_name)
330+
except KeyError:
331+
with contig_counter.get_lock():
332+
contig = Contig(contig_counter.value, contig_name)
333+
contig_counter.value += 1
334+
org.add(contig)
335+
336+
# Update contig information
337+
if contig.length is not None and contig.length != len(sequence):
338+
raise ValueError(
339+
f"Length mismatch for contig {contig_name}: expected {contig.length}, found {len(sequence)} from the fasta sequence."
340+
)
341+
342+
contig.length = len(sequence)
343+
contigs[contig_name] = sequence.upper()
344+
291345
return contigs
292346

293347

@@ -464,7 +518,7 @@ def annotate_organism(
464518

465519
fasta_file = read_compressed_or_not(file_name)
466520

467-
contig_sequences = read_fasta(org, fasta_file)
521+
contig_sequences = get_contigs_from_fasta_file(org, fasta_file)
468522
if is_compressed(file_name): # TODO simply copy file with shutil.copyfileobj
469523
fasta_file = write_tmp_fasta(contig_sequences, tmpdir)
470524
if procedure is None: # prodigal procedure is not force by user

ppanggolin/formats/readBinaries.py

+15-7
Original file line numberDiff line numberDiff line change
@@ -971,36 +971,45 @@ def read_rgp(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False):
971971
region = pangenome.get_region(row["RGP"].decode())
972972
except KeyError:
973973
region = Region(row["RGP"].decode())
974+
975+
# starting from v2.2.1 score is part of RGP table in h5.
976+
if "score" in row.dtype.names:
977+
region.score = row["score"]
978+
974979
pangenome.add_region(region)
980+
975981
gene = pangenome.get_gene(row["gene"].decode())
976982
region.add(gene)
977983
pangenome.status["predictedRGP"] = "Loaded"
978984

979985

980986
def read_spots(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False):
981987
"""
982-
Read hotspot in pangenome hdf5 file to add in pangenome object
988+
Read hotspots in the pangenome HDF5 file and add them to the pangenome object.
983989
984990
:param pangenome: Pangenome object without spot
985991
:param h5f: Pangenome HDF5 file with spot computed
986992
:param disable_bar: Disable the progress bar
987993
"""
988994
table = h5f.root.spots
989995
spots = {}
996+
curr_spot_id = None
990997
for row in tqdm(
991998
read_chunks(table, chunk=20000),
992999
total=table.nrows,
9931000
unit="spot",
9941001
disable=disable_bar,
9951002
):
996-
curr_spot = spots.get(int(row["spot"]))
997-
if curr_spot is None:
998-
curr_spot = Spot(int(row["spot"]))
999-
spots[row["spot"]] = curr_spot
1003+
if curr_spot_id != int(row["spot"]):
1004+
curr_spot_id = int(row["spot"])
1005+
curr_spot = spots.get(curr_spot_id)
1006+
if curr_spot is None:
1007+
curr_spot = Spot(int(row["spot"]))
1008+
spots[row["spot"]] = curr_spot
10001009
region = pangenome.get_region(row["RGP"].decode())
10011010
curr_spot.add(region)
1002-
curr_spot.spot_2_families()
10031011
for spot in spots.values():
1012+
spot.spot_2_families()
10041013
pangenome.add_spot(spot)
10051014
pangenome.status["spots"] = "Loaded"
10061015

@@ -1548,7 +1557,6 @@ def read_pangenome(
15481557
f"The pangenome in file '{filename}' does not have spots information, "
15491558
f"or has been improperly filled"
15501559
)
1551-
15521560
if modules:
15531561
if h5f.root.status._v_attrs.modules:
15541562
logging.getLogger("PPanGGOLiN").info("Reading the modules...")

ppanggolin/formats/writeBinaries.py

+2
Original file line numberDiff line numberDiff line change
@@ -299,6 +299,7 @@ def rgp_desc(max_rgp_len, max_gene_len):
299299
return {
300300
"RGP": tables.StringCol(itemsize=max_rgp_len),
301301
"gene": tables.StringCol(itemsize=max_gene_len),
302+
"score": tables.UInt32Col(),
302303
}
303304

304305

@@ -355,6 +356,7 @@ def write_rgp(
355356
for gene in region.genes:
356357
rgp_row["RGP"] = region.name
357358
rgp_row["gene"] = gene.ID
359+
rgp_row["score"] = region.score
358360
rgp_row.append()
359361
rgp_table.flush()
360362

ppanggolin/formats/writeFlatPangenome.py

+2
Original file line numberDiff line numberDiff line change
@@ -1096,6 +1096,7 @@ def write_rgp_table(regions: Set[Region], output: Path, compress: bool = False):
10961096
"stop",
10971097
"length",
10981098
"coordinates",
1099+
"score",
10991100
"contigBorder",
11001101
"wholeContig",
11011102
]
@@ -1117,6 +1118,7 @@ def write_rgp_table(regions: Set[Region], output: Path, compress: bool = False):
11171118
"stop": region.stop,
11181119
"length": region.length,
11191120
"coordinates": region.string_coordinates(),
1121+
"score": region.score,
11201122
"contigBorder": region.is_contig_border,
11211123
"wholeContig": region.is_whole_contig,
11221124
}

ppanggolin/genome.py

-2
Original file line numberDiff line numberDiff line change
@@ -553,8 +553,6 @@ def __setitem__(self, coordinate: Tuple[int, int, str], gene: Gene):
553553
@property
554554
def length(self) -> Union[int, None]:
555555
"""Get the length of the contig"""
556-
if self._length is None:
557-
logging.getLogger("PPanGGOLiN").warning("Contig length is unknown")
558556
return self._length
559557

560558
@length.setter

0 commit comments

Comments
 (0)