From 31d2fc252b80d49a5e1eea5a2ef2cbde8becce14 Mon Sep 17 00:00:00 2001 From: Matthew Wells Date: Mon, 27 May 2024 10:55:35 -0500 Subject: [PATCH 1/3] updated get_format function --- locidex/extract.py | 7 ++----- locidex/search.py | 7 ++----- locidex/utils.py | 15 +++++++++++++-- tests/test_utils.py | 30 +++++++++++++++++++++++++++++- 4 files changed, 46 insertions(+), 13 deletions(-) diff --git a/locidex/extract.py b/locidex/extract.py index 8f1283c..8d9bee9 100644 --- a/locidex/extract.py +++ b/locidex/extract.py @@ -18,7 +18,7 @@ from locidex.constants import SEARCH_RUN_DATA, FILE_TYPES, BlastColumns, BlastCommands, DBConfig, DB_EXPECTED_FILES, EXTRACT_MODES, raise_file_not_found_e from locidex.version import __version__ from locidex.classes.aligner import perform_alignment, aligner -from locidex.utils import check_db_groups +from locidex.utils import check_db_groups, get_format logger = logging.getLogger(__name__) logging.basicConfig(filemode=sys.stderr, level=logging.INFO) @@ -112,10 +112,7 @@ def run_extract(config): format = None if format is None: - for t in FILE_TYPES: - for ext in FILE_TYPES[t]: - if ext == input_fasta.suffix: - format = t + format = get_format(input_fasta) else: format = format.lower() diff --git a/locidex/search.py b/locidex/search.py index 2009626..4b3f7d2 100644 --- a/locidex/search.py +++ b/locidex/search.py @@ -16,7 +16,7 @@ from locidex.manifest import DBData from locidex.classes.seq_intake import seq_intake, seq_store, HitFilters from locidex.constants import BlastCommands, SEARCH_RUN_DATA, FILE_TYPES, BlastColumns, DB_EXPECTED_FILES, OPTION_GROUPS, DBConfig -from locidex.utils import write_seq_dict, check_db_groups, slots +from locidex.utils import write_seq_dict, check_db_groups, slots, get_format from locidex.version import __version__ logger = logging.getLogger(__name__) @@ -151,10 +151,7 @@ def run_search(config): os.makedirs(outdir, 0o755) if format is None: - for t in FILE_TYPES: - for ext in FILE_TYPES[t]: - if query_file.suffix == ext: - format = t + format = get_format(query_file) else: format = format.lower() diff --git a/locidex/utils.py b/locidex/utils.py index bb50b60..a0b8cb6 100644 --- a/locidex/utils.py +++ b/locidex/utils.py @@ -6,8 +6,8 @@ from pathlib import Path from locidex.manifest import ManifestItem from Bio.Seq import Seq -from typing import Dict, FrozenSet -from locidex.constants import NT_SUB, PROTEIN_ALPHA, DNA_ALPHA, OPTION_GROUPS +from typing import Dict, FrozenSet, Optional +from locidex.constants import NT_SUB, PROTEIN_ALPHA, DNA_ALPHA, OPTION_GROUPS, FILE_TYPES import locidex.manifest as manifest def slots(annotations: Dict[str, object]) -> FrozenSet[str]: @@ -150,3 +150,14 @@ def filter_hsps_df(df): return +def get_format(file: Path) -> Optional[str]: + """ + Return file type based on suffix used + """ + format: str = None + file_ext = "".join(file.suffixes).lower() + for k, extensions in FILE_TYPES.items(): + if file_ext in extensions: + format = k + break + return format \ No newline at end of file diff --git a/tests/test_utils.py b/tests/test_utils.py index 81d97f7..5b05507 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -4,6 +4,7 @@ import pytest from locidex import utils +from pathlib import Path from locidex import manifest from argparse import Namespace from collections import namedtuple @@ -26,4 +27,31 @@ def test_check_db_groups_fail(): analysis_params = {"db_group": "Db1", "db_name": "test_name"} with pytest.raises(KeyError): - analysis_params = utils.check_db_groups(analysis_params, nm_group) \ No newline at end of file + analysis_params = utils.check_db_groups(analysis_params, nm_group) + + +@pytest.mark.parametrize( "file_in,type", + [ + ("test.fa", "fasta"), + ("test.fas", "fasta"), + ("test.ffn", "fasta"), + ("test.fna", "fasta"), + ("test.fasta.gz", "fasta"), + ("test.fas.gz", "fasta"), + ("test.fa.gz", "fasta"), + ("test.fna.gz", "fasta"), + ("test.ffn.gz", "fasta"), + ("test.gbk", "genbank"), + ("test.genbank", "genbank"), + ("test.gbf", "genbank"), + ("test.gbk.gz", "genbank"), + ("test.genbank.gz", "genbank"), + ("test.gbf.gz", "genbank"), + ("test.gbff.gz", "genbank"), + ("test.gbff", "genbank"), + ]) +def test_get_format(file_in, type): + """ + test get_format function + """ + assert utils.get_format(Path(file_in)) == type \ No newline at end of file From 60b7107ebcda6ea157a63f1be7da34f57f69865e Mon Sep 17 00:00:00 2001 From: Matthew Wells Date: Mon, 27 May 2024 11:11:55 -0500 Subject: [PATCH 2/3] updated utils function --- locidex/constants.py | 4 ++-- locidex/utils.py | 9 +++++---- tests/test_utils.py | 1 + 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/locidex/constants.py b/locidex/constants.py index d7e543d..1724e10 100644 --- a/locidex/constants.py +++ b/locidex/constants.py @@ -64,8 +64,8 @@ def _keys(cls) -> list: return [i.name for i in fields(cls)] FILE_TYPES = { - 'genbank': [".gbk",".genbank",".gbf",".gbk.gz",".genbank.gz",".gbf.gz",".gbff",".gbff.gz"], - 'fasta': [".fasta",".fas",".fa",".ffn",".fna",".fasta.gz",".fas.gz",".fa.gz",".ffn.gz",".fna.gz"], + 'genbank': frozenset([".gbk",".genbank",".gbf",".gbk.gz",".genbank.gz",".gbf.gz",".gbff",".gbff.gz"]), + 'fasta': frozenset([".fasta",".fas",".fa",".ffn",".fna",".fasta.gz",".fas.gz",".fa.gz",".ffn.gz",".fna.gz"]), } diff --git a/locidex/utils.py b/locidex/utils.py index a0b8cb6..23105e8 100644 --- a/locidex/utils.py +++ b/locidex/utils.py @@ -155,9 +155,10 @@ def get_format(file: Path) -> Optional[str]: Return file type based on suffix used """ format: str = None - file_ext = "".join(file.suffixes).lower() + file_exts = file.suffixes for k, extensions in FILE_TYPES.items(): - if file_ext in extensions: - format = k - break + for ext in file_exts: + if ext in extensions: + format = k + break return format \ No newline at end of file diff --git a/tests/test_utils.py b/tests/test_utils.py index 5b05507..4c8f5b2 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -49,6 +49,7 @@ def test_check_db_groups_fail(): ("test.gbf.gz", "genbank"), ("test.gbff.gz", "genbank"), ("test.gbff", "genbank"), + ("test.1.fasta.gz", "fasta"), ]) def test_get_format(file_in, type): """ From 2b429f851d6f2689ec082f53c28d741e38424fb6 Mon Sep 17 00:00:00 2001 From: Matthew Wells Date: Mon, 27 May 2024 11:17:30 -0500 Subject: [PATCH 3/3] updated format report args --- locidex/format.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/locidex/format.py b/locidex/format.py index 3b412a6..ca92190 100644 --- a/locidex/format.py +++ b/locidex/format.py @@ -238,7 +238,7 @@ def run(cmd_args=None): logger.info("Beginning format operation.") obj = locidex_format(input=input,header=LocidexDBHeader._fields,is_protein=is_coding,min_len_frac=min_len_frac,max_len_frac=max_len_frac, min_ident_perc=min_ident, - min_cov_perc=min_match_cov,trans_table=trans_table,valid_ext=FILE_TYPES['fasta']) + min_cov_perc=min_match_cov,trans_table=trans_table,valid_ext=list(FILE_TYPES['fasta'])) logger.info("Finished format.") run_data['result_file'] = os.path.join(outdir,"locidex.txt") pd.DataFrame.from_dict(obj.data,orient='index').to_csv(run_data['result_file'],sep="\t",index=False,header=True)