From bea3834a2bd1381e5043958ca44e6cd79ae99547 Mon Sep 17 00:00:00 2001 From: Matthew Wells Date: Thu, 16 May 2024 16:57:32 -0500 Subject: [PATCH 1/7] updated logging and error handlers --- locidex/manifest.py | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/locidex/manifest.py b/locidex/manifest.py index 69440fa..a786e1c 100644 --- a/locidex/manifest.py +++ b/locidex/manifest.py @@ -9,8 +9,11 @@ from datetime import datetime from locidex.version import __version__ from locidex.constants import DBConfig, DBFiles +import logging +import errno - +logger = logging.getLogger(__name__) +logging.basicConfig(filemode=sys.stderr, level=logging.DEBUG) class DBData: """ @@ -54,13 +57,15 @@ def nucleotide_name(cls): @property def nucleotide_blast_db(self): if self.nucleotide is None: - raise ValueError("Nucleotide blast database does not exist") + logger.critical("Nucleotide blast database does not exist.") + raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), str(self.nucleotide)) return self.nucleotide / self.__nucleotide_path @property def protein_blast_db(self): if self.protein is None: - raise ValueError("Protein blast database does not exist") + logger.critical("Protein blast database does not exist.") + raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), str(self.protein)) return self.protein / self.__protein_path def _get_config(self, db_dir: pathlib.Path) -> DBConfig: @@ -72,7 +77,8 @@ def _get_config(self, db_dir: pathlib.Path) -> DBConfig: def _get_metadata(self, db_dir: pathlib.Path) -> dict: metadata_file = db_dir.joinpath(DBFiles.meta_file) if not metadata_file.exists(): - raise FileNotFoundError("Metadata file does not exist. Database path maybe incorrect: {}".format(db_dir)) + logger.critical("Metadata file does not appear to exist in db: {}".format(db_dir)) + raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), str(metadata_file)) md_data = None with open(metadata_file, 'r') as md: md_data = json.load(md) @@ -84,15 +90,18 @@ def _get_blast_dbs(self, db_dir: pathlib.Path, config_data: DBConfig) -> Tuple[O nucleotide: Optional[pathlib.Path] = None protein: Optional[pathlib.Path] = None if not blast_db.exists(): - raise OSError("blast directory not found. Database path maybe incorrect: {}".format(db_dir)) + logger.critical("blast directory not found. Database path maybe incorrect: {}".format(db_dir)) + raise NotADirectoryError(errno.ENOTDIR, os.strerror(errno.ENOTDIR), str(db_dir)) if config_data.is_nucl: nucleotide = blast_db.joinpath(self.__nucleotide_path) if not nucleotide.exists(): - raise FileNotFoundError("Cannot find nucleotide database, but it should exist. {}".format(nucleotide)) + logger.critical("Cannot find nucleotide database, but it should exist. {}".format(nucleotide)) + raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), str(nucleotide)) if config_data.is_prot: protein = blast_db.joinpath(self.__protein_path) if not protein.exists(): - raise FileNotFoundError("Cannot find protein database, but it should exist. {}".format(protein)) + logger.critical("Cannot find protein database, but it should exist. 
{}".format(protein)) + raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), str(protein)) return nucleotide, protein @@ -146,7 +155,7 @@ def add_args(parser=None): if parser is None: parser = ArgumentParser( description="Locidex manifest: Setup directory of databases for use with search") - parser.add_argument('-i','--input', type=str, required=True,help='Input directory containing multiplie locidex databases') + parser.add_argument('-i','--input', type=str, required=True,help='Input directory containing multiple locidex databases') parser.add_argument('-V', '--version', action='version', version="%(prog)s " + __version__) return parser @@ -160,7 +169,9 @@ def check_config(directory: pathlib.Path) -> DBConfig: """ config_dir = pathlib.Path(directory).joinpath(DBFiles.config_file) - config_data: Optional[DBConfig] = None + config_data: Optional[DBConfig] = None + if not config_dir.exists(): + raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), str(config_dir)) with open(config_dir, 'r') as conf: config_data = DBConfig(**json.load(conf)) for k, v in config_data.to_dict().items(): From d2cbea8d341bb812a73c708e53e846c2d4727c78 Mon Sep 17 00:00:00 2001 From: Matthew Wells Date: Fri, 17 May 2024 11:19:37 -0500 Subject: [PATCH 2/7] updated logging and error handling Added support for traceback to be dumped to a file on failure --- locidex/build.py | 12 ++++++------ locidex/extract.py | 24 +++++++++++++++--------- locidex/fetch.py | 0 locidex/format.py | 20 +++++++++++++------- locidex/main.py | 16 +++++++++++++++- locidex/manifest.py | 24 ++++++++++++++++++++---- locidex/merge.py | 17 +++++++++-------- locidex/report.py | 25 ++++++++++++++++--------- locidex/search.py | 21 +++++++++++---------- 9 files changed, 105 insertions(+), 54 deletions(-) delete mode 100644 locidex/fetch.py diff --git a/locidex/build.py b/locidex/build.py index df591cf..b996147 100644 --- a/locidex/build.py +++ b/locidex/build.py @@ -11,7 +11,7 @@ from locidex.classes.blast import BlastMakeDB from locidex.manifest import DBData import getpass - +import errno import logging import sys @@ -71,7 +71,7 @@ def init_dir(self, d: Path): d.mkdir(parents=True, exist_ok=self.force, mode=0o755) except FileExistsError: logger.critical("Database file {} already exists. 
To overwrite please run with --force".format(d)) - sys.exit(17) + raise FileExistsError(errno.EEXIST, os.strerror(errno.EEXIST), str(d)) return True @@ -163,14 +163,14 @@ def run(cmd_args=None): ) if not os.path.isfile(input_file): - print(f'Error {input_file} does not exist, please check path and try again') - sys.exit() + logger.critical(f'Error {input_file} does not exist, please check path and try again') + raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), str(input_file)) obj = locidex_build(Path(input_file), Path(outdir),config=config,seq_columns={'nucleotide':'dna_seq','protein':'aa_seq'},force=force) if obj.status == False: - print(f'Error something went wrong building the db, check error messages {obj.messages}') - sys.exit() + logger.critical(f'Error something went wrong building the db.') + raise ValueError("Something went wrong building db.") run_data['analysis_end_time'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S") with open(os.path.join(outdir,DBFiles.config_file),"w") as oh: diff --git a/locidex/extract.py b/locidex/extract.py index 2edc862..ad89af3 100644 --- a/locidex/extract.py +++ b/locidex/extract.py @@ -8,8 +8,9 @@ from datetime import datetime import numpy as np import pandas as pd +import logging +import errno from locidex.classes.extractor import extractor - from locidex.classes.blast import BlastSearch, FilterOptions, BlastMakeDB from locidex.manifest import DBData from locidex.classes.db import search_db_conf, db_config @@ -19,6 +20,9 @@ from locidex.classes.aligner import perform_alignment, aligner from locidex.utils import check_db_groups +logger = logging.getLogger(__name__) +logging.basicConfig(filemode=sys.stderr, level=logging.INFO) + def add_args(parser=None): if parser is None: parser = ArgumentParser( @@ -117,25 +121,26 @@ def run_extract(config): if format is None or format not in FILE_TYPES: if format is None: - print(f'Could not guess format for {input_fasta}') + logger.critical(f'Could not guess format for {input_fasta}') + raise ValueError("Could not guess input type for: {}".format(str(input_fasta))) else: - print(f'Format for query file must be one of {list(FILE_TYPES.keys())}, you supplied {format}') - sys.exit() + logger.critical(f'Format for query file must be one of {list(FILE_TYPES.keys())}, you supplied {format}') + raise ValueError(f'Format for query file must be one of {list(FILE_TYPES.keys())}, you supplied {format}') seq_obj = seq_intake(input_fasta, format, 'source', translation_table, perform_annotation=False,skip_trans=True) # Validate database is valid db_database_config = search_db_conf(db_dir, DB_EXPECTED_FILES, DBConfig._keys()) if db_database_config.status == False: - print(f'There is an issue with provided db directory: {db_dir}\n {db_database_config.messages}') - sys.exit() + logger.critical(f'There is an issue with provided db directory: {db_dir}\n {db_database_config.messages}') + raise ValueError("There is an issue with the provided database: {}".format(db_dir)) metadata_path = db_database_config.meta_file_path metadata_obj = db_config(metadata_path, ['meta', 'info']) if os.path.isdir(outdir) and not force: - print(f'Error {outdir} exists, if you would like to overwrite, then specify --force') - sys.exit() + logger.critical(f'Error {outdir} exists, if you would like to overwrite, then specify --force') + raise FileExistsError(errno.EEXIST, os.strerror(errno.EEXIST), str(outdir)) db_path = os.path.join(outdir, 'blast_db') @@ -176,7 +181,8 @@ def run_extract(config): #nt_db = 
Path("{}.fasta".format(blast_database_paths['nucleotide'])) nt_db = Path("{}.fasta".format(db_data.nucleotide_blast_db)) if not nt_db.exists(): - raise FileNotFoundError("Could not find nucleotide database: {}".format(nt_db)) + logger.critical("Could not find file: {}".format(nt_db)) + raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), str(nt_db)) filter_options = { 'evalue': FilterOptions(min=None, max=min_evalue, include=None), diff --git a/locidex/fetch.py b/locidex/fetch.py deleted file mode 100644 index e69de29..0000000 diff --git a/locidex/format.py b/locidex/format.py index 6523fa7..d2395fa 100644 --- a/locidex/format.py +++ b/locidex/format.py @@ -9,7 +9,8 @@ from mimetypes import guess_type from dataclasses import dataclass from typing import List, Tuple - +import logging +import errno import pandas as pd from Bio import SeqIO from Bio.Seq import Seq @@ -19,6 +20,9 @@ from locidex.utils import six_frame_translation, revcomp, calc_md5 from locidex.version import __version__ +logger = logging.getLogger(__name__) +logging.basicConfig(filemode=sys.stderr, level=logging.INFO) + class locidex_format: input_type = None @@ -80,6 +84,7 @@ def set_input_type(self): elif os.path.isdir(self.input): self.input_type = self.__dir_input else: + logger.critical("Could not determine input type for: {}".format(self.input)) raise AttributeError("Unknown input type could not be determined for: {}".format(self.input)) def get_dir_files(self, input_dir): @@ -220,19 +225,20 @@ def run(cmd_args=None): run_data['parameters'] = vars(cmd_args) if os.path.isdir(outdir) and not force: - print(f'Error {outdir} exists, if you would like to overwrite, then specify --force') - sys.exit() + logger.critical(f'Error {outdir} exists, if you would like to overwrite, then specify --force') + raise FileExistsError(errno.EEXIST, os.strerror(errno.EEXIST), str(outdir)) + if not os.path.isdir(outdir): os.makedirs(outdir, 0o755) if not os.path.isdir(input) and not os.path.isfile(input): - print(f'Error {input} does not exist as a file or directory') - sys.exit() - + logger.critical(f'Error {input} does not exist as a file or directory') + raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), str(input)) + logger.info("Beginning format operation.") obj = locidex_format(input=input,header=LocidexDBHeader._fields,is_protein=is_coding,min_len_frac=min_len_frac,max_len_frac=max_len_frac, min_ident_perc=min_ident, min_cov_perc=min_match_cov,trans_table=trans_table,valid_ext=FILE_TYPES['fasta']) - + logger.info("Finished format.") run_data['result_file'] = os.path.join(outdir,"locidex.txt") pd.DataFrame.from_dict(obj.data,orient='index').to_csv(run_data['result_file'],sep="\t",index=False,header=True) diff --git a/locidex/main.py b/locidex/main.py index 487b19e..8e9761d 100644 --- a/locidex/main.py +++ b/locidex/main.py @@ -2,8 +2,13 @@ import sys import argparse +import traceback +import logging from . 
import format, extract, report, merge, search, build, manifest +logger = logging.getLogger(__name__) +logging.basicConfig(filemode=sys.stderr, level=logging.DEBUG) + tasks = { 'search': (search, 'Query set of Loci/Genes against a database to produce a sequence store for downstream processing'), 'extract': (extract, 'Extract loci from a genome based on a locidex database'), @@ -33,4 +38,13 @@ def main(argv=None): # call main function if __name__ == '__main__': - sys.exit(main()) \ No newline at end of file + error_file = "error.txt" + try: + main() + except Exception as e: + with open(error_file, "w") as f: + traceback.TracebackException.from_exception(e).print(file=f) + logger.critical("Program exited with errors, please review logs. For the full traceback please see file: {}".format(error_file)) + sys.exit(-1) + else: + sys.exit("Program finished without errors.") \ No newline at end of file diff --git a/locidex/manifest.py b/locidex/manifest.py index a786e1c..1f8d0ea 100644 --- a/locidex/manifest.py +++ b/locidex/manifest.py @@ -90,7 +90,7 @@ def _get_blast_dbs(self, db_dir: pathlib.Path, config_data: DBConfig) -> Tuple[O nucleotide: Optional[pathlib.Path] = None protein: Optional[pathlib.Path] = None if not blast_db.exists(): - logger.critical("blast directory not found. Database path maybe incorrect: {}".format(db_dir)) + logger.critical("blast directory not found. Database path maybe incorrect: {}".format(str(db_dir))) raise NotADirectoryError(errno.ENOTDIR, os.strerror(errno.ENOTDIR), str(db_dir)) if config_data.is_nucl: nucleotide = blast_db.joinpath(self.__nucleotide_path) @@ -176,6 +176,7 @@ def check_config(directory: pathlib.Path) -> DBConfig: config_data = DBConfig(**json.load(conf)) for k, v in config_data.to_dict().items(): if v is None or v == '': + logger.critical("Missing value in config file for key {} which has value {}".format(k, v)) raise AttributeError("Config cannot have missing values: {}".format(k)) return config_data @@ -191,7 +192,9 @@ def validate_db_files(allele_dir: List[pathlib.Path], file_in: pathlib.Path) -> for a_dir in allele_dir: for k, v in DBFiles.items(): if not pathlib.Path(a_dir / v).exists(): - raise FileNotFoundError("Required file {} does not exist.".format(k)) + logger.critical("Required file {} does not exist.".format(k)) + raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), str(a_dir / v)) + db_configs.append((a_dir.relative_to(file_in), check_config(a_dir))) return db_configs @@ -202,7 +205,11 @@ def check_dbs(file_in: pathlib.Path) -> List[pathlib.PosixPath]: file_in: A path to a directory of databases """ + logger.debug("Checking that the following databases exist in: {}".format(file_in)) db_dirs = [p for p in file_in.iterdir() if p.is_dir()] + if not db_dirs: + logger.critical("No valid databases found in: {}".format(file_in)) + raise AssertionError("No valid databases found in: {}".format(file_in)) return db_dirs def create_manifest(file_in: pathlib.Path) -> Dict[str, List[Dict[str, str]]]: @@ -215,12 +222,13 @@ def create_manifest(file_in: pathlib.Path) -> Dict[str, List[Dict[str, str]]]: validated_dbs: List[Tuple[pathlib.Path, DBConfig]] = validate_db_files(allele_dirs, file_in) db_manifest = dict() for path, conf in validated_dbs: - + logger.info("Adding database: {} to manifest.".format(str(path))) if db_manifest.get(conf.db_name) is None: db_manifest[conf.db_name] = [] if db_manifest[conf.db_name] and (versions := [i.db_version for i in db_manifest[conf.db_name]]): if conf.db_version in versions: + 
logger.critical("Databases with the same name and version have been specified (name: {}, path: {}, version: {})".format(conf.db_name, path, conf.db_version)) raise KeyError("Databases with the same name and version have been specified (name: {}, path: {}, version: {})".format(conf.db_name, path, conf.db_version)) db_manifest[conf.db_name].append(ManifestItem(db=path, config=conf, root_db=file_in).to_dict()) @@ -236,7 +244,9 @@ def write_manifest(file_in: pathlib.Path, manifest: Dict[str, List[Dict[str, str """ manifest_file = _Constants.manifest_name + logger.debug("Creating manifest file in directory: {} with name: {}".format(file_in, manifest_file)) path_out = file_in.joinpath(manifest_file) + logger.info("Writing manifest file to: {}".format(str(path_out))) with open(path_out, 'w', encoding='utf8') as m_out: json.dump(manifest, m_out, indent=2) return path_out @@ -247,7 +257,10 @@ def run(cmd_args=None): parser = add_args() cmd_args = parser.parse_args() directory_in = pathlib.Path(cmd_args.input) - directory_in.exists() + if not directory_in.exists(): + logger.critical("Directory: {} does not appear to exist.".format(str(directory_in))) + raise NotADirectoryError(errno.ENOTDIR, os.strerror(errno.ENOTDIR), str(directory_in)) + manifest = create_manifest(directory_in) return write_manifest(directory_in, manifest) @@ -261,11 +274,13 @@ def select_db(manifest_data: Dict[str, List[ManifestItem]], name: str, version: """ db_data = manifest_data.get(name) if db_data is None: + logger.critical("Could not find database with specified name: {}".format(name)) raise KeyError("Could not find database with specified name: {}".format(name)) try: db = next(filter(lambda x: x.config.db_version == version, db_data)) except StopIteration: + logger.critical("No database entry with version: {}".format(version)) raise ValueError("No database entry with version: {}".format(version)) return db @@ -275,6 +290,7 @@ def read_manifest(input_file: pathlib.Path) -> dict: input_file Path: Manifest file to be parsed """ if not input_file.is_dir(): + logger.critical("Please pass the database directory, not a file.") raise AssertionError("Allele database directory must be passed directly.") manifest_file = input_file / _Constants.manifest_name diff --git a/locidex/merge.py b/locidex/merge.py index 526de67..c4f9182 100644 --- a/locidex/merge.py +++ b/locidex/merge.py @@ -52,7 +52,7 @@ def get_file_list(input_files): else: if not os.path.isfile(input_files[0]): logger.critical(f'Error the supplied file {input_files[0]} does not exist') - sys.exit(errno.ENOENT) + raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), str(input_files[0])) encoding = guess_type(input_files[0])[1] _open = partial(gzip.open, mode='rt') if encoding == 'gzip' else open with _open(input_files[0]) as f: @@ -60,7 +60,7 @@ def get_file_list(input_files): line = line.rstrip() if not os.path.isfile(line): logger.critical(f'Error the supplied file {line} does not exist') - sys.exit(errno.ENOENT) + raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), str(line)) file_list.append(line) return file_list @@ -73,16 +73,17 @@ def validate_input_file(data_in: dict, db_version: str, db_name: str, perform_va sq_data = ReportData.deseriealize(data_in) except KeyError: logger.critical("Missing fields in configuration required fields in in reported allele file. Fields required: {}".format(ReportData.fields())) - sys.exit() + raise ValueError("Missing fields in configuration required fields in in reported allele file. 
Fields required: {}".format(ReportData.fields())) + else: if db_version is not None and sq_data.db_info.db_version != db_version and perform_validation: logger.critical("You are attempting to merge files that were created using different database versions.") - sys.exit() + raise ValueError("You are attempting to merge files that were created using different database versions.") if db_name is not None and sq_data.db_info.db_name != db_name and perform_validation: logger.critical("You are attempting to merge files that have different names.") - sys.exit() + raise ValueError("You are attempting to merge files that have different names. {} {}".format(sq_data.db_info.db_name, db_name)) return sq_data, sq_data.db_info.db_version, sq_data.db_info.db_name @@ -93,7 +94,7 @@ def check_files_exist(file_list: list[os.PathLike]) -> None: for file in file_list: if not os.path.isfile(file): logger.critical(f"Error cannot open input file {file}") - sys.exit(errno.ENOENT) + raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), str(file)) def read_file_list(file_list,perform_validation=False): @@ -118,7 +119,7 @@ def read_file_list(file_list,perform_validation=False): records[sample_name] = sq_data else: logger.critical("Duplicate sample name detected: {}".format(sq_data.data.sample_name)) - sys.exit("Attempting to merge allele profiles with the same sample name: {}".format(sq_data.data.sample_name)) + raise ValueError("Attempting to merge allele profiles with the same sample name: {}".format(sq_data.data.sample_name)) return records def extract_profiles(records): @@ -191,7 +192,7 @@ def run_merge(config): if os.path.isdir(outdir) and not force: logger.critical(f'Error {outdir} exists, if you would like to overwrite, then specify --force') - sys.exit(errno.EEXIST) + raise FileExistsError(errno.EEXIST, os.strerror(errno.EEXIST), str(outdir)) if not os.path.isdir(outdir): os.makedirs(outdir, 0o755) diff --git a/locidex/report.py b/locidex/report.py index 5be3f32..35dec76 100644 --- a/locidex/report.py +++ b/locidex/report.py @@ -6,6 +6,8 @@ from datetime import datetime from dataclasses import dataclass, asdict, fields import pandas as pd +import logging +import errno from typing import Any from locidex.classes.seq_intake import seq_intake from locidex.constants import SEARCH_RUN_DATA, START_CODONS, STOP_CODONS, DBConfig @@ -14,6 +16,10 @@ +logger = logging.getLogger(__name__) +logging.basicConfig(filemode=sys.stderr, level=logging.DEBUG) + + @dataclass class Parameters: mode: str @@ -386,8 +392,8 @@ def run_report(config): run_data['parameters'] = analysis_parameters if os.path.isdir(outdir) and not force: - print(f'Error {outdir} exists, if you would like to overwrite, then specify --force') - sys.exit() + logger.critical(f'Error {outdir} exists, if you would like to overwrite, then specify --force') + raise FileExistsError(errno.EEXIST, os.strerror(errno.EEXIST), str(outdir)) if not os.path.isdir(outdir): os.makedirs(outdir, 0o755) @@ -397,7 +403,8 @@ def run_report(config): seq_store_dict = json.load(fh) if len(seq_store_dict) == 0: - sys.exit() + logger.critical("seq_store from file: {} is empty".format(input_file)) + raise ValueError("seq_store from file: {} is empty".format(input_file)) if sample_name is None: sample_name = seq_store_dict["query_data"]["sample_name"] @@ -408,20 +415,20 @@ def run_report(config): seq_info = seq_store_dict["query_data"]["query_seq_data"] seq_obj = seq_intake(fasta_file, 'fasta', 'CDS', translation_table, perform_annotation=False) if len(seq_info) != 
len(seq_obj.seq_data): - print(f'Error the supplied fasta file: {fasta_file} ({len(seq_obj.seq_data)}) seq_store file: {input_file} ({len(seq_info)}) \ + logger.critical(f'Error the supplied fasta file: {fasta_file} ({len(seq_obj.seq_data)}) seq_store file: {input_file} ({len(seq_info)}) \ do not have the same number of sequences. These files must be matched') - sys.exit() + raise ValueError(f"Supplied fasta and seq_store have different numbers of sequences: {str(fasta_file)}, {str(input_file)}") for i in range(0,len(seq_obj.seq_data)): id = str(i) if id not in seq_info: - print(f'Error {id} key from fasta file not in seq_store') - sys.exit() + logger.critical(f'Error {id} key from fasta file not in seq_store') + raise KeyError(f'Error {id} key from fasta file not in seq_store') pid_1 = seq_info[id]["seq_id"] pid_2 = seq_obj.seq_data[i]["seq_id"] if pid_1 != pid_2: - print(f'Error seq_store key for {id}: {pid_1} mismatched to input fasta {id}: {pid_2}. These files must be matched') - sys.exit() + logger.critical(f'Error seq_store key for {id}: {pid_1} mismatched to input fasta {id}: {pid_2}. These files must be matched') + raise KeyError(f'Error seq_store key for {id}: {pid_1} mismatched to input fasta {id}: {pid_2}. These files must be matched') seq_data[id] = seq_obj.seq_data[i] allele_obj = seq_reporter(seq_store_dict, method='nucleotide', mode=mode, label=label, filters={},max_ambig=max_ambig,max_int_stop=max_int_stop,match_ident=match_ident) diff --git a/locidex/search.py b/locidex/search.py index 2b4bd0b..802e7ff 100644 --- a/locidex/search.py +++ b/locidex/search.py @@ -9,7 +9,8 @@ from dataclasses import dataclass import pandas as pd from functools import partial - +import logging +import errno from locidex.classes.blast import BlastSearch, FilterOptions from locidex.classes.db import search_db_conf, db_config from locidex.manifest import DBData @@ -18,6 +19,9 @@ from locidex.utils import write_seq_dict, check_db_groups, slots from locidex.version import __version__ +logger = logging.getLogger(__name__) +logging.basicConfig(filemode=sys.stderr, level=logging.DEBUG) + def add_args(parser=None): if parser is None: parser = ArgumentParser( @@ -65,10 +69,6 @@ def add_args(parser=None): return parser -#def perform_search(query_file,results_file,db_path,blast_prog,blast_params,columns): -# return blast_search(db_path,query_file,results_file,blast_params,blast_prog,columns) - - def create_fasta_from_df(df,label_col,seq_col,out_file): return write_seq_dict(dict(zip(df[label_col].tolist(), df[seq_col])), out_file) @@ -144,8 +144,8 @@ def run_search(config): db_data = DBData(db_dir=db_dir) if os.path.isdir(outdir) and not force: - print(f'Error {outdir} exists, if you would like to overwrite, then specify --force') - sys.exit() + logger.critical(f'Error {outdir} exists, if you would like to overwrite, then specify --force') + raise FileExistsError(errno.EEXIST, os.strerror(errno.EEXIST), str(outdir)) if not os.path.isdir(outdir): os.makedirs(outdir, 0o755) @@ -160,10 +160,11 @@ def run_search(config): if format is None or format not in FILE_TYPES: if format is None: - print(f'Could not guess format for {query_file}') + logger.critical(f'Could not guess format for {query_file}') + raise ValueError("Could not determine input type for: {}".format(query_file)) else: - print(f'Format for query file must be one of {list(FILE_TYPES.keys())}, you supplied {format}') - sys.exit() + logger.critical(f'Format for query file must be one of {list(FILE_TYPES.keys())}, you supplied {format}') + raise 
ValueError("Format supplied for {} is incorrect.".format(format)) seq_obj = seq_intake(input_file=query_file, file_type=format, feat_key='CDS', translation_table=translation_table, perform_annotation=perform_annotation) From a2b577514744cc94252d1ad65598dcacc797ee5d Mon Sep 17 00:00:00 2001 From: Matthew Wells Date: Fri, 17 May 2024 13:14:23 -0500 Subject: [PATCH 3/7] updated error raising functions --- locidex/build.py | 4 ++-- locidex/classes/blast.py | 3 ++- locidex/classes/fasta.py | 8 +++++++- locidex/classes/gbk.py | 2 +- locidex/classes/seq_intake.py | 9 +++++++-- locidex/constants.py | 28 ++++++++++------------------ locidex/extract.py | 4 ++-- locidex/format.py | 5 +++-- locidex/manifest.py | 18 ++++++++++-------- locidex/merge.py | 15 ++++++++------- locidex/search.py | 2 +- 11 files changed, 53 insertions(+), 45 deletions(-) diff --git a/locidex/build.py b/locidex/build.py index b996147..0343705 100644 --- a/locidex/build.py +++ b/locidex/build.py @@ -7,7 +7,7 @@ from locidex.version import __version__ from locidex.constants import DBFiles from locidex.classes import run_command -from locidex.constants import DBConfig, MetadataFields +from locidex.constants import DBConfig, MetadataFields, raise_file_not_found_e from locidex.classes.blast import BlastMakeDB from locidex.manifest import DBData import getpass @@ -164,7 +164,7 @@ def run(cmd_args=None): if not os.path.isfile(input_file): logger.critical(f'Error {input_file} does not exist, please check path and try again') - raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), str(input_file)) + raise_file_not_found_e(input_file, logger) obj = locidex_build(Path(input_file), Path(outdir),config=config,seq_columns={'nucleotide':'dna_seq','protein':'aa_seq'},force=force) diff --git a/locidex/classes/blast.py b/locidex/classes/blast.py index c08c926..69219ff 100644 --- a/locidex/classes/blast.py +++ b/locidex/classes/blast.py @@ -52,7 +52,7 @@ def makeblastdb(self): logger.debug("Blast makedb stdout: {}".format(stdout)) if stderr: logger.debug("Blast makedb stderr: {}".format(stderr)) - + self.checkblastdb(self.output_db_path) return self.output_db_path @@ -142,6 +142,7 @@ def read_hit_table(self, blast_data): def _check_blast_files(self, db_dir: Path, extensions: frozenset): """ """ + logger.info("Verifying blast database: {}".format(str(db_dir))) extensions_ = set([i.suffix for i in db_dir.iterdir()]) if not extensions_.issuperset(extensions): raise ValueError("Missing required blast files. 
{}".format([i for i in extensions_ if i not in extensions])) diff --git a/locidex/classes/fasta.py b/locidex/classes/fasta.py index ada7536..5995b1f 100644 --- a/locidex/classes/fasta.py +++ b/locidex/classes/fasta.py @@ -4,8 +4,14 @@ from functools import partial import os from locidex.utils import calc_md5, slots +import logging +from locidex.constants import raise_file_not_found_e from dataclasses import dataclass from pathlib import Path +import sys + +logger = logging.getLogger(__name__) +logging.basicConfig(filemode=sys.stderr, level=logging.DEBUG) @dataclass class Fasta: @@ -24,7 +30,7 @@ class ParseFasta: def __init__(self, input_file: Path,parse_def=False,seq_type=None,delim="|"): self.input_file = input_file if not self.input_file.exists(): - raise FileNotFoundError("Input file: {} not found.".format(self.input_file)) + raise_file_not_found_e(self.input_file, logger) self.delim = delim self.seq_type = seq_type diff --git a/locidex/classes/gbk.py b/locidex/classes/gbk.py index a1a4443..5e6ba8b 100644 --- a/locidex/classes/gbk.py +++ b/locidex/classes/gbk.py @@ -6,11 +6,11 @@ from locidex.utils import revcomp,calc_md5 class parse_gbk: - status = True messages = [] def __init__(self,input_file): self.input_file= input_file + self.status = True if not os.path.isfile(self.input_file): self.messages.append(f'Error {self.input_file} does not exist') self.status = False diff --git a/locidex/classes/seq_intake.py b/locidex/classes/seq_intake.py index 7cb05f5..aeaaea7 100644 --- a/locidex/classes/seq_intake.py +++ b/locidex/classes/seq_intake.py @@ -5,10 +5,14 @@ from locidex.classes.fasta import ParseFasta from locidex.utils import guess_alphabet, calc_md5, six_frame_translation, slots from locidex.classes.prodigal import gene_prediction -from locidex.constants import DNA_AMBIG_CHARS, DNA_IUPAC_CHARS, CharacterConstants, DBConfig +from locidex.constants import DNA_AMBIG_CHARS, DNA_IUPAC_CHARS, CharacterConstants, DBConfig, raise_file_not_found_e +import logging from typing import NamedTuple, Optional, List from dataclasses import dataclass, asdict +logger = logging.getLogger(__name__) +logging.basicConfig(filemode=sys.stderr, level=logging.INFO) + @dataclass class HitFilters: min_dna_len: int @@ -50,7 +54,8 @@ class seq_intake: def __init__(self,input_file,file_type,feat_key='CDS',translation_table=11,perform_annotation=False,num_threads=1,skip_trans=False): if not input_file.exists(): - raise FileNotFoundError("File {} does not exist.".format(input_file)) + logger.critical("Seq_intake file {} could not be found".format(input_file)) + raise_file_not_found_e(input_file, logger) self.input_file = input_file self.file_type = file_type diff --git a/locidex/constants.py b/locidex/constants.py index a3a8ca3..d7e543d 100644 --- a/locidex/constants.py +++ b/locidex/constants.py @@ -1,6 +1,9 @@ from dataclasses import dataclass, asdict, fields import pathlib +import logging +import os +import errno from typing import Any, Union, NamedTuple, Optional DNA_AMBIG_CHARS = ['b', 'd', 'e', 'f', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 'u', 'v', 'w', 'x', @@ -23,24 +26,13 @@ class CharacterConstants: stop_codon: str = "*" -#BLAST_TABLE_COLS = ''' -#qseqid -#sseqid -#qlen -#slen -#qstart -#qend -#sstart -#send -#length -#mismatch -#pident -#qcovhsp -#qcovs -#sstrand -#evalue -#bitscore -#'''.strip().split('\n') +def raise_file_not_found_e(file: Union[str, pathlib.Path], logger: logging.Logger): + """ + Raise a file not found error. 
+ """ + logger.critical("File supplied does not exist: {}".format(str(file))) + raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), str(file)) + class BlastColumns(NamedTuple): qseqid: str diff --git a/locidex/extract.py b/locidex/extract.py index ad89af3..fd3c4b5 100644 --- a/locidex/extract.py +++ b/locidex/extract.py @@ -15,7 +15,7 @@ from locidex.manifest import DBData from locidex.classes.db import search_db_conf, db_config from locidex.classes.seq_intake import seq_intake, seq_store -from locidex.constants import SEARCH_RUN_DATA, FILE_TYPES, BlastColumns, BlastCommands, DBConfig, DB_EXPECTED_FILES, NT_SUB, EXTRACT_MODES, OPTION_GROUPS +from locidex.constants import SEARCH_RUN_DATA, FILE_TYPES, BlastColumns, BlastCommands, DBConfig, DB_EXPECTED_FILES, EXTRACT_MODES, raise_file_not_found_e from locidex.version import __version__ from locidex.classes.aligner import perform_alignment, aligner from locidex.utils import check_db_groups @@ -182,7 +182,7 @@ def run_extract(config): nt_db = Path("{}.fasta".format(db_data.nucleotide_blast_db)) if not nt_db.exists(): logger.critical("Could not find file: {}".format(nt_db)) - raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), str(nt_db)) + raise_file_not_found_e(nt_db, logger) filter_options = { 'evalue': FilterOptions(min=None, max=min_evalue, include=None), diff --git a/locidex/format.py b/locidex/format.py index d2395fa..3b412a6 100644 --- a/locidex/format.py +++ b/locidex/format.py @@ -16,7 +16,7 @@ from Bio.Seq import Seq from pyrodigal import GeneFinder -from locidex.constants import FILE_TYPES, LocidexDBHeader, CharacterConstants +from locidex.constants import FILE_TYPES, LocidexDBHeader, CharacterConstants, raise_file_not_found_e from locidex.utils import six_frame_translation, revcomp, calc_md5 from locidex.version import __version__ @@ -234,7 +234,8 @@ def run(cmd_args=None): if not os.path.isdir(input) and not os.path.isfile(input): logger.critical(f'Error {input} does not exist as a file or directory') - raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), str(input)) + raise_file_not_found_e(input, logger) + logger.info("Beginning format operation.") obj = locidex_format(input=input,header=LocidexDBHeader._fields,is_protein=is_coding,min_len_frac=min_len_frac,max_len_frac=max_len_frac, min_ident_perc=min_ident, min_cov_perc=min_match_cov,trans_table=trans_table,valid_ext=FILE_TYPES['fasta']) diff --git a/locidex/manifest.py b/locidex/manifest.py index 1f8d0ea..16db42f 100644 --- a/locidex/manifest.py +++ b/locidex/manifest.py @@ -8,7 +8,7 @@ from argparse import (ArgumentParser, ArgumentDefaultsHelpFormatter, RawDescriptionHelpFormatter) from datetime import datetime from locidex.version import __version__ -from locidex.constants import DBConfig, DBFiles +from locidex.constants import DBConfig, DBFiles, raise_file_not_found_e import logging import errno @@ -58,14 +58,14 @@ def nucleotide_name(cls): def nucleotide_blast_db(self): if self.nucleotide is None: logger.critical("Nucleotide blast database does not exist.") - raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), str(self.nucleotide)) + raise_file_not_found_e(self.nucleotide, logger) return self.nucleotide / self.__nucleotide_path @property def protein_blast_db(self): if self.protein is None: logger.critical("Protein blast database does not exist.") - raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), str(self.protein)) + raise_file_not_found_e(self.protein, logger) return self.protein / self.__protein_path def 
_get_config(self, db_dir: pathlib.Path) -> DBConfig: @@ -78,7 +78,7 @@ def _get_metadata(self, db_dir: pathlib.Path) -> dict: metadata_file = db_dir.joinpath(DBFiles.meta_file) if not metadata_file.exists(): logger.critical("Metadata file does not appear to exist in db: {}".format(db_dir)) - raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), str(metadata_file)) + raise_file_not_found_e(str(metadata_file), logger) md_data = None with open(metadata_file, 'r') as md: md_data = json.load(md) @@ -96,12 +96,12 @@ def _get_blast_dbs(self, db_dir: pathlib.Path, config_data: DBConfig) -> Tuple[O nucleotide = blast_db.joinpath(self.__nucleotide_path) if not nucleotide.exists(): logger.critical("Cannot find nucleotide database, but it should exist. {}".format(nucleotide)) - raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), str(nucleotide)) + raise_file_not_found_e(nucleotide, logger) if config_data.is_prot: protein = blast_db.joinpath(self.__protein_path) if not protein.exists(): logger.critical("Cannot find protein database, but it should exist. {}".format(protein)) - raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), str(protein)) + raise_file_not_found_e(protein, logger) return nucleotide, protein @@ -171,7 +171,9 @@ def check_config(directory: pathlib.Path) -> DBConfig: config_dir = pathlib.Path(directory).joinpath(DBFiles.config_file) config_data: Optional[DBConfig] = None if not config_dir.exists(): - raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), str(config_dir)) + logger.critical("Could not find config file: {}".format(config_dir)) + raise_file_not_found_e(config_dir, logger) + with open(config_dir, 'r') as conf: config_data = DBConfig(**json.load(conf)) for k, v in config_data.to_dict().items(): @@ -193,7 +195,7 @@ def validate_db_files(allele_dir: List[pathlib.Path], file_in: pathlib.Path) -> for k, v in DBFiles.items(): if not pathlib.Path(a_dir / v).exists(): logger.critical("Required file {} does not exist.".format(k)) - raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), str(a_dir / v)) + raise_file_not_found_e(str(a_dir / v), logger) db_configs.append((a_dir.relative_to(file_in), check_config(a_dir))) return db_configs diff --git a/locidex/merge.py b/locidex/merge.py index c4f9182..57919d2 100644 --- a/locidex/merge.py +++ b/locidex/merge.py @@ -12,7 +12,7 @@ import logging import pandas as pd from locidex.classes.aligner import align, parse_align -from locidex.constants import DBConfig +from locidex.constants import DBConfig, raise_file_not_found_e from locidex.report import ReportData, Data, Parameters from locidex.version import __version__ @@ -51,16 +51,17 @@ def get_file_list(input_files): file_list = input_files else: if not os.path.isfile(input_files[0]): - logger.critical(f'Error the supplied file {input_files[0]} does not exist') - raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), str(input_files[0])) + logger.critical("File {} does not exists".format(input_files[0])) + raise_file_not_found_e(input_files[0], logger=logger) + encoding = guess_type(input_files[0])[1] _open = partial(gzip.open, mode='rt') if encoding == 'gzip' else open with _open(input_files[0]) as f: for line in f: line = line.rstrip() if not os.path.isfile(line): - logger.critical(f'Error the supplied file {line} does not exist') - raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), str(line)) + logger.critical("Could not find file: {}".format(line)) + raise_file_not_found_e(line, logger) 
file_list.append(line) return file_list @@ -93,8 +94,8 @@ def check_files_exist(file_list: list[os.PathLike]) -> None: """ for file in file_list: if not os.path.isfile(file): - logger.critical(f"Error cannot open input file {file}") - raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), str(file)) + logger.critical("Could not find file: {}".format(file)) + raise_file_not_found_e(file, logger) def read_file_list(file_list,perform_validation=False): diff --git a/locidex/search.py b/locidex/search.py index 802e7ff..2009626 100644 --- a/locidex/search.py +++ b/locidex/search.py @@ -15,7 +15,7 @@ from locidex.classes.db import search_db_conf, db_config from locidex.manifest import DBData from locidex.classes.seq_intake import seq_intake, seq_store, HitFilters -from locidex.constants import BlastCommands, SEARCH_RUN_DATA, FILE_TYPES, BlastColumns, DB_EXPECTED_FILES, OPTION_GROUPS, DBConfig +from locidex.constants import BlastCommands, SEARCH_RUN_DATA, FILE_TYPES, BlastColumns, DB_EXPECTED_FILES, OPTION_GROUPS, DBConfig from locidex.utils import write_seq_dict, check_db_groups, slots from locidex.version import __version__ From fbf2914639d970ef5b03faf58dc30e35d26391a4 Mon Sep 17 00:00:00 2001 From: Matthew Wells Date: Fri, 17 May 2024 13:25:36 -0500 Subject: [PATCH 4/7] updated merge tests --- tests/test_merge.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_merge.py b/tests/test_merge.py index 7fe1066..74a1066 100644 --- a/tests/test_merge.py +++ b/tests/test_merge.py @@ -16,7 +16,7 @@ def test_throws_duplicate_error(): """ """ - with pytest.raises(SystemExit, match="Attempting to merge allele profiles with the same sample name: 1"): + with pytest.raises(ValueError, match="Attempting to merge allele profiles with the same sample name: 1"): merge.read_file_list(DUPLICATE_NAMES, perform_validation=True) def test_merge_profiles_no_error(): @@ -24,7 +24,7 @@ def test_merge_profiles_no_error(): def test_check_files_exist(): - with pytest.raises(SystemExit, match=str(errno.ENOENT)): + with pytest.raises(FileNotFoundError, match=str(errno.ENOENT)): fail = "dne.txt" merge.check_files_exist([fail]) From 574e7c636c762c3694649430641682b1163fca6f Mon Sep 17 00:00:00 2001 From: Matthew Wells Date: Tue, 21 May 2024 15:05:20 -0500 Subject: [PATCH 5/7] removed print statements --- locidex/extract.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/locidex/extract.py b/locidex/extract.py index fd3c4b5..8f1283c 100644 --- a/locidex/extract.py +++ b/locidex/extract.py @@ -100,7 +100,7 @@ def run_extract(config): if not mode in EXTRACT_MODES: - print(f'Provided mode for allele extraction is not valid: {mode}, needs to be one of (snps, trim, extend, raw)') + logger.critical('Provided mode for allele extraction is not valid: {}, needs to be one of ({})'.format(mode, ", ".join(EXTRACT_MODES))) sys.exit() if sample_name == None: From c186657fb8e240348535897636e0c31e4f74f57c Mon Sep 17 00:00:00 2001 From: Matthew Wells Date: Tue, 21 May 2024 15:25:03 -0500 Subject: [PATCH 6/7] updated exit status exit codes --- .gitignore | 3 ++- locidex/main.py | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index 91b0132..16ac543 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,5 @@ __pycache__ .vscode .pytest_cache tmp -.coverage* \ No newline at end of file +.coverage* +error.txt \ No newline at end of file diff --git a/locidex/main.py b/locidex/main.py index 8e9761d..59e5e6e 100644 --- a/locidex/main.py +++ 
b/locidex/main.py @@ -43,8 +43,9 @@ def main(argv=None): main() except Exception as e: with open(error_file, "w") as f: - traceback.TracebackException.from_exception(e).print(file=f) + f.write(traceback.format_exc()) + error_number = e.errno if hasattr(e, "errno") else -1 logger.critical("Program exited with errors, please review logs. For the full traceback please see file: {}".format(error_file)) - sys.exit(-1) + raise SystemExit(error_number) else: sys.exit("Program finished without errors.") \ No newline at end of file From fd3bb1c2998b94a2cc29ba09e380a33f4e7947b1 Mon Sep 17 00:00:00 2001 From: Matthew Wells Date: Wed, 22 May 2024 09:56:07 -0500 Subject: [PATCH 7/7] fixed conservative mode key error --- locidex/report.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/locidex/report.py b/locidex/report.py index 35dec76..f3a0680 100644 --- a/locidex/report.py +++ b/locidex/report.py @@ -132,7 +132,7 @@ def filter_queries(self): if self.mode == 'conservative': count_internal_stop = self.query_seq_data[seq_id]['count_internal_stop'] start_codon = self.query_seq_data[seq_id]["start_codon"] - stop_codon = self.query_seq_data[seq_id]["stop_codon"] + stop_codon = self.query_seq_data[seq_id]["end_codon"] if start_codon not in START_CODONS or stop_codon not in STOP_CODONS or count_internal_stop > 0: failed_seqids.add(seq_id)
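
Taken together, the series replaces print()/sys.exit() error paths with module-level loggers, errno-carrying built-in exceptions (including the shared raise_file_not_found_e helper added to locidex/constants.py in PATCH 3/7), and a top-level handler in locidex/main.py that writes the traceback to error.txt and exits with the exception's errno. Below is a minimal, standalone Python sketch of that pattern for reference only; the run() task and the config.json filename are illustrative assumptions rather than locidex code, and logging here is configured through basicConfig's stream argument (the filemode argument is only consulted when filename is also given).

import errno
import logging
import os
import pathlib
import sys
import traceback

logger = logging.getLogger(__name__)
# basicConfig attaches the root handler to a stream via stream=;
# filemode only applies together with filename=.
logging.basicConfig(stream=sys.stderr, level=logging.INFO)

ERROR_FILE = "error.txt"

def raise_file_not_found_e(file, logger: logging.Logger):
    """Log and raise a FileNotFoundError that carries a proper errno."""
    logger.critical("File supplied does not exist: {}".format(file))
    raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), str(file))

def run(db_dir: pathlib.Path):
    """Illustrative task: require a config.json (hypothetical name) inside db_dir."""
    config = db_dir.joinpath("config.json")
    if not config.exists():
        raise_file_not_found_e(config, logger)
    logger.info("Found config: %s", config)

def main(argv=None):
    try:
        run(pathlib.Path(argv[0] if argv else "."))
    except Exception as e:
        # Dump the full traceback to a file, log a pointer to it, and exit with
        # the exception's errno when one is attached, mirroring locidex/main.py.
        with open(ERROR_FILE, "w") as f:
            f.write(traceback.format_exc())
        logger.critical("Program exited with errors, see {} for the full traceback.".format(ERROR_FILE))
        raise SystemExit(getattr(e, "errno", None) or -1)
    logger.info("Program finished without errors.")

if __name__ == "__main__":
    main(sys.argv[1:])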