From a8b726b903a31c66c0f8c90726bd3550bc535e9c Mon Sep 17 00:00:00 2001
From: Matthew Wells
Date: Mon, 13 May 2024 10:07:06 -0500
Subject: [PATCH 1/4] adding logging functionality

---
 locidex/merge.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/locidex/merge.py b/locidex/merge.py
index 1c7ccf4..cb41386 100644
--- a/locidex/merge.py
+++ b/locidex/merge.py
@@ -8,10 +8,13 @@ from functools import partial
 from mimetypes import guess_type
 from multiprocessing import Pool, cpu_count
+import logging
 
 import pandas as pd
 
 from locidex.classes.aligner import align, parse_align
 from locidex.version import __version__
 
+logger = logging.getLogger(__name__)
+logging.basicConfig(stream=sys.stderr, level=logging.INFO)
 
 def add_args(parser=None):
     if parser is None:
@@ -42,7 +45,7 @@ def get_file_list(input_files):
         file_list = input_files
     else:
         if not os.path.isfile(input_files[0]):
-            print(f'Error the supplied file {input_files[0]} does not exist')
+            logger.critical(f'Error the supplied file {input_files[0]} does not exist')
             sys.exit()
         encoding = guess_type(input_files[0])[1]
         _open = partial(gzip.open, mode='rt') if encoding == 'gzip' else open
@@ -50,7 +53,7 @@ def get_file_list(input_files):
         for line in f:
             line = line.rstrip()
             if not os.path.isfile(line):
-                print(f'Error the supplied file {line} does not exist')
+                logger.critical(f'Error the supplied file {line} does not exist')
                 sys.exit()
             file_list.append(line)
     return file_list
@@ -60,8 +63,8 @@ def read_file_list(file_list,perform_validation=False):
     db_info = {}
     for f in file_list:
         if not os.path.isfile(f):
-            print(f"Error cannot open input file {f}")
-            sys.exit
+            logger.critical(f"Error cannot open input file {f}")
+            sys.exit()
         encoding = guess_type(f)[1]
         _open = partial(gzip.open, mode='rt') if encoding == 'gzip' else open
         with _open(f) as fh:

From 2ac558721c120f122997c3137132be6e400de9fb Mon Sep 17 00:00:00 2001
From: Matthew Wells
Date: Mon, 13 May 2024 15:10:53 -0500
Subject: [PATCH 2/4] updated locidex merge and unified classes between report
 and merge

---
 locidex/example/merge/merge_in/report.json  |  36 +++
 locidex/example/merge/merge_in/report1.json |  36 +++
 locidex/example/merge/merge_in/report2.json |  36 +++
 locidex/example/merge/merge_out/profile.tsv |   3 +
 locidex/example/merge/merge_out/run.json    |  24 ++
 locidex/merge.py                            | 250 ++++++++++++--------
 locidex/report.py                           | 101 ++++++--
 tests/test_merge.py                         |  23 ++
 8 files changed, 380 insertions(+), 129 deletions(-)
 create mode 100644 locidex/example/merge/merge_in/report.json
 create mode 100644 locidex/example/merge/merge_in/report1.json
 create mode 100644 locidex/example/merge/merge_in/report2.json
 create mode 100644 locidex/example/merge/merge_out/profile.tsv
 create mode 100644 locidex/example/merge/merge_out/run.json
 create mode 100644 tests/test_merge.py

diff --git a/locidex/example/merge/merge_in/report.json b/locidex/example/merge/merge_in/report.json
new file mode 100644
index 0000000..8030e0e
--- /dev/null
+++ b/locidex/example/merge/merge_in/report.json
@@ -0,0 +1,36 @@
+{
+    "db_info": {
+        "db_name": "Locidex Database",
+        "db_version": "1.0.0",
+        "db_date": "2024/30/04",
+        "db_author": "mw",
+        "db_desc": "test",
+        "db_num_seqs": 53,
+        "is_nucl": true,
+        "is_prot": true,
+        "nucleotide_db_name": "nucleotide",
+        "protein_db_name": "protein"
+    },
+    "parameters": {
+        "mode": "normal",
+        "min_match_ident": 100,
+        "min_match_cov": 100,
+        "max_ambiguous": 0,
+        "max_internal_stops": 0
+    },
+    "data": {
+        "sample_name": "1",
+        "profile": {
+            "1": {
+                "aroC": "9048803cd72dee3c868cd2dc5dc5650d",
"9048803cd72dee3c868cd2dc5dc5650d", + "dnaN": "2772ad8b8e0f7b50f1396c31fbe53f2d", + "hemD": "620f99723c4e190abe096b11ca34b944", + "hisD": "38027ac1ac34817584a176c7e575e97e", + "purE": "9855cbf4009439498bf84cacefce4d8f", + "sucA": "9289fc07cc8e93cfe0716e6f613cefdb", + "thrA": "9e1aa76bb42279ed7ec8fc30f984b65d" + } + }, + "seq_data": {} + } +} \ No newline at end of file diff --git a/locidex/example/merge/merge_in/report1.json b/locidex/example/merge/merge_in/report1.json new file mode 100644 index 0000000..f71297a --- /dev/null +++ b/locidex/example/merge/merge_in/report1.json @@ -0,0 +1,36 @@ +{ + "db_info": { + "db_name": "Locidex Database", + "db_version": "1.0.0", + "db_date": "2024/30/04", + "db_author": "mw", + "db_desc": "test", + "db_num_seqs": 53, + "is_nucl": true, + "is_prot": true, + "nucleotide_db_name": "nucleotide", + "protein_db_name": "protein" + }, + "parameters": { + "mode": "normal", + "min_match_ident": 100, + "min_match_cov": 100, + "max_ambiguous": 0, + "max_internal_stops": 0 + }, + "data": { + "sample_name": "2", + "profile": { + "2": { + "aroC": "9048803cd72dee3c868cd2dc5dc5650d", + "dnaN": "2772ad8b8e0f7b50f1396c31fbe53f2d", + "hemD": "620f99723c4e190abe096b11ca34b944", + "hisD": "38027ac1ac34817584a176c7e575e97e", + "purE": "9855cbf4009439498bf84cacefce4d8f", + "sucA": "9289fc07cc8e93cfe0716e6f613cefdb", + "thrA": "9e1aa76bb42279ed7ec8fc30f984b65d" + } + }, + "seq_data": {} + } +} \ No newline at end of file diff --git a/locidex/example/merge/merge_in/report2.json b/locidex/example/merge/merge_in/report2.json new file mode 100644 index 0000000..8030e0e --- /dev/null +++ b/locidex/example/merge/merge_in/report2.json @@ -0,0 +1,36 @@ +{ + "db_info": { + "db_name": "Locidex Database", + "db_version": "1.0.0", + "db_date": "2024/30/04", + "db_author": "mw", + "db_desc": "test", + "db_num_seqs": 53, + "is_nucl": true, + "is_prot": true, + "nucleotide_db_name": "nucleotide", + "protein_db_name": "protein" + }, + "parameters": { + "mode": "normal", + "min_match_ident": 100, + "min_match_cov": 100, + "max_ambiguous": 0, + "max_internal_stops": 0 + }, + "data": { + "sample_name": "1", + "profile": { + "1": { + "aroC": "9048803cd72dee3c868cd2dc5dc5650d", + "dnaN": "2772ad8b8e0f7b50f1396c31fbe53f2d", + "hemD": "620f99723c4e190abe096b11ca34b944", + "hisD": "38027ac1ac34817584a176c7e575e97e", + "purE": "9855cbf4009439498bf84cacefce4d8f", + "sucA": "9289fc07cc8e93cfe0716e6f613cefdb", + "thrA": "9e1aa76bb42279ed7ec8fc30f984b65d" + } + }, + "seq_data": {} + } +} \ No newline at end of file diff --git a/locidex/example/merge/merge_out/profile.tsv b/locidex/example/merge/merge_out/profile.tsv new file mode 100644 index 0000000..6044081 --- /dev/null +++ b/locidex/example/merge/merge_out/profile.tsv @@ -0,0 +1,3 @@ +sample_id aroC dnaN hemD hisD purE sucA thrA +1 9048803cd72dee3c868cd2dc5dc5650d 2772ad8b8e0f7b50f1396c31fbe53f2d 620f99723c4e190abe096b11ca34b944 38027ac1ac34817584a176c7e575e97e 9855cbf4009439498bf84cacefce4d8f 9289fc07cc8e93cfe0716e6f613cefdb 9e1aa76bb42279ed7ec8fc30f984b65d +2 9048803cd72dee3c868cd2dc5dc5650d 2772ad8b8e0f7b50f1396c31fbe53f2d 620f99723c4e190abe096b11ca34b944 38027ac1ac34817584a176c7e575e97e 9855cbf4009439498bf84cacefce4d8f 9289fc07cc8e93cfe0716e6f613cefdb 9e1aa76bb42279ed7ec8fc30f984b65d diff --git a/locidex/example/merge/merge_out/run.json b/locidex/example/merge/merge_out/run.json new file mode 100644 index 0000000..e4f8448 --- /dev/null +++ b/locidex/example/merge/merge_out/run.json @@ -0,0 +1,24 @@ +{ + "analysis_start_time": "13/05/2024 
14:53:14", + "parameters": { + "command": "merge", + "input": [ + [ + "./locidex/example/merge/merge_in/report.json", + "./locidex/example/merge/merge_in/report1.json" + ] + ], + "outdir": "./locidex/example/merge/merge_out/", + "n_threads": 1, + "linker": "NNNNNNNNNNNNNNNNNNNN", + "strict": false, + "align": false, + "force": true + }, + "result_file": "./locidex/example/merge/merge_out/profile.tsv", + "count_valid_loci": 0, + "count_invalid_loci": 0, + "valid_loci": "", + "invalid_loci": "", + "analysis_end_time": "13/05/2024 14:53:15" +} \ No newline at end of file diff --git a/locidex/merge.py b/locidex/merge.py index cb41386..5c79d3b 100644 --- a/locidex/merge.py +++ b/locidex/merge.py @@ -3,6 +3,7 @@ import os import re import sys +import errno from argparse import (ArgumentParser, ArgumentDefaultsHelpFormatter, RawDescriptionHelpFormatter) from datetime import datetime from functools import partial @@ -11,26 +12,31 @@ import logging import pandas as pd from locidex.classes.aligner import align, parse_align +from locidex.constants import DBConfig +from locidex.report import ReportData, Data, Parameters from locidex.version import __version__ logger = logging.getLogger(__name__) logging.basicConfig(filemode=sys.stderr, level=logging.INFO) def add_args(parser=None): + """ + TODO disabling alignment until test data is prepared + """ if parser is None: parser = ArgumentParser( description="Locidex merge: Concatonate set of input profile.json files into a tsv table or aligned fasta") parser.add_argument('-i','--input', type=str, required=True,help='Input file to report', action='append', nargs='+') parser.add_argument('-o', '--outdir', type=str, required=True, help='Output file to put results') - parser.add_argument('--n_threads','-t', type=int, required=False, - help='CPU Threads to use', default=1) - parser.add_argument('--linker','-l', type=str, required=False, - help='Linker sequence for alignment', default='NNNNNNNNNNNNNNNNNNNN') + #parser.add_argument('--n_threads','-t', type=int, required=False, + # help='CPU Threads to use', default=1) + #parser.add_argument('--linker','-l', type=str, required=False, + # help='Linker sequence for alignment', default='NNNNNNNNNNNNNNNNNNNN') parser.add_argument('-V', '--version', action='version', version="%(prog)s " + __version__) parser.add_argument('-s', '--strict', required=False, help='Only merge data produces by the same db', action='store_true') - parser.add_argument('-a', '--align', required=False, help='Perform alignment with individual loci to produce a concatenated alignment', - action='store_true') + #parser.add_argument('-a', '--align', required=False, help='Perform alignment with individual loci to produce a concatenated alignment', + # action='store_true') parser.add_argument('-f', '--force', required=False, help='Overwrite existing directory', action='store_true') return parser @@ -46,7 +52,7 @@ def get_file_list(input_files): else: if not os.path.isfile(input_files[0]): logger.critical(f'Error the supplied file {input_files[0]} does not exist') - sys.exit() + sys.exit(errno.ENOENT) encoding = guess_type(input_files[0])[1] _open = partial(gzip.open, mode='rt') if encoding == 'gzip' else open with _open(input_files[0]) as f: @@ -54,47 +60,80 @@ def get_file_list(input_files): line = line.rstrip() if not os.path.isfile(line): logger.critical(f'Error the supplied file {line} does not exist') - sys.exit() + sys.exit(errno.ENOENT) file_list.append(line) return file_list +def validate_input_file(data_in: dict, db_version: str, db_name: 
+    """
+    Validate input data for usage, verifying that the db_version and db_name are the same across files
+    """
+
+    try:
+        sq_data = ReportData.deseriealize(data_in)
+    except KeyError:
+        logger.critical("Missing required fields in reported allele file. Fields required: {}".format(ReportData.fields()))
+        sys.exit()
+    else:
+
+        if db_version is not None and sq_data.db_info.db_version != db_version and perform_validation:
+            logger.critical("You are attempting to merge files that were created using different database versions.")
+            sys.exit()
+
+        if db_name is not None and sq_data.db_info.db_name != db_name and perform_validation:
+            logger.critical("You are attempting to merge files that have different names.")
+            sys.exit()
+
+    return sq_data, sq_data.db_info.db_version, sq_data.db_info.db_name
+
+def check_files_exist(file_list: list[os.PathLike]) -> None:
+    """
+    Verify that all files to be analyzed exist
+    """
+    for file in file_list:
+        if not os.path.isfile(file):
+            logger.critical(f"Error cannot open input file {file}")
+            sys.exit(errno.ENOENT)
+
+
 def read_file_list(file_list,perform_validation=False):
     records = {}
-    db_info = {}
+    db_version = None
+    db_name = None
+
+    check_files_exist(file_list)
+
     for f in file_list:
-        if not os.path.isfile(f):
-            logger.critical(f"Error cannot open input file {f}")
-            sys.exit()
         encoding = guess_type(f)[1]
         _open = partial(gzip.open, mode='rt') if encoding == 'gzip' else open
         with _open(f) as fh:
             data = json.load(fh)
-            if 'db_info' not in data:
-                print("Error malformed or invalid input file {}, missing 'db_info'".format(f))
-                sys.exit()
-            if len(db_info) == 0:
-                db_info = data['db_info']
-            if db_info["db_name"] != data['db_info']["db_name"] or \
-                db_info["db_version"] != data['db_info']["db_version"]:
-                print("Error you are attempting to merge files generated with different databases {} vs. {}: {}".format(db_info,f,data['db_info']))
{}: {}".format(db_info,f,data['db_info'])) - sys.exit() - records[data['data']['sample_name']] = data + sq_data, db_version, db_name = validate_input_file(data, + db_version=db_version, + db_name=db_name, + perform_validation=perform_validation) + + sample_name = sq_data.data.sample_name + if records.get(sq_data.data.sample_name) is None: + records[sample_name] = sq_data + else: + logger.critical("Duplicate sample name detected: {}".format(sq_data.data.sample_name)) + sys.exit("Attempting to merge allele profiles with the same sample name: {}".format(sq_data.data.sample_name)) return records def extract_profiles(records): profile = {} for id in records: - for sample_name in records[id]['data']['profile']: - profile[sample_name] = records[id]['data']['profile'][sample_name] + for sample_name in records[id].data.profile: + if profile.get(sample_name) is not None: + logger.critical("Sample {} already exists and will not be added.") + profile[sample_name] = records[id].data.profile[sample_name] return profile def extract_seqs(records): seqs = {} for id in records: - if not 'seq_data' in records[id]['data']: - continue - - seqs[id] = records[id]['data']['seq_data'] + seqs[id] = records[id].data.seq_data return seqs def write_gene_fastas(seq_data,work_dir): @@ -134,9 +173,12 @@ def run_merge(config): #Input Parameters input_files = config['input'][0] outdir = config['outdir'] - perform_align = config['align'] - linker_seq = config['linker'] - n_threads = config['n_threads'] + ### + # Commented out as these changes will require test data + # perform_align = config['align'] + # linker_seq = config['linker'] + # n_threads = config['n_threads'] + ### force = config['force'] validate_db = config['strict'] if validate_db is None or validate_db == '': @@ -148,8 +190,8 @@ def run_merge(config): run_data['parameters'] = analysis_parameters if os.path.isdir(outdir) and not force: - print(f'Error {outdir} exists, if you would like to overwrite, then specify --force') - sys.exit() + logger.critical(f'Error {outdir} exists, if you would like to overwrite, then specify --force') + sys.exit(errno.EEXIST) if not os.path.isdir(outdir): os.makedirs(outdir, 0o755) @@ -160,80 +202,82 @@ def run_merge(config): #create profile df = pd.DataFrame.from_dict(extract_profiles(records), orient='index') - df.insert(loc=0, - column='sample_id', - value=df.index.tolist()) + df.insert(loc=0, column='sample_id', value=df.index.tolist()) df.to_csv(os.path.join(outdir,'profile.tsv'),index=False,header=True,sep="\t") - sample_names = list(df['sample_id']) + del(df) run_data['result_file'] = os.path.join(outdir,"profile.tsv") - + #create alignment - if perform_align and len(records) > 1: - pass - work_dir = os.path.join(outdir,"raw_gene_fastas") - if not os.path.isdir(work_dir): - os.makedirs(work_dir, 0o755) - - seq_data = extract_seqs(records) - gene_files = write_gene_fastas(seq_data,work_dir) - del(records) - del(seq_data) - pool = Pool(processes=n_threads) - - results = [] - for locus_name in gene_files: - results.append(pool.apply_async(align, args=((gene_files[locus_name]['file'],)))) - - pool.close() - pool.join() - - r = [] - for x in results: - if isinstance(x, dict): - r.append(x) - else: - r.append(x.get()) - results = r - loci_names = list(gene_files.keys()) - alignment = {} - - - for i in range(0,len(results)): - alignment[loci_names[i]] = parse_align(results[i][0]) - results[i] = None - del(results) - - loci_lengths = {} - for sample_id in sample_names: - for locus_name in loci_names: - if sample_id not in 
-                    continue
-                loci_lengths[locus_name] = len(alignment[locus_name][sample_id])
-
-
-        out_align = os.path.join(outdir,'loci_alignment.fas')
-        oh = open(out_align,'w')
-        invalid_loci = set()
-        for sample_id in sample_names:
-            seq = []
-            for locus_name in loci_names:
-                if locus_name not in loci_lengths:
-                    invalid_loci.add(locus_name)
-                    continue
-                if sample_id in alignment[locus_name]:
-                    seq.append(alignment[locus_name][sample_id])
-                else:
-                    seq.append(''.join(['-']*loci_lengths[locus_name]))
-                seq.append(linker_seq)
-            oh.write('>{}\n{}\n'.format(sample_id,"".join(seq)))
-        oh.close()
-        run_data['alignment_file'] = out_align
-
-    run_data['count_valid_loci'] = len(loci_lengths.keys())
-    run_data['count_invalid_loci'] = len(list(invalid_loci))
-    run_data['valid_loci'] = ",".join(list(loci_lengths.keys()))
-    run_data['invalid_loci'] = ",".join(list(invalid_loci))
+    # TODO Bring this back in when test data is provided
+    #sample_names = list(df['sample_id'])
+    #loci_lengths = {}
+    #invalid_loci = set()
+    #if perform_align and len(records) > 1:
+    #    pass
+    #    work_dir = os.path.join(outdir,"raw_gene_fastas")
+    #    if not os.path.isdir(work_dir):
+    #        os.makedirs(work_dir, 0o755)
+    #
+    #    seq_data = extract_seqs(records)
+    #    gene_files = write_gene_fastas(seq_data,work_dir)
+    #    del(records)
+    #    del(seq_data)
+    #    pool = Pool(processes=n_threads)
+
+    #    results = []
+    #    for locus_name in gene_files:
+    #        results.append(pool.apply_async(align, args=((gene_files[locus_name]['file'],))))
+
+    #    pool.close()
+    #    pool.join()
+
+    #    r = []
+    #    for x in results:
+    #        if isinstance(x, dict):
+    #            r.append(x)
+    #        else:
+    #            r.append(x.get())
+    #    results = r
+    #    loci_names = list(gene_files.keys())
+    #    alignment = {}
+    #
+
+    #    for i in range(0,len(results)):
+    #        alignment[loci_names[i]] = parse_align(results[i][0])
+    #        results[i] = None
+    #    del(results)
+
+    #
+    #    for sample_id in sample_names:
+    #        for locus_name in loci_names:
+    #            if sample_id not in alignment[locus_name]:
+    #                continue
+    #            loci_lengths[locus_name] = len(alignment[locus_name][sample_id])
+
+    #    out_align = os.path.join(outdir,'loci_alignment.fas')
+    #    oh = open(out_align,'w')
+    #
+    #    for sample_id in sample_names:
+    #        seq = []
+    #        for locus_name in loci_names:
+    #            if locus_name not in loci_lengths:
+    #                invalid_loci.add(locus_name)
+    #                continue
+    #            if sample_id in alignment[locus_name]:
+    #                seq.append(alignment[locus_name][sample_id])
+    #            else:
+    #                seq.append(''.join(['-']*loci_lengths[locus_name]))
+    #            seq.append(linker_seq)
+    #        oh.write('>{}\n{}\n'.format(sample_id,"".join(seq)))
+    #    oh.close()
+    #    run_data['alignment_file'] = out_align
+
+    #run_data['count_valid_loci'] = len(loci_lengths.keys())
+    #run_data['count_invalid_loci'] = len(list(invalid_loci))
+    #run_data['valid_loci'] = ",".join(list(loci_lengths.keys()))
+    #run_data['invalid_loci'] = ",".join(list(invalid_loci))
 
     run_data['analysis_end_time'] = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
     with open(os.path.join(outdir,"run.json"),'w' ) as fh:
         fh.write(json.dumps(run_data, indent=4))
diff --git a/locidex/report.py b/locidex/report.py
index ac4469b..5be3f32 100644
--- a/locidex/report.py
+++ b/locidex/report.py
@@ -4,14 +4,63 @@ from argparse import (ArgumentParser, ArgumentDefaultsHelpFormatter, RawDescriptionHelpFormatter)
 from copy import deepcopy
 from datetime import datetime
-
+from dataclasses import dataclass, asdict, fields
 import pandas as pd
+from typing import Any
 
 from locidex.classes.seq_intake import seq_intake
-from locidex.constants import SEARCH_RUN_DATA, START_CODONS, STOP_CODONS
+from locidex.constants import SEARCH_RUN_DATA, START_CODONS, STOP_CODONS, DBConfig
 from locidex.utils import calc_md5
 from locidex.version import __version__
 
+
+@dataclass
+class Parameters:
+    mode: str
+    min_match_ident: str
+    min_match_cov: str
+    max_ambiguous: str
+    max_internal_stops: str
+
+@dataclass
+class Data:
+    sample_name: str
+    profile: dict
+    seq_data: dict
+
+    def __getitem__(self, name: str) -> Any:
+        return getattr(self, str(name))
+
+    def __setitem__(self, key: str, value: str) -> None:
+        setattr(self, key, value)
+
+@dataclass
+class ReportData:
+    db_info: DBConfig
+    parameters: Parameters
+    data: Data
+
+    def __getitem__(self, name: str) -> Any:
+        return getattr(self, str(name))
+
+    def __setitem__(self, key: str, value: str) -> None:
+        setattr(self, key, value)
+
+    @classmethod
+    def fields(cls):
+        return fields(cls)
+
+    @classmethod
+    def deseriealize(cls, input: dict):
+        """
+        Return a ReportData object from deserialized json data
+        """
+        return cls(db_info=DBConfig(**input["db_info"]),
+                   parameters=Parameters(**input["parameters"]),
+                   data=Data(**input["data"]))
+
+
 def add_args(parser=None):
     if parser is None:
@@ -385,41 +434,42 @@ def run_report(config):
         allele_obj.extract_hit_data('protein').to_csv(os.path.join(outdir, "protein.hits.txt"), header=True, sep="\t", index=False)
 
-    profile = {
-        "db_info":seq_store_dict["db_info"],
-        'parameters':{
-            'mode':mode,
-            'min_match_ident':match_ident,
-            'min_match_cov':match_cov,
-            'max_ambiguous':max_ambig,
-            'max_internal_stops':max_int_stop
-        },
-        'data':{
-            'sample_name':sample_name,
-            'profile':{sample_name: allele_obj.profile},
-            'seq_data':seq_data
-        }
-    }
+    profile = ReportData(
+        db_info=DBConfig(**seq_store_dict["db_info"]),
+        parameters=Parameters(
+            mode=mode,
+            min_match_ident=match_ident,
+            min_match_cov=match_cov,
+            max_ambiguous=max_ambig,
+            max_internal_stops=max_int_stop
+        ),
+        data=Data(
+            sample_name=sample_name,
+            profile={sample_name: allele_obj.profile},
+            seq_data=seq_data
+        )
+    )
 
-    if len(profile['data']['seq_data']) > 0:
+    if len(profile.data.seq_data) > 0:
         # add locus information to seq_data
         look_up = {}
-        for locus_name in profile['data']['profile'][sample_name]:
-            h = profile['data']['profile'][sample_name][locus_name]
+        for locus_name in profile.data.profile[sample_name]:
+            h = profile.data.profile[sample_name][locus_name]
             if h not in look_up:
                 look_up[h] = []
             look_up[h].append(locus_name)
 
-        for seq_id in profile['data']['seq_data']:
-            h = profile['data']['seq_data'][seq_id]['dna_hash']
+        for seq_id in profile.data.seq_data:
+            h = profile.data.seq_data[seq_id]['dna_hash']
             if h in look_up:
-                profile['data']['seq_data'][seq_id]['locus_name'] = ",".join([str(x) for x in look_up[h]])
+                profile.data.seq_data[seq_id]['locus_name'] = ",".join([str(x) for x in look_up[h]])
             else:
-                profile['data']['seq_data'][seq_id]['locus_name'] = ''
+                profile.data.seq_data[seq_id]['locus_name'] = ''
 
     with open(os.path.join(outdir,"report.json"),"w") as out:
-        json.dump(profile,out,indent=4)
+        json.dump(profile,out,indent=4, default=lambda o: o.__dict__)
 
     run_data['result_file'] = os.path.join(outdir,"report.json")
     run_data['analysis_end_time'] = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
@@ -428,7 +478,6 @@ def run_report(config):
 
 def run(cmd_args=None):
-    #cmd_args = parse_args()
     if cmd_args is None:
         parser = add_args()
         cmd_args = parser.parse_args()
diff --git a/tests/test_merge.py b/tests/test_merge.py
new file mode 100644
index 0000000..593247b
--- /dev/null
+++ b/tests/test_merge.py
@@ -0,0 +1,23 @@
+import pytest
+from locidex import merge
+
+
+DUPLICATE_NAMES = [
+    "locidex/example/merge/merge_in/report.json",
+    "locidex/example/merge/merge_in/report2.json",
+]
+
+MERGE_SUCCESSFULLY = [
+    "locidex/example/merge/merge_in/report.json",
+    "locidex/example/merge/merge_in/report1.json",
+]
+
+def test_throws_duplicate_error():
+    """
+    """
+    with pytest.raises(SystemExit, match="Attempting to merge allele profiles with the same sample name: 1"):
+        merge.read_file_list(DUPLICATE_NAMES, perform_validation=True)
+
+def test_merge_profiles_no_error():
+    merge.read_file_list(MERGE_SUCCESSFULLY, perform_validation=True)
+
\ No newline at end of file

From 56b1d288369433b9716aada288d6e146ca0e5ea6 Mon Sep 17 00:00:00 2001
From: Matthew Wells
Date: Mon, 13 May 2024 16:06:41 -0500
Subject: [PATCH 3/4] updated merge tests

---
 .gitignore          |  3 ++-
 locidex/merge.py    |  5 +++--
 tests/test_merge.py | 17 ++++++++++++++++-
 3 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/.gitignore b/.gitignore
index ea3ec62..91b0132 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,4 +2,5 @@ __pycache__
 *.egg*
 .vscode
 .pytest_cache
-tmp
\ No newline at end of file
+tmp
+.coverage*
\ No newline at end of file
diff --git a/locidex/merge.py b/locidex/merge.py
index 5c79d3b..526de67 100644
--- a/locidex/merge.py
+++ b/locidex/merge.py
@@ -208,8 +208,9 @@ def run_merge(config):
     del(df)
     run_data['result_file'] = os.path.join(outdir,"profile.tsv")
 
-    #create alignment
-    # TODO Bring this back in when test data is provided
+    ######### create alignment ###############
+    # Bring this back in when test data is provided
+    ############################################
     #sample_names = list(df['sample_id'])
     #loci_lengths = {}
     #invalid_loci = set()
diff --git a/tests/test_merge.py b/tests/test_merge.py
index 593247b..27e3b16 100644
--- a/tests/test_merge.py
+++ b/tests/test_merge.py
@@ -1,5 +1,6 @@
 import pytest
 from locidex import merge
+import errno
 
 
 DUPLICATE_NAMES = [
@@ -20,4 +21,18 @@ def test_throws_duplicate_error():
 
 def test_merge_profiles_no_error():
     merge.read_file_list(MERGE_SUCCESSFULLY, perform_validation=True)
-
\ No newline at end of file
+
+
+def test_check_files_exist():
+    with pytest.raises(SystemExit, match=str(errno.ENOENT)):
+        fail = "dne.txt"
+        merge.check_files_exist([fail])
+
+    merge.check_files_exist(MERGE_SUCCESSFULLY)
+
+def test_extract_profiles():
+    records = merge.read_file_list(MERGE_SUCCESSFULLY, perform_validation=True)
+    extracted_profiles = merge.extract_profiles(records)
+    assert len(extracted_profiles) == 2
+    key1, key2 = extracted_profiles.keys()
+    assert extracted_profiles[key1] == extracted_profiles[key2]
\ No newline at end of file

From b1045064d06aa3e2e15f4d5916521101824cac8e Mon Sep 17 00:00:00 2001
From: Matthew Wells
Date: Mon, 13 May 2024 16:12:05 -0500
Subject: [PATCH 4/4] added more merge tests

---
 tests/test_merge.py      | 5 +++--
 tests/test_workflows.yml | 8 +++++++-
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/tests/test_merge.py b/tests/test_merge.py
index 27e3b16..7fe1066 100644
--- a/tests/test_merge.py
+++ b/tests/test_merge.py
@@ -34,5 +34,6 @@ def test_extract_profiles():
     records = merge.read_file_list(MERGE_SUCCESSFULLY, perform_validation=True)
     extracted_profiles = merge.extract_profiles(records)
     assert len(extracted_profiles) == 2
-    key1, key2 = extracted_profiles.keys()
-    assert extracted_profiles[key1] == extracted_profiles[key2]
\ No newline at end of file
+    value1, value2 = extracted_profiles.values()
+    assert value1 == value2
+    assert value1 == {'aroC': '9048803cd72dee3c868cd2dc5dc5650d', 'dnaN': '2772ad8b8e0f7b50f1396c31fbe53f2d', 'hemD': '620f99723c4e190abe096b11ca34b944', 'hisD': '38027ac1ac34817584a176c7e575e97e', 'purE': '9855cbf4009439498bf84cacefce4d8f', 'sucA': '9289fc07cc8e93cfe0716e6f613cefdb', 'thrA': '9e1aa76bb42279ed7ec8fc30f984b65d'}
\ No newline at end of file
diff --git a/tests/test_workflows.yml b/tests/test_workflows.yml
index 281e420..57bc82a 100644
--- a/tests/test_workflows.yml
+++ b/tests/test_workflows.yml
@@ -38,4 +38,10 @@
   command: locidex format -i locidex/example/format_db_mlst_in -o format_out
   files:
     - path: format_out/locidex.txt
-    - path: format_out/results.json
\ No newline at end of file
+    - path: format_out/results.json
+
+- name: Run Merge
+  command: locidex merge -i locidex/example/merge/merge_in/report.json locidex/example/merge/merge_in/report1.json -o merged_out/
+  files:
+    - path: merged_out/profile.tsv
+      md5sum: 4cb7a14f4b4e7d0ef4992d477d773275
\ No newline at end of file