From a8b726b903a31c66c0f8c90726bd3550bc535e9c Mon Sep 17 00:00:00 2001
From: Matthew Wells
Date: Mon, 13 May 2024 10:07:06 -0500
Subject: [PATCH 1/4] adding logging functionality

---
 locidex/merge.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/locidex/merge.py b/locidex/merge.py
index 1c7ccf4..cb41386 100644
--- a/locidex/merge.py
+++ b/locidex/merge.py
@@ -8,10 +8,13 @@ from functools import partial
 from mimetypes import guess_type
 from multiprocessing import Pool, cpu_count
+import logging
 
 import pandas as pd
 
 from locidex.classes.aligner import align, parse_align
 from locidex.version import __version__
 
+logger = logging.getLogger(__name__)
+logging.basicConfig(stream=sys.stderr, level=logging.INFO)
 
 def add_args(parser=None):
     if parser is None:
@@ -42,7 +45,7 @@ def get_file_list(input_files):
         file_list = input_files
     else:
         if not os.path.isfile(input_files[0]):
-            print(f'Error the supplied file {input_files[0]} does not exist')
+            logger.critical(f'Error the supplied file {input_files[0]} does not exist')
             sys.exit()
         encoding = guess_type(input_files[0])[1]
         _open = partial(gzip.open, mode='rt') if encoding == 'gzip' else open
@@ -50,7 +53,7 @@ def get_file_list(input_files):
         for line in f:
             line = line.rstrip()
             if not os.path.isfile(line):
-                print(f'Error the supplied file {line} does not exist')
+                logger.critical(f'Error the supplied file {line} does not exist')
                 sys.exit()
             file_list.append(line)
     return file_list
@@ -60,8 +63,8 @@ def read_file_list(file_list,perform_validation=False):
     db_info = {}
     for f in file_list:
         if not os.path.isfile(f):
-            print(f"Error cannot open input file {f}")
-            sys.exit
+            logger.critical(f"Error cannot open input file {f}")
+            sys.exit()
         encoding = guess_type(f)[1]
         _open = partial(gzip.open, mode='rt') if encoding == 'gzip' else open
         with _open(f) as fh:

From 2ac558721c120f122997c3137132be6e400de9fb Mon Sep 17 00:00:00 2001
From: Matthew Wells
Date: Mon, 13 May 2024 15:10:53 -0500
Subject: [PATCH 2/4] updated locidex merge and unified classes between report
 and merge

---
 locidex/example/merge/merge_in/report.json  |  36 +++
 locidex/example/merge/merge_in/report1.json |  36 +++
 locidex/example/merge/merge_in/report2.json |  36 +++
 locidex/example/merge/merge_out/profile.tsv |   3 +
 locidex/example/merge/merge_out/run.json    |  24 ++
 locidex/merge.py                            | 250 ++++++++++++--------
 locidex/report.py                           | 101 ++++++--
 tests/test_merge.py                         |  23 ++
 8 files changed, 380 insertions(+), 129 deletions(-)
 create mode 100644 locidex/example/merge/merge_in/report.json
 create mode 100644 locidex/example/merge/merge_in/report1.json
 create mode 100644 locidex/example/merge/merge_in/report2.json
 create mode 100644 locidex/example/merge/merge_out/profile.tsv
 create mode 100644 locidex/example/merge/merge_out/run.json
 create mode 100644 tests/test_merge.py

diff --git a/locidex/example/merge/merge_in/report.json b/locidex/example/merge/merge_in/report.json
new file mode 100644
index 0000000..8030e0e
--- /dev/null
+++ b/locidex/example/merge/merge_in/report.json
@@ -0,0 +1,36 @@
+{
+    "db_info": {
+        "db_name": "Locidex Database",
+        "db_version": "1.0.0",
+        "db_date": "2024/30/04",
+        "db_author": "mw",
+        "db_desc": "test",
+        "db_num_seqs": 53,
+        "is_nucl": true,
+        "is_prot": true,
+        "nucleotide_db_name": "nucleotide",
+        "protein_db_name": "protein"
+    },
+    "parameters": {
+        "mode": "normal",
+        "min_match_ident": 100,
+        "min_match_cov": 100,
+        "max_ambiguous": 0,
+        "max_internal_stops": 0
+    },
+    "data": {
+        "sample_name": "1",
+        "profile": {
+            "1": {
+                "aroC": "9048803cd72dee3c868cd2dc5dc5650d",
"9048803cd72dee3c868cd2dc5dc5650d", + "dnaN": "2772ad8b8e0f7b50f1396c31fbe53f2d", + "hemD": "620f99723c4e190abe096b11ca34b944", + "hisD": "38027ac1ac34817584a176c7e575e97e", + "purE": "9855cbf4009439498bf84cacefce4d8f", + "sucA": "9289fc07cc8e93cfe0716e6f613cefdb", + "thrA": "9e1aa76bb42279ed7ec8fc30f984b65d" + } + }, + "seq_data": {} + } +} \ No newline at end of file diff --git a/locidex/example/merge/merge_in/report1.json b/locidex/example/merge/merge_in/report1.json new file mode 100644 index 0000000..f71297a --- /dev/null +++ b/locidex/example/merge/merge_in/report1.json @@ -0,0 +1,36 @@ +{ + "db_info": { + "db_name": "Locidex Database", + "db_version": "1.0.0", + "db_date": "2024/30/04", + "db_author": "mw", + "db_desc": "test", + "db_num_seqs": 53, + "is_nucl": true, + "is_prot": true, + "nucleotide_db_name": "nucleotide", + "protein_db_name": "protein" + }, + "parameters": { + "mode": "normal", + "min_match_ident": 100, + "min_match_cov": 100, + "max_ambiguous": 0, + "max_internal_stops": 0 + }, + "data": { + "sample_name": "2", + "profile": { + "2": { + "aroC": "9048803cd72dee3c868cd2dc5dc5650d", + "dnaN": "2772ad8b8e0f7b50f1396c31fbe53f2d", + "hemD": "620f99723c4e190abe096b11ca34b944", + "hisD": "38027ac1ac34817584a176c7e575e97e", + "purE": "9855cbf4009439498bf84cacefce4d8f", + "sucA": "9289fc07cc8e93cfe0716e6f613cefdb", + "thrA": "9e1aa76bb42279ed7ec8fc30f984b65d" + } + }, + "seq_data": {} + } +} \ No newline at end of file diff --git a/locidex/example/merge/merge_in/report2.json b/locidex/example/merge/merge_in/report2.json new file mode 100644 index 0000000..8030e0e --- /dev/null +++ b/locidex/example/merge/merge_in/report2.json @@ -0,0 +1,36 @@ +{ + "db_info": { + "db_name": "Locidex Database", + "db_version": "1.0.0", + "db_date": "2024/30/04", + "db_author": "mw", + "db_desc": "test", + "db_num_seqs": 53, + "is_nucl": true, + "is_prot": true, + "nucleotide_db_name": "nucleotide", + "protein_db_name": "protein" + }, + "parameters": { + "mode": "normal", + "min_match_ident": 100, + "min_match_cov": 100, + "max_ambiguous": 0, + "max_internal_stops": 0 + }, + "data": { + "sample_name": "1", + "profile": { + "1": { + "aroC": "9048803cd72dee3c868cd2dc5dc5650d", + "dnaN": "2772ad8b8e0f7b50f1396c31fbe53f2d", + "hemD": "620f99723c4e190abe096b11ca34b944", + "hisD": "38027ac1ac34817584a176c7e575e97e", + "purE": "9855cbf4009439498bf84cacefce4d8f", + "sucA": "9289fc07cc8e93cfe0716e6f613cefdb", + "thrA": "9e1aa76bb42279ed7ec8fc30f984b65d" + } + }, + "seq_data": {} + } +} \ No newline at end of file diff --git a/locidex/example/merge/merge_out/profile.tsv b/locidex/example/merge/merge_out/profile.tsv new file mode 100644 index 0000000..6044081 --- /dev/null +++ b/locidex/example/merge/merge_out/profile.tsv @@ -0,0 +1,3 @@ +sample_id aroC dnaN hemD hisD purE sucA thrA +1 9048803cd72dee3c868cd2dc5dc5650d 2772ad8b8e0f7b50f1396c31fbe53f2d 620f99723c4e190abe096b11ca34b944 38027ac1ac34817584a176c7e575e97e 9855cbf4009439498bf84cacefce4d8f 9289fc07cc8e93cfe0716e6f613cefdb 9e1aa76bb42279ed7ec8fc30f984b65d +2 9048803cd72dee3c868cd2dc5dc5650d 2772ad8b8e0f7b50f1396c31fbe53f2d 620f99723c4e190abe096b11ca34b944 38027ac1ac34817584a176c7e575e97e 9855cbf4009439498bf84cacefce4d8f 9289fc07cc8e93cfe0716e6f613cefdb 9e1aa76bb42279ed7ec8fc30f984b65d diff --git a/locidex/example/merge/merge_out/run.json b/locidex/example/merge/merge_out/run.json new file mode 100644 index 0000000..e4f8448 --- /dev/null +++ b/locidex/example/merge/merge_out/run.json @@ -0,0 +1,24 @@ +{ + "analysis_start_time": "13/05/2024 
14:53:14", + "parameters": { + "command": "merge", + "input": [ + [ + "./locidex/example/merge/merge_in/report.json", + "./locidex/example/merge/merge_in/report1.json" + ] + ], + "outdir": "./locidex/example/merge/merge_out/", + "n_threads": 1, + "linker": "NNNNNNNNNNNNNNNNNNNN", + "strict": false, + "align": false, + "force": true + }, + "result_file": "./locidex/example/merge/merge_out/profile.tsv", + "count_valid_loci": 0, + "count_invalid_loci": 0, + "valid_loci": "", + "invalid_loci": "", + "analysis_end_time": "13/05/2024 14:53:15" +} \ No newline at end of file diff --git a/locidex/merge.py b/locidex/merge.py index cb41386..5c79d3b 100644 --- a/locidex/merge.py +++ b/locidex/merge.py @@ -3,6 +3,7 @@ import os import re import sys +import errno from argparse import (ArgumentParser, ArgumentDefaultsHelpFormatter, RawDescriptionHelpFormatter) from datetime import datetime from functools import partial @@ -11,26 +12,31 @@ import logging import pandas as pd from locidex.classes.aligner import align, parse_align +from locidex.constants import DBConfig +from locidex.report import ReportData, Data, Parameters from locidex.version import __version__ logger = logging.getLogger(__name__) logging.basicConfig(filemode=sys.stderr, level=logging.INFO) def add_args(parser=None): + """ + TODO disabling alignment until test data is prepared + """ if parser is None: parser = ArgumentParser( description="Locidex merge: Concatonate set of input profile.json files into a tsv table or aligned fasta") parser.add_argument('-i','--input', type=str, required=True,help='Input file to report', action='append', nargs='+') parser.add_argument('-o', '--outdir', type=str, required=True, help='Output file to put results') - parser.add_argument('--n_threads','-t', type=int, required=False, - help='CPU Threads to use', default=1) - parser.add_argument('--linker','-l', type=str, required=False, - help='Linker sequence for alignment', default='NNNNNNNNNNNNNNNNNNNN') + #parser.add_argument('--n_threads','-t', type=int, required=False, + # help='CPU Threads to use', default=1) + #parser.add_argument('--linker','-l', type=str, required=False, + # help='Linker sequence for alignment', default='NNNNNNNNNNNNNNNNNNNN') parser.add_argument('-V', '--version', action='version', version="%(prog)s " + __version__) parser.add_argument('-s', '--strict', required=False, help='Only merge data produces by the same db', action='store_true') - parser.add_argument('-a', '--align', required=False, help='Perform alignment with individual loci to produce a concatenated alignment', - action='store_true') + #parser.add_argument('-a', '--align', required=False, help='Perform alignment with individual loci to produce a concatenated alignment', + # action='store_true') parser.add_argument('-f', '--force', required=False, help='Overwrite existing directory', action='store_true') return parser @@ -46,7 +52,7 @@ def get_file_list(input_files): else: if not os.path.isfile(input_files[0]): logger.critical(f'Error the supplied file {input_files[0]} does not exist') - sys.exit() + sys.exit(errno.ENOENT) encoding = guess_type(input_files[0])[1] _open = partial(gzip.open, mode='rt') if encoding == 'gzip' else open with _open(input_files[0]) as f: @@ -54,47 +60,80 @@ def get_file_list(input_files): line = line.rstrip() if not os.path.isfile(line): logger.critical(f'Error the supplied file {line} does not exist') - sys.exit() + sys.exit(errno.ENOENT) file_list.append(line) return file_list +def validate_input_file(data_in: dict, db_version: str, db_name: 
+    """
+    Validate input data for usage, verifying that the db_version and db_name are the same across files
+    """
+
+    try:
+        sq_data = ReportData.deseriealize(data_in)
+    except KeyError:
+        logger.critical("Missing required fields in reported allele file. Fields required: {}".format(ReportData.fields()))
+        sys.exit()
+    else:
+
+        if db_version is not None and sq_data.db_info.db_version != db_version and perform_validation:
+            logger.critical("You are attempting to merge files that were created using different database versions.")
+            sys.exit()
+
+        if db_name is not None and sq_data.db_info.db_name != db_name and perform_validation:
+            logger.critical("You are attempting to merge files that have different names.")
+            sys.exit()
+
+    return sq_data, sq_data.db_info.db_version, sq_data.db_info.db_name
+
+def check_files_exist(file_list: list[os.PathLike]) -> None:
+    """
+    Verify that all files to be analyzed exist
+    """
+    for file in file_list:
+        if not os.path.isfile(file):
+            logger.critical(f"Error cannot open input file {file}")
+            sys.exit(errno.ENOENT)
+
+
 def read_file_list(file_list,perform_validation=False):
     records = {}
-    db_info = {}
+    db_version = None
+    db_name = None
+
+    check_files_exist(file_list)
+
     for f in file_list:
-        if not os.path.isfile(f):
-            logger.critical(f"Error cannot open input file {f}")
-            sys.exit()
         encoding = guess_type(f)[1]
         _open = partial(gzip.open, mode='rt') if encoding == 'gzip' else open
         with _open(f) as fh:
             data = json.load(fh)
-            if 'db_info' not in data:
-                print("Error malformed or invalid input file {}, missing 'db_info'".format(f))
-                sys.exit()
-            if len(db_info) == 0:
-                db_info = data['db_info']
-            if db_info["db_name"] != data['db_info']["db_name"] or \
-                db_info["db_version"] != data['db_info']["db_version"]:
-                print("Error you are attempting to merge files generated with different databases {} vs. {}: {}".format(db_info,f,data['db_info']))
{}: {}".format(db_info,f,data['db_info'])) - sys.exit() - records[data['data']['sample_name']] = data + sq_data, db_version, db_name = validate_input_file(data, + db_version=db_version, + db_name=db_name, + perform_validation=perform_validation) + + sample_name = sq_data.data.sample_name + if records.get(sq_data.data.sample_name) is None: + records[sample_name] = sq_data + else: + logger.critical("Duplicate sample name detected: {}".format(sq_data.data.sample_name)) + sys.exit("Attempting to merge allele profiles with the same sample name: {}".format(sq_data.data.sample_name)) return records def extract_profiles(records): profile = {} for id in records: - for sample_name in records[id]['data']['profile']: - profile[sample_name] = records[id]['data']['profile'][sample_name] + for sample_name in records[id].data.profile: + if profile.get(sample_name) is not None: + logger.critical("Sample {} already exists and will not be added.") + profile[sample_name] = records[id].data.profile[sample_name] return profile def extract_seqs(records): seqs = {} for id in records: - if not 'seq_data' in records[id]['data']: - continue - - seqs[id] = records[id]['data']['seq_data'] + seqs[id] = records[id].data.seq_data return seqs def write_gene_fastas(seq_data,work_dir): @@ -134,9 +173,12 @@ def run_merge(config): #Input Parameters input_files = config['input'][0] outdir = config['outdir'] - perform_align = config['align'] - linker_seq = config['linker'] - n_threads = config['n_threads'] + ### + # Commented out as these changes will require test data + # perform_align = config['align'] + # linker_seq = config['linker'] + # n_threads = config['n_threads'] + ### force = config['force'] validate_db = config['strict'] if validate_db is None or validate_db == '': @@ -148,8 +190,8 @@ def run_merge(config): run_data['parameters'] = analysis_parameters if os.path.isdir(outdir) and not force: - print(f'Error {outdir} exists, if you would like to overwrite, then specify --force') - sys.exit() + logger.critical(f'Error {outdir} exists, if you would like to overwrite, then specify --force') + sys.exit(errno.EEXIST) if not os.path.isdir(outdir): os.makedirs(outdir, 0o755) @@ -160,80 +202,82 @@ def run_merge(config): #create profile df = pd.DataFrame.from_dict(extract_profiles(records), orient='index') - df.insert(loc=0, - column='sample_id', - value=df.index.tolist()) + df.insert(loc=0, column='sample_id', value=df.index.tolist()) df.to_csv(os.path.join(outdir,'profile.tsv'),index=False,header=True,sep="\t") - sample_names = list(df['sample_id']) + del(df) run_data['result_file'] = os.path.join(outdir,"profile.tsv") - + #create alignment - if perform_align and len(records) > 1: - pass - work_dir = os.path.join(outdir,"raw_gene_fastas") - if not os.path.isdir(work_dir): - os.makedirs(work_dir, 0o755) - - seq_data = extract_seqs(records) - gene_files = write_gene_fastas(seq_data,work_dir) - del(records) - del(seq_data) - pool = Pool(processes=n_threads) - - results = [] - for locus_name in gene_files: - results.append(pool.apply_async(align, args=((gene_files[locus_name]['file'],)))) - - pool.close() - pool.join() - - r = [] - for x in results: - if isinstance(x, dict): - r.append(x) - else: - r.append(x.get()) - results = r - loci_names = list(gene_files.keys()) - alignment = {} - - - for i in range(0,len(results)): - alignment[loci_names[i]] = parse_align(results[i][0]) - results[i] = None - del(results) - - loci_lengths = {} - for sample_id in sample_names: - for locus_name in loci_names: - if sample_id not in 
-                    continue
-                loci_lengths[locus_name] = len(alignment[locus_name][sample_id])
-
-
-        out_align = os.path.join(outdir,'loci_alignment.fas')
-        oh = open(out_align,'w')
-        invalid_loci = set()
-        for sample_id in sample_names:
-            seq = []
-            for locus_name in loci_names:
-                if locus_name not in loci_lengths:
-                    invalid_loci.add(locus_name)
-                    continue
-                if sample_id in alignment[locus_name]:
-                    seq.append(alignment[locus_name][sample_id])
-                else:
-                    seq.append(''.join(['-']*loci_lengths[locus_name]))
-                seq.append(linker_seq)
-            oh.write('>{}\n{}\n'.format(sample_id,"".join(seq)))
-        oh.close()
-        run_data['alignment_file'] = out_align
-
-    run_data['count_valid_loci'] = len(loci_lengths.keys())
-    run_data['count_invalid_loci'] = len(list(invalid_loci))
-    run_data['valid_loci'] = ",".join(list(loci_lengths.keys()))
-    run_data['invalid_loci'] = ",".join(list(invalid_loci))
+    # TODO Bring this back in when test data is provided
+    #sample_names = list(df['sample_id'])
+    #loci_lengths = {}
+    #invalid_loci = set()
+    #if perform_align and len(records) > 1:
+    #    pass
+    #    work_dir = os.path.join(outdir,"raw_gene_fastas")
+    #    if not os.path.isdir(work_dir):
+    #        os.makedirs(work_dir, 0o755)
+    #
+    #    seq_data = extract_seqs(records)
+    #    gene_files = write_gene_fastas(seq_data,work_dir)
+    #    del(records)
+    #    del(seq_data)
+    #    pool = Pool(processes=n_threads)
+
+    #    results = []
+    #    for locus_name in gene_files:
+    #        results.append(pool.apply_async(align, args=((gene_files[locus_name]['file'],))))
+
+    #    pool.close()
+    #    pool.join()
+
+    #    r = []
+    #    for x in results:
+    #        if isinstance(x, dict):
+    #            r.append(x)
+    #        else:
+    #            r.append(x.get())
+    #    results = r
+    #    loci_names = list(gene_files.keys())
+    #    alignment = {}
+    #
+
+    #    for i in range(0,len(results)):
+    #        alignment[loci_names[i]] = parse_align(results[i][0])
+    #        results[i] = None
+    #    del(results)
+
+    #
+    #    for sample_id in sample_names:
+    #        for locus_name in loci_names:
+    #            if sample_id not in alignment[locus_name]:
+    #                continue
+    #            loci_lengths[locus_name] = len(alignment[locus_name][sample_id])
+
+    #    out_align = os.path.join(outdir,'loci_alignment.fas')
+    #    oh = open(out_align,'w')
+    #
+    #    for sample_id in sample_names:
+    #        seq = []
+    #        for locus_name in loci_names:
+    #            if locus_name not in loci_lengths:
+    #                invalid_loci.add(locus_name)
+    #                continue
+    #            if sample_id in alignment[locus_name]:
+    #                seq.append(alignment[locus_name][sample_id])
+    #            else:
+    #                seq.append(''.join(['-']*loci_lengths[locus_name]))
+    #            seq.append(linker_seq)
+    #        oh.write('>{}\n{}\n'.format(sample_id,"".join(seq)))
+    #    oh.close()
+    #    run_data['alignment_file'] = out_align
+
+    #run_data['count_valid_loci'] = len(loci_lengths.keys())
+    #run_data['count_invalid_loci'] = len(list(invalid_loci))
+    #run_data['valid_loci'] = ",".join(list(loci_lengths.keys()))
+    #run_data['invalid_loci'] = ",".join(list(invalid_loci))
 
     run_data['analysis_end_time'] = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
     with open(os.path.join(outdir,"run.json"),'w' ) as fh:
         fh.write(json.dumps(run_data, indent=4))
diff --git a/locidex/report.py b/locidex/report.py
index ac4469b..5be3f32 100644
--- a/locidex/report.py
+++ b/locidex/report.py
@@ -4,14 +4,63 @@ from argparse import (ArgumentParser, ArgumentDefaultsHelpFormatter, RawDescriptionHelpFormatter)
 from copy import deepcopy
 from datetime import datetime
-
+from dataclasses import dataclass, asdict, fields
 import pandas as pd
+from typing import Any
 
 from locidex.classes.seq_intake import seq_intake
-from locidex.constants import SEARCH_RUN_DATA, START_CODONS, STOP_CODONS
+from locidex.constants import SEARCH_RUN_DATA, START_CODONS, STOP_CODONS, DBConfig
 from locidex.utils import calc_md5
 from locidex.version import __version__
 
+
+@dataclass
+class Parameters:
+    mode: str
+    min_match_ident: str
+    min_match_cov: str
+    max_ambiguous: str
+    max_internal_stops: str
+
+@dataclass
+class Data:
+    sample_name: str
+    profile: dict
+    seq_data: dict
+
+    def __getitem__(self, name: str) -> Any:
+        return getattr(self, str(name))
+
+    def __setitem__(self, key: str, value: str) -> None:
+        setattr(self, key, value)
+
+@dataclass
+class ReportData:
+    db_info: DBConfig
+    parameters: Parameters
+    data: Data
+
+    def __getitem__(self, name: str) -> Any:
+        return getattr(self, str(name))
+
+    def __setitem__(self, key: str, value: str) -> None:
+        setattr(self, key, value)
+
+    @classmethod
+    def fields(cls):
+        return fields(cls)
+
+    @classmethod
+    def deseriealize(cls, input: dict):
+        """
+        Return a ReportData object from deserialized json data
+        """
+        return cls(db_info=DBConfig(**input["db_info"]),
+                   parameters=Parameters(**input["parameters"]),
+                   data=Data(**input["data"]))
+
+
 def add_args(parser=None):
     if parser is None:
@@ -385,41 +434,42 @@ def run_report(config):
         allele_obj.extract_hit_data('protein').to_csv(os.path.join(outdir, "protein.hits.txt"), header=True, sep="\t", index=False)
 
-    profile = {
-        "db_info":seq_store_dict["db_info"],
-        'parameters':{
-            'mode':mode,
-            'min_match_ident':match_ident,
-            'min_match_cov':match_cov,
-            'max_ambiguous':max_ambig,
-            'max_internal_stops':max_int_stop
-        },
-        'data':{
-            'sample_name':sample_name,
-            'profile':{sample_name: allele_obj.profile},
-            'seq_data':seq_data
-        }
-    }
+    profile = ReportData(
+        db_info=DBConfig(**seq_store_dict["db_info"]),
+        parameters=Parameters(
+            mode=mode,
+            min_match_ident=match_ident,
+            min_match_cov=match_cov,
+            max_ambiguous=max_ambig,
+            max_internal_stops=max_int_stop
+        ),
+        data=Data(
+            sample_name=sample_name,
+            profile={sample_name: allele_obj.profile},
+            seq_data=seq_data
+        )
+    )
 
-    if len(profile['data']['seq_data']) > 0:
+    if len(profile.data.seq_data) > 0:
         # add locus information to seq_data
         look_up = {}
-        for locus_name in profile['data']['profile'][sample_name]:
-            h = profile['data']['profile'][sample_name][locus_name]
+        for locus_name in profile.data.profile[sample_name]:
+            h = profile.data.profile[sample_name][locus_name]
             if h not in look_up:
                 look_up[h] = []
             look_up[h].append(locus_name)
 
-        for seq_id in profile['data']['seq_data']:
-            h = profile['data']['seq_data'][seq_id]['dna_hash']
+        for seq_id in profile.data.seq_data:
+            h = profile.data.seq_data[seq_id]['dna_hash']
             if h in look_up:
-                profile['data']['seq_data'][seq_id]['locus_name'] = ",".join([str(x) for x in look_up[h]])
+                profile.data.seq_data[seq_id]['locus_name'] = ",".join([str(x) for x in look_up[h]])
             else:
-                profile['data']['seq_data'][seq_id]['locus_name'] = ''
+                profile.data.seq_data[seq_id]['locus_name'] = ''
 
     with open(os.path.join(outdir,"report.json"),"w") as out:
-        json.dump(profile,out,indent=4)
+        json.dump(profile,out,indent=4, default=lambda o: o.__dict__)
 
     run_data['result_file'] = os.path.join(outdir,"report.json")
     run_data['analysis_end_time'] = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
@@ -428,7 +478,6 @@ def run_report(config):
 
 def run(cmd_args=None):
-    #cmd_args = parse_args()
     if cmd_args is None:
         parser = add_args()
         cmd_args = parser.parse_args()
diff --git a/tests/test_merge.py b/tests/test_merge.py
new file mode 100644
index 0000000..593247b
--- /dev/null
+++ b/tests/test_merge.py
@@ -0,0 +1,23 @@
+import pytest
+from locidex import merge
+
+
+DUPLICATE_NAMES = [
+    "locidex/example/merge/merge_in/report.json",
+    "locidex/example/merge/merge_in/report2.json",
+]
+
+MERGE_SUCCESSFULLY = [
+    "locidex/example/merge/merge_in/report.json",
+    "locidex/example/merge/merge_in/report1.json",
+]
+
+def test_throws_duplicate_error():
+    """
+    """
+    with pytest.raises(SystemExit, match="Attempting to merge allele profiles with the same sample name: 1"):
+        merge.read_file_list(DUPLICATE_NAMES, perform_validation=True)
+
+def test_merge_profiles_no_error():
+    merge.read_file_list(MERGE_SUCCESSFULLY, perform_validation=True)
+
\ No newline at end of file

From 56b1d288369433b9716aada288d6e146ca0e5ea6 Mon Sep 17 00:00:00 2001
From: Matthew Wells
Date: Mon, 13 May 2024 16:06:41 -0500
Subject: [PATCH 3/4] updated merge tests

---
 .gitignore          |  3 ++-
 locidex/merge.py    |  5 +++--
 tests/test_merge.py | 17 ++++++++++++++++-
 3 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/.gitignore b/.gitignore
index ea3ec62..91b0132 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,4 +2,5 @@ __pycache__
 *.egg*
 .vscode
 .pytest_cache
-tmp
\ No newline at end of file
+tmp
+.coverage*
\ No newline at end of file
diff --git a/locidex/merge.py b/locidex/merge.py
index 5c79d3b..526de67 100644
--- a/locidex/merge.py
+++ b/locidex/merge.py
@@ -208,8 +208,9 @@ def run_merge(config):
     del(df)
     run_data['result_file'] = os.path.join(outdir,"profile.tsv")
 
-    #create alignment
-    # TODO Bring this back in when test data is provided
+    ######### create alignment ###############
+    # Bring this back in when test data is provided
+    ############################################
     #sample_names = list(df['sample_id'])
     #loci_lengths = {}
     #invalid_loci = set()
diff --git a/tests/test_merge.py b/tests/test_merge.py
index 593247b..27e3b16 100644
--- a/tests/test_merge.py
+++ b/tests/test_merge.py
@@ -1,5 +1,6 @@
 import pytest
 from locidex import merge
+import errno
 
 
 DUPLICATE_NAMES = [
@@ -20,4 +21,18 @@ def test_throws_duplicate_error():
 
 def test_merge_profiles_no_error():
     merge.read_file_list(MERGE_SUCCESSFULLY, perform_validation=True)
-
\ No newline at end of file
+
+
+def test_check_files_exist():
+    with pytest.raises(SystemExit, match=str(errno.ENOENT)):
+        fail = "dne.txt"
+        merge.check_files_exist([fail])
+
+    merge.check_files_exist(MERGE_SUCCESSFULLY)
+
+def test_extract_profiles():
+    records = merge.read_file_list(MERGE_SUCCESSFULLY, perform_validation=True)
+    extracted_profiles = merge.extract_profiles(records)
+    assert len(extracted_profiles) == 2
+    key1, key2 = extracted_profiles.keys()
+    assert extracted_profiles[key1] == extracted_profiles[key2]
\ No newline at end of file

From b1045064d06aa3e2e15f4d5916521101824cac8e Mon Sep 17 00:00:00 2001
From: Matthew Wells
Date: Mon, 13 May 2024 16:12:05 -0500
Subject: [PATCH 4/4] added more merge tests

---
 tests/test_merge.py      | 5 +++--
 tests/test_workflows.yml | 8 +++++++-
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/tests/test_merge.py b/tests/test_merge.py
index 27e3b16..7fe1066 100644
--- a/tests/test_merge.py
+++ b/tests/test_merge.py
@@ -34,5 +34,6 @@ def test_extract_profiles():
     records = merge.read_file_list(MERGE_SUCCESSFULLY, perform_validation=True)
     extracted_profiles = merge.extract_profiles(records)
     assert len(extracted_profiles) == 2
-    key1, key2 = extracted_profiles.keys()
-    assert extracted_profiles[key1] == extracted_profiles[key2]
\ No newline at end of file
+    value1, value2 = extracted_profiles.values()
+    assert value1 == value2
+    assert value1 == {'aroC': '9048803cd72dee3c868cd2dc5dc5650d', 'dnaN': '2772ad8b8e0f7b50f1396c31fbe53f2d', 'hemD': '620f99723c4e190abe096b11ca34b944', 'hisD': '38027ac1ac34817584a176c7e575e97e', 'purE': '9855cbf4009439498bf84cacefce4d8f', 'sucA': '9289fc07cc8e93cfe0716e6f613cefdb', 'thrA': '9e1aa76bb42279ed7ec8fc30f984b65d'}
\ No newline at end of file
diff --git a/tests/test_workflows.yml b/tests/test_workflows.yml
index 281e420..57bc82a 100644
--- a/tests/test_workflows.yml
+++ b/tests/test_workflows.yml
@@ -38,4 +38,10 @@
   command: locidex format -i locidex/example/format_db_mlst_in -o format_out
   files:
     - path: format_out/locidex.txt
-    - path: format_out/results.json
\ No newline at end of file
+    - path: format_out/results.json
+
+- name: Run Merge
+  command: locidex merge -i locidex/example/merge/merge_in/report.json locidex/example/merge/merge_in/report1.json -o merged_out/
+  files:
+    - path: merged_out/profile.tsv
+      md5sum: 4cb7a14f4b4e7d0ef4992d477d773275
\ No newline at end of file