Skip to content

Commit c9b0d94

Browse files
committed
Swapped the csv module for pandas when writing the MLST report CSV
1 parent 445a008 commit c9b0d94

File tree

1 file changed

+25
-21
lines changed

1 file changed

+25
-21
lines changed

locidex/merge.py

+25-21
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,10 @@ def read_file_list(file_list, perform_validation=False, key_sample_name=None):
105105
db_version = None
106106
db_name = None
107107
check_files_exist(file_list)
108-
error_reports = []
108+
if key_sample_name:
109+
modified_MLST_files = [["sample", "JSON_key", "error_message"]]
110+
else:
111+
modified_MLST_files = None
109112

110113
for f in file_list:
111114
if key_sample_name:
@@ -115,9 +118,9 @@ def read_file_list(file_list, perform_validation=False, key_sample_name=None):
115118
with _open(f) as fh:
116119
data = json.load(fh)
117120
if key_sample_name:
118-
data, compare_errmsg = compare_profiles(data,alt_profile, os.path.basename(f))
119-
if compare_errmsg:
120-
error_reports.append(compare_errmsg)
121+
data, mlst_report = compare_profiles(data,alt_profile, os.path.basename(f))
122+
if mlst_report:
123+
modified_MLST_files.append(mlst_report)
121124
sq_data, db_version, db_name = validate_input_file(data,
122125
db_version=db_version,
123126
db_name=db_name,
@@ -130,7 +133,7 @@ def read_file_list(file_list, perform_validation=False, key_sample_name=None):
130133
logger.critical("Duplicate sample name detected: {}".format(sq_data.data.sample_name))
131134
raise ValueError("Attempting to merge allele profiles with the same sample name: {}".format(sq_data.data.sample_name))
132135

133-
return records, error_reports
136+
return records, modified_MLST_files
134137

135138
def extract_profiles(records):
136139
profile = {}
@@ -200,31 +203,36 @@ def compare_profiles(mlst, sample_id, file_name):
200203
# Define a variable to store the match_status (True or False)
201204
match_status = sample_id in profile
202205
# Initialize the error message
203-
error_message = None
206+
MLST_message = None
204207

205208
if not keys:
206209
logger.critical(f"{file_name} is missing the 'profile' section or is completely empty!")
207210
raise ValueError(f"{file_name} is missing the 'profile' section or is completely empty!")
208-
sys.exit(1)
211+
209212
elif len(keys) > 1:
210213
# Check if sample_id matches any key
211214
if not match_status:
212-
error_message = f"No key in the MLST JSON file ({file_name}) matches the specified sample ID '{sample_id}'. The first key '{original_key}' has been forcefully changed to '{sample_id}' and all other keys have been removed."
215+
MLST_message = f"No key in the MLST JSON file ({file_name}) matches the specified sample ID '{sample_id}'. The first key '{original_key}' has been forcefully changed to '{sample_id}' and all other keys have been removed."
213216
# Retain only the specified sample ID
214217
mlst["data"]["profile"] = {sample_id: profile.pop(original_key)}
215218
else:
216-
error_message = f"MLST JSON file ({file_name}) contains multiple keys: {keys}. The MLST JSON file has been modified to retain only the '{sample_id}' entry"
219+
MLST_message = f"MLST JSON file ({file_name}) contains multiple keys: {keys}. The MLST JSON file has been modified to retain only the '{sample_id}' entry"
217220
# Retain only the specified sample_id in the profile
218221
mlst["data"]["profile"] = {sample_id: profile[sample_id]}
219222
elif not match_status:
220-
error_message = f"{sample_id} ID and JSON key in {file_name} DO NOT MATCH. The '{original_key}' key in {file_name} has been forcefully changed to '{sample_id}': User should manually check input files to ensure correctness."
223+
MLST_message = f"{sample_id} ID and JSON key in {file_name} DO NOT MATCH. The '{original_key}' key in {file_name} has been forcefully changed to '{sample_id}': User should manually check input files to ensure correctness."
221224
# Update the JSON file with the new sample ID
222225
mlst["data"]["profile"] = {sample_id: profile.pop(original_key)}
223226
mlst["data"]["sample_name"] = sample_id
224227

225-
error_report = [sample_id, keys, error_message]
228+
# Build a report row for this sample only if its profile was modified; otherwise the report is None
229+
if MLST_message:
230+
mlst_report = [sample_id, keys, MLST_message]
231+
else:
232+
mlst_report = None
233+
226234
# Return the (possibly modified) MLST data together with its report row
227-
return mlst, error_report
235+
return mlst, mlst_report
228236

229237
def run_merge(config):
230238
analysis_parameters = config
@@ -264,7 +272,7 @@ def run_merge(config):
264272

265273
#perform merge
266274
file_list = get_file_list(input_files)
267-
records, compare_error = read_file_list(file_list,perform_validation=validate_db, key_sample_name=sample_dict)
275+
records, modified_MLST_file_list = read_file_list(file_list,perform_validation=validate_db, key_sample_name=sample_dict)
268276

269277
#create profile
270278
df = pd.DataFrame.from_dict(extract_profiles(records), orient='index')
@@ -274,14 +282,10 @@ def run_merge(config):
274282
del(df)
275283
run_data['result_file'] = os.path.join(outdir,"profile.tsv")
276284

277-
#Write error messages for profile mismatch (compare_profiles())
278-
for error_message in compare_error:
279-
if error_message[2]:
280-
output_error_file = outdir + "/" + error_message[0] + "_error_report.csv"
281-
with open(output_error_file, "w", newline="") as f:
282-
writer = csv.writer(f)
283-
writer.writerow(["sample", "JSON_key", "error_message"])
284-
writer.writerow([error_message[0], error_message[1], error_message[2]])
285+
#Write report of all the MLST files with profile mismatch and how MLST profiles with mismatch were modified
286+
df = pd.DataFrame(modified_MLST_file_list)
287+
df.to_csv(f'{outdir}/MLST_error_report.csv', index=False, header=False)
288+
285289

286290
run_data['analysis_end_time'] = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
287291
with open(os.path.join(outdir,"run.json"),'w' ) as fh:

0 commit comments

Comments
 (0)