From 84992babeb623c7a1855404474c07fe7dde5a291 Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Tue, 17 May 2022 10:35:25 -0700 Subject: [PATCH] filter: Use print_err and AugurError This also simplifies the implementation of validate_arguments() to raise AugurErrors directly instead of returning a boolean to be translated to a proper error message by the caller. --- augur/filter.py | 75 ++++++------------- ...ilter-min-length-no-sequence-index-error.t | 2 +- .../cram/filter-mismatched-sequences-error.t | 6 +- .../filter/cram/filter-no-outputs-error.t | 2 +- .../filter-output-strains-no-sequence-error.t | 2 +- ...equences-no-probabilistic-sampling-error.t | 2 +- .../subsample-no-sequences-quantity-error.t | 2 +- tests/test_filter.py | 7 +- 8 files changed, 35 insertions(+), 63 deletions(-) diff --git a/augur/filter.py b/augur/filter.py index ea7618256..ed749a202 100644 --- a/augur/filter.py +++ b/augur/filter.py @@ -13,14 +13,13 @@ import pandas as pd import random import re -import sys from tempfile import NamedTemporaryFile from typing import Collection from .dates import numeric_date, numeric_date_type, SUPPORTED_DATE_HELP_TEXT, is_date_ambiguous, get_numerical_dates from .errors import AugurError from .index import index_sequences, index_vcf -from .io import open_file, read_metadata, read_sequences, write_sequences, is_vcf as filename_is_vcf, write_vcf +from .io import open_file, print_err, read_metadata, read_sequences, write_sequences, is_vcf as filename_is_vcf, write_vcf from .utils import read_strains comment_char = '#' @@ -48,8 +47,7 @@ def constant_factory(value): for elems in (line.strip().split('\t') if '\t' in line else line.strip().split() for line in pfile.readlines()) }) except Exception as e: - print(f"ERROR: missing or malformed priority scores file {fname}", file=sys.stderr) - raise e + raise AugurError(f"ERROR: missing or malformed priority scores file {fname}") # Define metadata filters. @@ -634,7 +632,7 @@ def construct_filters(args, sequence_index): is_vcf = filename_is_vcf(args.sequences) if is_vcf: #doesn't make sense for VCF, ignore. - print("WARNING: Cannot use min_length for VCF files. Ignoring...", file=sys.stderr) + print_err("WARNING: Cannot use min_length for VCF files. Ignoring...") else: exclude_by.append(( filter_by_sequence_length, @@ -925,8 +923,8 @@ def get_groups_for_subsampling(strains, metadata, group_by=None): if 'year' in group_by_set or 'month' in group_by_set: if 'date' not in metadata: # set year/month/day = unknown - print(f"WARNING: A 'date' column could not be found to group-by year or month.", file=sys.stderr) - print(f"Filtering by group may behave differently than expected!", file=sys.stderr) + print_err(f"WARNING: A 'date' column could not be found to group-by year or month.") + print_err(f"Filtering by group may behave differently than expected!") df_dates = pd.DataFrame({'year': 'unknown', 'month': 'unknown'}, index=metadata.index) metadata = pd.concat([metadata, df_dates], axis=1) else: @@ -966,8 +964,8 @@ def get_groups_for_subsampling(strains, metadata, group_by=None): unknown_groups = group_by_set - set(metadata.columns) if unknown_groups: - print(f"WARNING: Some of the specified group-by categories couldn't be found: {', '.join(unknown_groups)}", file=sys.stderr) - print("Filtering by group may behave differently than expected!", file=sys.stderr) + print_err(f"WARNING: Some of the specified group-by categories couldn't be found: {', '.join(unknown_groups)}") + print_err("Filtering by group may behave differently than expected!") for group in unknown_groups: metadata[group] = 'unknown' @@ -1164,41 +1162,25 @@ def register_arguments(parser): def validate_arguments(args): - """Validate arguments and return a boolean representing whether all validation - rules succeeded. + """Validate arguments. Parameters ---------- args : argparse.Namespace Parsed arguments from argparse - - Returns - ------- - bool : - Validation succeeded. - """ # Don't allow sequence output when no sequence input is provided. if args.output and not args.sequences: - print( - "ERROR: You need to provide sequences to output sequences.", - file=sys.stderr) - return False + raise AugurError("You need to provide sequences to output sequences.") # Confirm that at least one output was requested. if not any((args.output, args.output_metadata, args.output_strains)): - print( - "ERROR: You need to select at least one output.", - file=sys.stderr) - return False + raise AugurError("You need to select at least one output.") # Don't allow filtering on sequence-based information, if no sequences or # sequence index is provided. if not args.sequences and not args.sequence_index and any(getattr(args, arg) for arg in SEQUENCE_ONLY_FILTERS): - print( - "ERROR: You need to provide a sequence index or sequences to filter on sequence-specific information.", - file=sys.stderr) - return False + raise AugurError("You need to provide a sequence index or sequences to filter on sequence-specific information.") # Set flags if VCF is_vcf = filename_is_vcf(args.sequences) @@ -1207,20 +1189,12 @@ def validate_arguments(args): if is_vcf: from shutil import which if which("vcftools") is None: - print("ERROR: 'vcftools' is not installed! This is required for VCF data. " - "Please see the augur install instructions to install it.", - file=sys.stderr) - return False + raise AugurError("'vcftools' is not installed! This is required for VCF data. " + "Please see the augur install instructions to install it.") # If user requested grouping, confirm that other required inputs are provided, too. if args.group_by and not any((args.sequences_per_group, args.subsample_max_sequences)): - print( - "ERROR: You must specify a number of sequences per group or maximum sequences to subsample.", - file=sys.stderr - ) - return False - - return True + raise AugurError("You must specify a number of sequences per group or maximum sequences to subsample.") def run(args): @@ -1228,8 +1202,7 @@ def run(args): filter and subsample a set of sequences into an analysis set ''' # Validate arguments before attempting any I/O. - if not validate_arguments(args): - return 1 + validate_arguments(args) # Determine whether the sequence index exists or whether should be # generated. We need to generate an index if the input sequences are in a @@ -1252,10 +1225,9 @@ def run(args): with NamedTemporaryFile(delete=False) as sequence_index_file: sequence_index_path = sequence_index_file.name - print( + print_err( "Note: You did not provide a sequence index, so Augur will generate one.", - "You can generate your own index ahead of time with `augur index` and pass it with `augur filter --sequence-index`.", - file=sys.stderr + "You can generate your own index ahead of time with `augur index` and pass it with `augur filter --sequence-index`." ) if is_vcf: @@ -1494,8 +1466,7 @@ def run(args): args.probabilistic_sampling, ) except TooManyGroupsError as error: - print(f"ERROR: {error}", file=sys.stderr) - sys.exit(1) + raise AugurError(error) if (probabilistic_used): print(f"Sampling probabilistically at {sequences_per_group:0.4f} sequences per group, meaning it is possible to have more than the requested maximum of {args.subsample_max_sequences} sequences after filtering.") @@ -1620,10 +1591,9 @@ def run(args): # Warn the user if the expected strains from the sequence index are # not a superset of the observed strains. if sequence_strains is not None and observed_sequence_strains > sequence_strains: - print( + print_err( "WARNING: The sequence index is out of sync with the provided sequences.", - "Metadata and strain output may not match sequence output.", - file=sys.stderr + "Metadata and strain output may not match sequence output." ) # Update the set of available sequence strains. @@ -1682,8 +1652,7 @@ def run(args): print("\t%i of these were dropped because of subsampling criteria%s" % (num_excluded_subsamp, seed_txt)) if total_strains_passed == 0: - print("ERROR: All samples have been dropped! Check filter rules and metadata file format.", file=sys.stderr) - return 1 + raise AugurError("All samples have been dropped! Check filter rules and metadata file format.") print(f"{total_strains_passed} strains passed all filters") @@ -1729,7 +1698,7 @@ def calculate_sequences_per_group(target_max_value, counts_per_group, allow_prob ) except TooManyGroupsError as error: if allow_probabilistic: - print(f"WARNING: {error}", file=sys.stderr) + print_err(f"WARNING: {error}") sequences_per_group = _calculate_fractional_sequences_per_group( target_max_value, counts_per_group, diff --git a/tests/functional/filter/cram/filter-min-length-no-sequence-index-error.t b/tests/functional/filter/cram/filter-min-length-no-sequence-index-error.t index b2a680c40..d360c8a54 100644 --- a/tests/functional/filter/cram/filter-min-length-no-sequence-index-error.t +++ b/tests/functional/filter/cram/filter-min-length-no-sequence-index-error.t @@ -11,4 +11,4 @@ This should fail because the requested filters rely on sequence information. > --min-length 10000 \ > --output-strains "$TMP/filtered_strains.txt" > /dev/null ERROR: You need to provide a sequence index or sequences to filter on sequence-specific information. - [1] + [2] diff --git a/tests/functional/filter/cram/filter-mismatched-sequences-error.t b/tests/functional/filter/cram/filter-mismatched-sequences-error.t index 4e3ff7093..14da5e054 100644 --- a/tests/functional/filter/cram/filter-mismatched-sequences-error.t +++ b/tests/functional/filter/cram/filter-mismatched-sequences-error.t @@ -15,7 +15,7 @@ This should produce no results because the intersection of metadata and sequence > --output-strains "$TMP/filtered_strains.txt" > /dev/null Note: You did not provide a sequence index, so Augur will generate one. You can generate your own index ahead of time with `augur index` and pass it with `augur filter --sequence-index`. ERROR: All samples have been dropped! Check filter rules and metadata file format. - [1] + [2] $ wc -l "$TMP/filtered_strains.txt" \s*0 .* (re) $ rm -f "$TMP/filtered_strains.txt" @@ -30,7 +30,7 @@ Repeat with sequence and strain outputs. We should get the same results. > --output-sequences "$TMP/filtered.fasta" > /dev/null Note: You did not provide a sequence index, so Augur will generate one. You can generate your own index ahead of time with `augur index` and pass it with `augur filter --sequence-index`. ERROR: All samples have been dropped! Check filter rules and metadata file format. - [1] + [2] $ wc -l "$TMP/filtered_strains.txt" \s*0 .* (re) $ grep "^>" "$TMP/filtered.fasta" | wc -l @@ -47,7 +47,7 @@ Since we expect metadata to be filtered by presence of strains in input sequence > --output-strains "$TMP/filtered_strains.txt" > /dev/null Note: You did not provide a sequence index, so Augur will generate one. You can generate your own index ahead of time with `augur index` and pass it with `augur filter --sequence-index`. ERROR: All samples have been dropped! Check filter rules and metadata file format. - [1] + [2] $ wc -l "$TMP/filtered_strains.txt" \s*0 .* (re) $ rm -f "$TMP/filtered_strains.txt" diff --git a/tests/functional/filter/cram/filter-no-outputs-error.t b/tests/functional/filter/cram/filter-no-outputs-error.t index ce44af5a3..7eedec3e1 100644 --- a/tests/functional/filter/cram/filter-no-outputs-error.t +++ b/tests/functional/filter/cram/filter-no-outputs-error.t @@ -10,4 +10,4 @@ Try to filter without any outputs. > --metadata filter/data/metadata.tsv \ > --min-length 10000 > /dev/null ERROR: You need to select at least one output. - [1] + [2] diff --git a/tests/functional/filter/cram/filter-output-strains-no-sequence-error.t b/tests/functional/filter/cram/filter-output-strains-no-sequence-error.t index 6556da1b0..2648c18b4 100644 --- a/tests/functional/filter/cram/filter-output-strains-no-sequence-error.t +++ b/tests/functional/filter/cram/filter-output-strains-no-sequence-error.t @@ -12,4 +12,4 @@ This should fail. > --min-length 10000 \ > --output "$TMP/filtered.fasta" > /dev/null ERROR: You need to provide sequences to output sequences. - [1] + [2] diff --git a/tests/functional/filter/cram/subsample-max-sequences-no-probabilistic-sampling-error.t b/tests/functional/filter/cram/subsample-max-sequences-no-probabilistic-sampling-error.t index e52ed67fd..8e9332d7e 100644 --- a/tests/functional/filter/cram/subsample-max-sequences-no-probabilistic-sampling-error.t +++ b/tests/functional/filter/cram/subsample-max-sequences-no-probabilistic-sampling-error.t @@ -16,5 +16,5 @@ This should fail, as probabilistic sampling is explicitly disabled. > --no-probabilistic-sampling \ > --output "$TMP/filtered.fasta" ERROR: Asked to provide at most 5 sequences, but there are 8 groups. - [1] + [2] $ rm -f "$TMP/filtered.fasta" diff --git a/tests/functional/filter/cram/subsample-no-sequences-quantity-error.t b/tests/functional/filter/cram/subsample-no-sequences-quantity-error.t index 093d91794..910ebc036 100644 --- a/tests/functional/filter/cram/subsample-no-sequences-quantity-error.t +++ b/tests/functional/filter/cram/subsample-no-sequences-quantity-error.t @@ -11,4 +11,4 @@ This should fail with a helpful error message. > --group-by year month \ > --output-strains "$TMP/filtered_strains.txt" > /dev/null ERROR: You must specify a number of sequences per group or maximum sequences to subsample. - [1] + [2] diff --git a/tests/test_filter.py b/tests/test_filter.py index 9ad593c03..328c0c6b0 100644 --- a/tests/test_filter.py +++ b/tests/test_filter.py @@ -14,6 +14,7 @@ import augur.filter from augur.io import read_metadata +from augur.utils import AugurError @pytest.fixture def argparser(): @@ -75,9 +76,10 @@ def test_read_priority_scores_valid(self, mock_priorities_file_valid): assert priorities["strain42"] == -np.inf, "Default priority is negative infinity for unlisted sequences" def test_read_priority_scores_malformed(self, mock_priorities_file_malformed): - with pytest.raises(ValueError): + with pytest.raises(AugurError) as e_info: # builtins.open is stubbed, but we need a valid file to satisfy the existence check augur.filter.read_priority_scores("tests/builds/tb/data/lee_2015.vcf") + assert str(e_info.value) == "ERROR: missing or malformed priority scores file tests/builds/tb/data/lee_2015.vcf" def test_read_priority_scores_valid_with_spaces_and_tabs(self, mock_priorities_file_valid_with_spaces_and_tabs): # builtins.open is stubbed, but we need a valid file to satisfy the existence check @@ -88,8 +90,9 @@ def test_read_priority_scores_valid_with_spaces_and_tabs(self, mock_priorities_f assert priorities == {"strain 1": 5, "strain 2": 6, "strain 3": 8} def test_read_priority_scores_does_not_exist(self): - with pytest.raises(FileNotFoundError): + with pytest.raises(AugurError) as e_info: augur.filter.read_priority_scores("/does/not/exist.txt") + assert str(e_info.value) == "ERROR: missing or malformed priority scores file /does/not/exist.txt" def test_filter_on_query_good(self, tmpdir, sequences): """Basic filter_on_query test"""