From eb36c696a68c17ea7db8d2adf0924c6233dbd47f Mon Sep 17 00:00:00 2001
From: Victor Lin <13424970+victorlin@users.noreply.github.com>
Date: Tue, 17 May 2022 10:05:27 -0700
Subject: [PATCH 1/2] Report min and max date separately

This allows for more detail in the report, and makes it more consistent
by having one report line per parameter.
---
 augur/filter.py | 34 +++++++++++++++++++++++++++++-----
 1 file changed, 29 insertions(+), 5 deletions(-)

diff --git a/augur/filter.py b/augur/filter.py
index cd3b0a243..2633aab4f 100644
--- a/augur/filter.py
+++ b/augur/filter.py
@@ -342,6 +342,22 @@ def filter_by_date(metadata, date_column="date", min_date=None, max_date=None):
     return filtered
 
 
+def filter_by_min_date(metadata, min_date, **kwargs):
+    """Filter metadata by minimum date.
+
+    Alias to filter_by_date using min_date only.
+    """
+    return filter_by_date(metadata, min_date=min_date, **kwargs)
+
+
+def filter_by_max_date(metadata, max_date, **kwargs):
+    """Filter metadata by maximum date.
+
+    Alias to filter_by_date using max_date only.
+    """
+    return filter_by_date(metadata, max_date=max_date, **kwargs)
+
+
 def filter_by_sequence_index(metadata, sequence_index):
     """Filter metadata by presence of corresponding entries in a given sequence index.
     This filter effectively intersects the strain ids in the metadata and
@@ -616,14 +632,21 @@ def construct_filters(args, sequence_index):
             }
         ))
 
-    # Filter by date.
-    if args.min_date or args.max_date:
+    # Filter by min/max date.
+    if args.min_date:
         exclude_by.append((
-            filter_by_date,
+            filter_by_min_date,
             {
-                "date_column": "date",
                 "min_date": args.min_date,
+                "date_column": "date",
+            }
+        ))
+    if args.max_date:
+        exclude_by.append((
+            filter_by_max_date,
+            {
                 "max_date": args.max_date,
+                "date_column": "date",
             }
         ))
 
@@ -1659,7 +1682,8 @@ def run(args):
         "filter_by_exclude_where": "{count} of these were dropped because of '{exclude_where}'",
         "filter_by_query": "{count} of these were filtered out by the query: \"{query}\"",
         "filter_by_ambiguous_date": "{count} of these were dropped because of their ambiguous date in {ambiguity}",
-        "filter_by_date": "{count} of these were dropped because of their date (or lack of date)",
+        "filter_by_min_date": "{count} of these were dropped because they were earlier than {min_date} or missing a date",
+        "filter_by_max_date": "{count} of these were dropped because they were later than {max_date} or missing a date",
         "filter_by_sequence_length": "{count} of these were dropped because they were shorter than minimum length of {min_length}bp",
         "filter_by_non_nucleotide": "{count} of these were dropped because they had non-nucleotide characters",
         "skip_group_by_with_ambiguous_year": "{count} were dropped during grouping due to ambiguous year information",

From 1cdde926f405cf6ac134861440119f045c6d60d4 Mon Sep 17 00:00:00 2001
From: Victor Lin <13424970+victorlin@users.noreply.github.com>
Date: Tue, 17 May 2022 10:06:09 -0700
Subject: [PATCH 2/2] Add cram test for report output of filtering by min/max date

---
 tests/functional/filter.t | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/tests/functional/filter.t b/tests/functional/filter.t
index e0fd55997..691dc08f9 100644
--- a/tests/functional/filter.t
+++ b/tests/functional/filter.t
@@ -499,3 +499,15 @@ Error on missing group-by columns.
   $ cat $TMP/metadata-filtered.tsv
   cat: .*: No such file or directory (re)
   [1]
+
+Check output of min/max date filters.
+
+  $ ${AUGUR} filter \
+  > --metadata filter/metadata.tsv \
+  > --min-date 2015-01-01 \
+  > --max-date 2016-02-01 \
+  > --output-metadata "$TMP/filtered_metadata.tsv"
+  8 strains were dropped during filtering
+  \t1 of these were dropped because they were earlier than 2015.0 or missing a date (esc)
+  \t7 of these were dropped because they were later than 2016.09 or missing a date (esc)
+  4 strains passed all filters