Skip to content

Commit

Permalink
utils: Fix numeric date handling (#841)
Browse files Browse the repository at this point in the history
Currently, date handling is inaccurate (#747) because numeric date values are discarded:

https://github.com/nextstrain/augur/blob/a85194c243db8d85e6fc06ea2d614e0b6095a0c4/augur/utils.py#L115-L119

This change ensures numeric dates are processed, and that non-negative integers are evaluated as year-only ambiguous dates.

Also including a few refactors:

- Remove `raise_error` parameter. The intent is unclear and tests still pass without it.
- Use `return` instead of an intermediate variable.

Testing:

- Add tests that fail without this change and verify that they pass with it.
- Fix inaccurate existing tests.
  • Loading branch information
victorlin authored Feb 4, 2022
1 parent 0268940 commit 84f11b4
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 35 deletions.
53 changes: 22 additions & 31 deletions augur/utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import argparse
import re
import Bio
import Bio.Phylo
from datetime import datetime
Expand Down Expand Up @@ -111,25 +112,23 @@ def is_date_ambiguous(date, ambiguous_by="any"):
"X" in day and ambiguous_by in ("any", "day")
))

def get_numerical_date_from_value(value, fmt=None, min_max_year=None, raise_error=True):
if type(value)!=str:
if raise_error:
raise ValueError(value)
else:
numerical_date = None
elif 'XX' in value:
def get_numerical_date_from_value(value, fmt=None, min_max_year=None):
value = str(value)
if re.match(r'^-*\d+\.\d+$', value):
# numeric date which can be negative
return float(value)
if value.isnumeric():
# year-only date is ambiguous
value = fmt.replace('%Y', value).replace('%m', 'XX').replace('%d', 'XX')
if 'XX' in value:
ambig_date = ambiguous_date_to_date_range(value, fmt, min_max_year)
if ambig_date is None or None in ambig_date:
numerical_date = [None, None] #don't send to numeric_date or will be set to today
else:
numerical_date = [numeric_date(d) for d in ambig_date]
else:
try:
numerical_date = numeric_date(datetime.strptime(value, fmt))
except:
numerical_date = None

return numerical_date
return [None, None] #don't send to numeric_date or will be set to today
return [numeric_date(d) for d in ambig_date]
try:
return numeric_date(datetime.strptime(value, fmt))
except:
return None

def get_numerical_dates(meta_dict, name_col = None, date_col='date', fmt=None, min_max_year=None):
if fmt:
Expand All @@ -138,26 +137,18 @@ def get_numerical_dates(meta_dict, name_col = None, date_col='date', fmt=None, m
if isinstance(meta_dict, dict):
for k,m in meta_dict.items():
v = m[date_col]
try:
numerical_dates[k] = get_numerical_date_from_value(
v,
fmt,
min_max_year
)
except ValueError:
print(
"WARNING: %s has an invalid data string: %s"% (k, v),
file=sys.stderr
)
continue
numerical_dates[k] = get_numerical_date_from_value(
v,
fmt,
min_max_year
)
elif isinstance(meta_dict, pd.DataFrame):
strains = meta_dict.index.values
dates = meta_dict[date_col].apply(
lambda date: get_numerical_date_from_value(
date,
fmt,
min_max_year,
raise_error=False
min_max_year
)
).values
numerical_dates = dict(zip(strains, dates))
Expand Down
8 changes: 4 additions & 4 deletions tests/functional/filter.t
Original file line number Diff line number Diff line change
Expand Up @@ -161,10 +161,10 @@ Filter using only metadata without sequence input or output and save results as
> --min-length 10500 \
> --output-metadata "$TMP/filtered_metadata.tsv" > /dev/null

Output should include the 7 sequences matching the filters and a header line.
Output should include the 8 sequences matching the filters and a header line.

$ wc -l "$TMP/filtered_metadata.tsv"
\s*8 .* (re)
\s*9 .* (re)
$ rm -f "$TMP/filtered_metadata.tsv"

Filter using only metadata and save results as a list of filtered strains.
Expand All @@ -176,10 +176,10 @@ Filter using only metadata and save results as a list of filtered strains.
> --min-length 10500 \
> --output-strains "$TMP/filtered_strains.txt" > /dev/null

Output should include only the 7 sequences matching the filters (without a header line).
Output should include only the 8 sequences matching the filters (without a header line).

$ wc -l "$TMP/filtered_strains.txt"
\s*7 .* (re)
\s*8 .* (re)
$ rm -f "$TMP/filtered_strains.txt"

Filter using only metadata without a sequence index.
Expand Down
28 changes: 28 additions & 0 deletions tests/test_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,3 +237,31 @@ def test_filter_run_max_date(self, tmpdir, fasta_fn, argparser):
augur.filter.run(args)
output = SeqIO.to_dict(SeqIO.parse(out_fn, "fasta"))
assert list(output.keys()) == ["SEQ_1", "SEQ_2"]

def test_filter_incomplete_year(self, tmpdir, fasta_fn, argparser):
    """A bare year ("2020") must be filtered the same way as 2020-XX-XX."""
    # Three spellings of "2020": a numeric date, a bare year, and an
    # explicitly ambiguous date.
    rows = (
        ("strain","date"),
        ("SEQ_1","2020.0"),
        ("SEQ_2","2020"),
        ("SEQ_3","2020-XX-XX"),
    )
    meta_fn = write_metadata(tmpdir, rows)
    min_date = "2020-02-01"
    out_fn = str(tmpdir / "out.fasta")

    cli_values = (fasta_fn, meta_fn, out_fn, min_date)
    args = argparser('-s %s --metadata %s -o %s --min-date %s' % cli_values)
    augur.filter.run(args)

    # Only the numeric date 2020.0 is expected to be dropped by the cutoff;
    # the bare year and the XX form are both expected to be kept.
    kept_strains = list(SeqIO.to_dict(SeqIO.parse(out_fn, "fasta")))
    assert kept_strains == ["SEQ_2", "SEQ_3"]

def test_filter_date_formats(self, tmpdir, fasta_fn, argparser):
    """Numeric, year-only and XX-masked 2020 dates all pass --min-date 2019."""
    # The same year written in each of the supported date spellings.
    rows = (
        ("strain","date"),
        ("SEQ_1","2020.0"),
        ("SEQ_2","2020"),
        ("SEQ_3","2020-XX-XX"),
    )
    meta_fn = write_metadata(tmpdir, rows)
    min_date = "2019"
    out_fn = str(tmpdir / "out.fasta")

    cli_values = (fasta_fn, meta_fn, out_fn, min_date)
    args = argparser('-s %s --metadata %s -o %s --min-date %s' % cli_values)
    augur.filter.run(args)

    # Every record is expected to be written to the output, in input order.
    kept_strains = list(SeqIO.to_dict(SeqIO.parse(out_fn, "fasta")))
    assert kept_strains == ["SEQ_1", "SEQ_2", "SEQ_3"]

0 comments on commit 84f11b4

Please sign in to comment.