Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

dates: Move date logic from utils to dates #923

Merged
merged 3 commits into from
May 24, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 88 additions & 0 deletions augur/dates.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,12 @@
import datetime
from textwrap import dedent
import isodate
import pandas as pd
import re
import treetime.utils

from augur.util_support.date_disambiguator import DateDisambiguator

SUPPORTED_DATE_HELP_TEXT = dedent("""\
1. an Augur-style numeric date with the year as the integer part (e.g. 2020.42) or
2. a date in ISO 8601 date format (i.e. YYYY-MM-DD) (e.g. '2020-06-04') or
Expand Down Expand Up @@ -63,3 +67,87 @@ def numeric_date_type(date):
return numeric_date(date)
except ValueError as e:
raise argparse.ArgumentTypeError(str(e)) from e

def ambiguous_date_to_date_range(uncertain_date, fmt, min_max_year=None):
    """Resolve a possibly ambiguous date string into a date range.

    Thin wrapper that builds a ``DateDisambiguator`` for the given value and
    returns whatever its ``range()`` method produces.

    Parameters
    ----------
    uncertain_date : str
        Date string, possibly with ambiguous parts marked by ``X``
        (e.g. ``"2000-01-XX"``).
    fmt : str
        ``strptime``-style format describing ``uncertain_date``
        (e.g. ``"%Y-%m-%d"``).
    min_max_year : optional
        Passed through unchanged to ``DateDisambiguator``.

    Returns
    -------
    The result of ``DateDisambiguator.range()``.
    """
    disambiguator = DateDisambiguator(
        uncertain_date,
        fmt=fmt,
        min_max_year=min_max_year,
    )
    return disambiguator.range()

def is_date_ambiguous(date, ambiguous_by="any"):
    """
    Report whether a YYYY-MM-DD date string is ambiguous in the requested part.

    Parameters
    ----------
    date : str
        Date string in the format of YYYY-MM-DD. Trailing components may be
        missing (e.g. "2019" or "2019-10"); missing ones count as ambiguous.
    ambiguous_by : str
        Field of the date string to test for ambiguity ("day", "month", "year", "any")

    Returns
    -------
    bool
        True when the date is ambiguous for the requested field. A year
        containing "X" is always reported as ambiguous.
    """
    # Pad absent month/day components with "XX" so that "2019" is handled
    # exactly like "2019-XX-XX" and "2019-10" like "2019-10-XX".
    parts = date.split('-', 2)
    parts += ["XX"] * (3 - len(parts))
    year, month, day = parts

    # Ambiguity is hierarchical: an ambiguous year implies everything below it
    # is ambiguous, and an ambiguous month implies an ambiguous day even when
    # day information is available.
    if "X" in year:
        return True
    if "X" in month and ambiguous_by in ("any", "month", "day"):
        return True
    return "X" in day and ambiguous_by in ("any", "day")

def get_numerical_date_from_value(value, fmt=None, min_max_year=None):
    """Convert a single date value into a numeric (decimal year) date.

    Parameters
    ----------
    value
        Date to convert; it is coerced with ``str()`` first. May be a decimal
        number (e.g. ``2020.42`` or ``-500.5``), a bare year (e.g. ``2020``),
        a date matching ``fmt``, or such a date with ambiguous parts marked
        as ``XX``.
    fmt : str, optional
        ``strptime``-style format of ``value`` (e.g. ``"%Y-%m-%d"``). Needed
        for year-only values, which are expanded through this format.
    min_max_year : optional
        Passed through to ``ambiguous_date_to_date_range``.

    Returns
    -------
    float
        For decimal input, the value itself.
    list
        For ambiguous dates, the numeric form of each bound of the resolved
        range, or ``[None, None]`` when the range could not be resolved.
    None
        When ``value`` cannot be parsed with ``fmt``.
    Otherwise the result of ``treetime.utils.numeric_date`` for the parsed date.
    """
    value = str(value)

    # Numeric date, which can be negative. At most one leading minus sign is
    # accepted so that malformed input such as "--2020.5" is reported as
    # unparseable below instead of making float() raise.
    if re.match(r'^-?\d+\.\d+$', value):
        return float(value)

    if value.isnumeric():
        # A year-only date is ambiguous: expand it to the full format with
        # the month and day masked out.
        value = fmt.replace('%Y', value).replace('%m', 'XX').replace('%d', 'XX')

    if 'XX' in value:
        ambig_date = ambiguous_date_to_date_range(value, fmt, min_max_year)
        if ambig_date is None or None in ambig_date:
            return [None, None] #don't send to numeric_date or will be set to today
        return [treetime.utils.numeric_date(d) for d in ambig_date]

    # Only parsing failures mean "not a date": ValueError for a mismatch with
    # the format, TypeError when no format was supplied. Anything else is a
    # genuine error and must not be silently swallowed.
    try:
        parsed = datetime.datetime.strptime(value, fmt)
    except (ValueError, TypeError):
        return None
    return treetime.utils.numeric_date(parsed)

def get_numerical_dates(meta_dict, name_col = None, date_col='date', fmt=None, min_max_year=None):
    """Build a mapping from strain name to numeric date.

    Parameters
    ----------
    meta_dict : dict or pandas.DataFrame
        Metadata to read dates from. With ``fmt``, a dict maps each strain to
        a record indexable by ``date_col``; without ``fmt``, a dict maps each
        strain directly to a value convertible with ``float()``. A DataFrame
        is keyed by its index and read from column ``date_col`` in both cases.
    name_col : optional
        Currently unused; kept for interface compatibility.
    date_col : str
        Name of the field/column holding the date.
    fmt : str, optional
        ``strptime``-style date format. When given, each date is converted
        with ``get_numerical_date_from_value``; otherwise dates are assumed
        to be numeric already and are only cast to float.
    min_max_year : optional
        Passed through to ``get_numerical_date_from_value``.

    Returns
    -------
    dict
        Strain name -> numeric date. Empty when ``meta_dict`` is neither a
        dict nor a DataFrame.
    """
    # Bind the result up front so that an unsupported input type yields an
    # empty mapping on every path, rather than an UnboundLocalError when no
    # format is given.
    numerical_dates = {}

    if fmt:
        if isinstance(meta_dict, dict):
            numerical_dates = {
                strain: get_numerical_date_from_value(
                    record[date_col],
                    fmt,
                    min_max_year
                )
                for strain, record in meta_dict.items()
            }
        elif isinstance(meta_dict, pd.DataFrame):
            strains = meta_dict.index.values
            dates = meta_dict[date_col].apply(
                lambda date: get_numerical_date_from_value(
                    date,
                    fmt,
                    min_max_year
                )
            ).values
            numerical_dates = dict(zip(strains, dates))
    else:
        if isinstance(meta_dict, dict):
            # Without a format the dict values are the dates themselves;
            # date_col is not consulted here.
            numerical_dates = {k: float(v) for k, v in meta_dict.items()}
        elif isinstance(meta_dict, pd.DataFrame):
            strains = meta_dict.index.values
            dates = meta_dict[date_col].astype(float)
            numerical_dates = dict(zip(strains, dates))

    return numerical_dates
4 changes: 2 additions & 2 deletions augur/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,10 @@
from tempfile import NamedTemporaryFile
from typing import Collection

from .dates import numeric_date, numeric_date_type, SUPPORTED_DATE_HELP_TEXT
from .dates import numeric_date, numeric_date_type, SUPPORTED_DATE_HELP_TEXT, is_date_ambiguous, get_numerical_dates
from .index import index_sequences, index_vcf
from .io import open_file, read_metadata, read_sequences, write_sequences
from .utils import AugurError, is_vcf as filename_is_vcf, read_vcf, read_strains, get_numerical_dates, run_shell_command, shquote, is_date_ambiguous
from .utils import AugurError, is_vcf as filename_is_vcf, read_vcf, read_strains, run_shell_command, shquote

comment_char = '#'

Expand Down
4 changes: 2 additions & 2 deletions augur/frequencies.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@

from .frequency_estimators import get_pivots, alignment_frequencies, tree_frequencies
from .frequency_estimators import AlignmentKdeFrequencies, TreeKdeFrequencies, TreeKdeFrequenciesError
from .dates import numeric_date_type, SUPPORTED_DATE_HELP_TEXT
from .utils import read_metadata, read_node_data, write_json, get_numerical_dates
from .dates import numeric_date_type, SUPPORTED_DATE_HELP_TEXT, get_numerical_dates
from .utils import read_metadata, read_node_data, write_json


def register_arguments(parser):
Expand Down
3 changes: 2 additions & 1 deletion augur/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
import sys

from .io import open_file, read_sequences, write_sequences
from .utils import AugurError, get_numerical_date_from_value
from .dates import get_numerical_date_from_value
from .utils import AugurError

forbidden_characters = str.maketrans(
{' ': None,
Expand Down
3 changes: 2 additions & 1 deletion augur/refine.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
import numpy as np
import os, shutil, time, sys
from Bio import Phylo
from .utils import read_metadata, read_tree, get_numerical_dates, write_json, InvalidTreeError
from .dates import get_numerical_dates
from .utils import read_metadata, read_tree, write_json, InvalidTreeError
from treetime.vcf_utils import read_vcf, write_vcf
from treetime.seq_utils import profile_maps

Expand Down
88 changes: 0 additions & 88 deletions augur/utils.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,12 @@
import argparse
import re
import Bio
import Bio.Phylo
from datetime import datetime
import gzip
import os, json, sys
import pandas as pd
import subprocess
import shlex
from contextlib import contextmanager
from treetime.utils import numeric_date
from collections import defaultdict, OrderedDict
from pkg_resources import resource_stream
from io import TextIOWrapper
Expand All @@ -18,7 +15,6 @@
from augur.io import open_file

from augur.util_support.color_parser import ColorParser
from augur.util_support.date_disambiguator import DateDisambiguator
from augur.util_support.metadata_file import MetadataFile
from augur.util_support.node_data_reader import NodeDataReader
from augur.util_support.shell_command_runner import ShellCommandRunner
Expand Down Expand Up @@ -75,93 +71,9 @@ def get_json_name(args, default=None):
raise ValueError("Please specify a name for the JSON file containing the results.")


def ambiguous_date_to_date_range(uncertain_date, fmt, min_max_year=None):
    """Resolve a possibly ambiguous date string into a date range.

    Delegates to ``DateDisambiguator`` and returns the result of its
    ``range()`` method.

    Parameters
    ----------
    uncertain_date : str
        Date string, possibly with ambiguous parts marked by ``X``.
    fmt : str
        ``strptime``-style format describing ``uncertain_date``.
    min_max_year : optional
        Passed through unchanged to ``DateDisambiguator``.
    """
    disambiguator = DateDisambiguator(
        uncertain_date,
        fmt=fmt,
        min_max_year=min_max_year,
    )
    return disambiguator.range()

def read_metadata(fname, query=None, as_data_frame=False):
    """Read a metadata file through ``MetadataFile``.

    Parameters
    ----------
    fname
        Metadata file to read; passed to ``MetadataFile``.
    query : optional
        Passed through unchanged to ``MetadataFile``.
    as_data_frame : bool
        Passed through unchanged to ``MetadataFile``.

    Returns
    -------
    The result of ``MetadataFile.read()``.
    """
    metadata_file = MetadataFile(fname, query, as_data_frame)
    return metadata_file.read()

def is_date_ambiguous(date, ambiguous_by="any"):
    """
    Report whether a YYYY-MM-DD date string is ambiguous in the requested part.

    Parameters
    ----------
    date : str
        Date string in the format of YYYY-MM-DD. Trailing components may be
        missing (e.g. "2019" or "2019-10"); missing ones count as ambiguous.
    ambiguous_by : str
        Field of the date string to test for ambiguity ("day", "month", "year", "any")

    Returns
    -------
    bool
        True when the date is ambiguous for the requested field. A year
        containing "X" is always reported as ambiguous.
    """
    # Fill in absent month/day components with "XX" so short dates are
    # evaluated exactly like their fully masked equivalents.
    parts = date.split('-', 2)
    parts += ["XX"] * (3 - len(parts))
    year, month, day = parts

    # Ambiguity is hierarchical: an ambiguous month implicates an ambiguous
    # day even when day information is available.
    if "X" in year:
        return True
    if "X" in month and ambiguous_by in ("any", "month", "day"):
        return True
    return "X" in day and ambiguous_by in ("any", "day")

def get_numerical_date_from_value(value, fmt=None, min_max_year=None):
    """Convert a single date value into a numeric (decimal year) date.

    Parameters
    ----------
    value
        Date to convert; it is coerced with ``str()`` first. May be a decimal
        number (e.g. ``2020.42`` or ``-500.5``), a bare year, a date matching
        ``fmt``, or such a date with ambiguous parts marked as ``XX``.
    fmt : str, optional
        ``strptime``-style format of ``value``. Needed for year-only values,
        which are expanded through this format.
    min_max_year : optional
        Passed through to ``ambiguous_date_to_date_range``.

    Returns
    -------
    float
        For decimal input, the value itself.
    list
        For ambiguous dates, the numeric form of each bound of the resolved
        range, or ``[None, None]`` when the range could not be resolved.
    None
        When ``value`` cannot be parsed with ``fmt``.
    Otherwise the result of ``numeric_date`` for the parsed date.
    """
    value = str(value)

    # Numeric date, which can be negative. Only a single leading minus sign
    # is accepted so that "--2020.5" is treated as unparseable rather than
    # making float() raise.
    if re.match(r'^-?\d+\.\d+$', value):
        return float(value)

    if value.isnumeric():
        # A year-only date is ambiguous: mask out month and day.
        value = fmt.replace('%Y', value).replace('%m', 'XX').replace('%d', 'XX')

    if 'XX' in value:
        ambig_date = ambiguous_date_to_date_range(value, fmt, min_max_year)
        if ambig_date is None or None in ambig_date:
            return [None, None] #don't send to numeric_date or will be set to today
        return [numeric_date(d) for d in ambig_date]

    # Catch only parsing failures (ValueError for a format mismatch, TypeError
    # for a missing format) instead of hiding every possible exception.
    try:
        parsed = datetime.strptime(value, fmt)
    except (ValueError, TypeError):
        return None
    return numeric_date(parsed)

def get_numerical_dates(meta_dict, name_col = None, date_col='date', fmt=None, min_max_year=None):
    """Build a mapping from strain name to numeric date.

    Parameters
    ----------
    meta_dict : dict or pandas.DataFrame
        Metadata to read dates from. With ``fmt``, a dict maps each strain to
        a record indexable by ``date_col``; without ``fmt``, a dict maps each
        strain directly to a value convertible with ``float()``. A DataFrame
        is keyed by its index and read from column ``date_col`` in both cases.
    name_col : optional
        Currently unused; kept for interface compatibility.
    date_col : str
        Name of the field/column holding the date.
    fmt : str, optional
        ``strptime``-style date format. When given, each date is converted
        with ``get_numerical_date_from_value``; otherwise dates are only
        cast to float.
    min_max_year : optional
        Passed through to ``get_numerical_date_from_value``.

    Returns
    -------
    dict
        Strain name -> numeric date. Empty when ``meta_dict`` is neither a
        dict nor a DataFrame.
    """
    # Initialise the result first: without this, an unsupported input type
    # combined with no format left the name unbound and the return statement
    # raised UnboundLocalError.
    numerical_dates = {}

    if fmt:
        if isinstance(meta_dict, dict):
            numerical_dates = {
                strain: get_numerical_date_from_value(
                    record[date_col],
                    fmt,
                    min_max_year
                )
                for strain, record in meta_dict.items()
            }
        elif isinstance(meta_dict, pd.DataFrame):
            strains = meta_dict.index.values
            dates = meta_dict[date_col].apply(
                lambda date: get_numerical_date_from_value(
                    date,
                    fmt,
                    min_max_year
                )
            ).values
            numerical_dates = dict(zip(strains, dates))
    else:
        if isinstance(meta_dict, dict):
            # Without a format the dict values are the dates themselves;
            # date_col is not consulted here.
            numerical_dates = {k: float(v) for k, v in meta_dict.items()}
        elif isinstance(meta_dict, pd.DataFrame):
            strains = meta_dict.index.values
            dates = meta_dict[date_col].astype(float)
            numerical_dates = dict(zip(strains, dates))

    return numerical_dates


class InvalidTreeError(Exception):
"""Represents an error loading a phylogenetic tree from a filename.
Expand Down
59 changes: 59 additions & 0 deletions tests/test_dates.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import datetime
from freezegun import freeze_time
from augur import dates


class TestDates:
    """Unit tests for the date helpers exposed by ``augur.dates``."""

    def test_ambiguous_date_to_date_range_not_ambiguous(self):
        """A fully specified date resolves to a range with identical bounds."""
        exact_day = datetime.date(year=2000, month=3, day=29)
        resolved = dates.ambiguous_date_to_date_range("2000-03-29", "%Y-%m-%d")
        assert resolved == (exact_day, exact_day)

    def test_ambiguous_date_to_date_range_ambiguous_day(self):
        """An ambiguous day spans the whole month."""
        first_of_month = datetime.date(year=2000, month=1, day=1)
        last_of_month = datetime.date(year=2000, month=1, day=31)
        resolved = dates.ambiguous_date_to_date_range("2000-01-XX", "%Y-%m-%d")
        assert resolved == (first_of_month, last_of_month)

    def test_ambiguous_date_to_date_range_ambiguous_month_and_day(self):
        """An ambiguous month and day span the whole year."""
        first_of_year = datetime.date(year=2000, month=1, day=1)
        last_of_year = datetime.date(year=2000, month=12, day=31)
        resolved = dates.ambiguous_date_to_date_range("2000-XX-XX", "%Y-%m-%d")
        assert resolved == (first_of_year, last_of_year)

    @freeze_time("2000-02-20")
    def test_ambiguous_date_to_date_range_current_day_limit(self):
        """With today frozen at 2000-02-20, the upper bound stops at that day."""
        first_of_month = datetime.date(year=2000, month=2, day=1)
        frozen_today = datetime.date(year=2000, month=2, day=20)
        resolved = dates.ambiguous_date_to_date_range("2000-02-XX", "%Y-%m-%d")
        assert resolved == (first_of_month, frozen_today)

    def test_is_date_ambiguous(self):
        """is_date_ambiguous should return true for ambiguous dates and false for valid dates."""
        expected_ambiguous = (
            # Complete date strings with ambiguous values.
            ("2019-0X-0X", "any"),
            ("2019-XX-09", "month"),
            ("2019-03-XX", "day"),
            ("201X-03-09", "year"),
            ("20XX-01-09", "month"),
            ("2019-XX-03", "day"),
            ("20XX-01-03", "day"),
            # Incomplete date strings with ambiguous values.
            ("2019", "any"),
            ("201X", "year"),
            ("2019-XX", "month"),
            ("2019-10", "day"),
            ("2019-XX", "any"),
            ("2019-XX", "day"),
        )
        for date_string, field in expected_ambiguous:
            assert dates.is_date_ambiguous(date_string, field)

        expected_unambiguous = (
            # Complete date strings without ambiguity for the requested field.
            ("2019-09-03", "any"),
            ("2019-03-XX", "month"),
            ("2019-09-03", "day"),
            ("2019-XX-XX", "year"),
            # Incomplete date strings without ambiguity for the requested fields.
            ("2019", "year"),
            ("2019-10", "month"),
        )
        for date_string, field in expected_unambiguous:
            assert not dates.is_date_ambiguous(date_string, field)
56 changes: 0 additions & 56 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -1,39 +1,12 @@
import datetime
from pathlib import Path
from unittest.mock import patch

import pytest
from freezegun import freeze_time

from augur import utils
from test_filter import write_metadata

class TestUtils:
def test_ambiguous_date_to_date_range_not_ambiguous(self):
    """A fully specified date resolves to a range with identical bounds."""
    exact_day = datetime.date(year=2000, month=3, day=29)
    resolved = utils.ambiguous_date_to_date_range("2000-03-29", "%Y-%m-%d")
    assert resolved == (exact_day, exact_day)

def test_ambiguous_date_to_date_range_ambiguous_day(self):
    """An ambiguous day spans the whole month."""
    first_of_month = datetime.date(year=2000, month=1, day=1)
    last_of_month = datetime.date(year=2000, month=1, day=31)
    resolved = utils.ambiguous_date_to_date_range("2000-01-XX", "%Y-%m-%d")
    assert resolved == (first_of_month, last_of_month)

def test_ambiguous_date_to_date_range_ambiguous_month_and_day(self):
    """An ambiguous month and day span the whole year."""
    first_of_year = datetime.date(year=2000, month=1, day=1)
    last_of_year = datetime.date(year=2000, month=12, day=31)
    resolved = utils.ambiguous_date_to_date_range("2000-XX-XX", "%Y-%m-%d")
    assert resolved == (first_of_year, last_of_year)

@freeze_time("2000-02-20")
def test_ambiguous_date_to_date_range_current_day_limit(self):
    """With today frozen at 2000-02-20, the upper bound stops at that day."""
    first_of_month = datetime.date(year=2000, month=2, day=1)
    frozen_today = datetime.date(year=2000, month=2, day=20)
    resolved = utils.ambiguous_date_to_date_range("2000-02-XX", "%Y-%m-%d")
    assert resolved == (first_of_month, frozen_today)

@pytest.mark.parametrize("extension", ["bed","BED"])
@patch('augur.utils.read_bed_file')
def test_load_mask_sites_recognizes_bed_file(self, m_read_bed_file, extension):
Expand Down Expand Up @@ -103,35 +76,6 @@ def test_read_mask_file_drm_file(self, tmpdir):
fh.write("\n".join(drm_lines))
assert utils.read_mask_file(drm_file) == expected_sites

def test_is_date_ambiguous(self):
    """is_date_ambiguous should return true for ambiguous dates and false for valid dates."""
    expected_ambiguous = (
        # Complete date strings with ambiguous values.
        ("2019-0X-0X", "any"),
        ("2019-XX-09", "month"),
        ("2019-03-XX", "day"),
        ("201X-03-09", "year"),
        ("20XX-01-09", "month"),
        ("2019-XX-03", "day"),
        ("20XX-01-03", "day"),
        # Incomplete date strings with ambiguous values.
        ("2019", "any"),
        ("201X", "year"),
        ("2019-XX", "month"),
        ("2019-10", "day"),
        ("2019-XX", "any"),
        ("2019-XX", "day"),
    )
    for date_string, field in expected_ambiguous:
        assert utils.is_date_ambiguous(date_string, field)

    expected_unambiguous = (
        # Complete date strings without ambiguity for the requested field.
        ("2019-09-03", "any"),
        ("2019-03-XX", "month"),
        ("2019-09-03", "day"),
        ("2019-XX-XX", "year"),
        # Incomplete date strings without ambiguity for the requested fields.
        ("2019", "year"),
        ("2019-10", "month"),
    )
    for date_string, field in expected_unambiguous:
        assert not utils.is_date_ambiguous(date_string, field)

def test_read_strains(self, tmpdir):
# Write one list of filenames with some unnecessary whitespace.
strains1 = Path(tmpdir) / Path("strains1.txt")
Expand Down