Add support for Pandas 1.2.0 #336

Merged: 1 commit, Dec 30, 2020
10 changes: 10 additions & 0 deletions .ci/run-elasticsearch.sh
@@ -130,6 +130,16 @@ if [[ "$ELASTICSEARCH_VERSION" != *oss* ]]; then
url="http://elastic:$ELASTIC_PASSWORD@$NODE_NAME"
fi

# Pull the container, retry on failures up to 5 times with
# short delays between each attempt. Fixes most transient network errors.
docker_pull_attempts=0
until [ "$docker_pull_attempts" -ge 5 ]
do
docker pull docker.elastic.co/elasticsearch/"$ELASTICSEARCH_VERSION" && break
docker_pull_attempts=$((docker_pull_attempts+1))
sleep 10
done

echo -e "\033[34;1mINFO:\033[0m Starting container $NODE_NAME \033[0m"
set -x
docker run \
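The retry loop above is plain Bash, but the same bounded-retry idea carries over to any setup scripting. A minimal Python sketch, assuming the docker CLI is on PATH (the pull_image name and the 5-attempt / 10-second constants mirror the script and are otherwise illustrative):

import subprocess
import time

def pull_image(image: str, attempts: int = 5, delay: float = 10.0) -> None:
    """Retry 'docker pull' a few times to ride out transient network errors."""
    for _ in range(attempts):
        if subprocess.run(["docker", "pull", image]).returncode == 0:
            return
        time.sleep(delay)
    raise RuntimeError(f"failed to pull {image} after {attempts} attempts")

Unlike the shell loop, this sketch raises after the final attempt instead of falling through to the next step.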
3 changes: 1 addition & 2 deletions .ci/test-matrix.yml
@@ -4,13 +4,12 @@ ELASTICSEARCH_VERSION:
- 8.0.0-SNAPSHOT
- 7.x-SNAPSHOT
- 7.10-SNAPSHOT
- 7.7-SNAPSHOT
- 7.6-SNAPSHOT

TEST_SUITE:
- xpack

PYTHON_VERSION:
- 3.9
- 3.8
- 3.7
- 3.6
2 changes: 1 addition & 1 deletion eland/groupby.py
@@ -71,7 +71,7 @@ def mean(self, numeric_only: bool = True) -> "pd.DataFrame":
... "localhost", "flights",
... columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"]
... )
>>> df.groupby("DestCountry").mean(numeric_only=False) # doctest: +NORMALIZE_WHITESPACE
>>> df.groupby("DestCountry").mean(numeric_only=False) # doctest: +SKIP
AvgTicketPrice Cancelled dayOfWeek timestamp
DestCountry
AE 605.132970 0.152174 2.695652 2018-01-21 16:58:07.891304443
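The recorded output stays in the rendered documentation, but the # doctest: +SKIP directive stops pytest --doctest-modules from executing and comparing it. A minimal, self-contained illustration of the directive (not eland code):

def answer() -> int:
    """
    >>> answer()  # doctest: +SKIP
    41
    """
    return 42

if __name__ == "__main__":
    import doctest
    doctest.testmod()  # reports no failures: the example is never run, so the stale "41" is not checked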
4 changes: 2 additions & 2 deletions eland/ml/ml_model.py
@@ -114,7 +114,7 @@ def predict(
>>> regressor = regressor.fit(training_data[0], training_data[1])

>>> # Get some test results
>>> regressor.predict(np.array(test_data))
>>> regressor.predict(np.array(test_data)) # doctest: +SKIP
array([0.06062475, 0.9990102 ], dtype=float32)

>>> # Serialise the model to Elasticsearch
@@ -123,7 +123,7 @@ def predict(
>>> es_model = MLModel.import_model('localhost', model_id, regressor, feature_names, es_if_exists='replace')

>>> # Get some test results from Elasticsearch model
>>> es_model.predict(test_data)
>>> es_model.predict(test_data) # doctest: +SKIP
array([0.0606248 , 0.99901026], dtype=float32)

>>> # Delete model from Elasticsearch
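Exact float32 predictions like these tend to vary slightly across library versions and between the local model and its Elasticsearch-side copy, which is presumably why the doctests are skipped rather than pinned to exact output. In a test suite, a tolerance-based comparison is the usual alternative; a standalone sketch using the two arrays from the docstring above:

import numpy as np

expected = np.array([0.06062475, 0.9990102], dtype=np.float32)   # local regressor.predict(...)
observed = np.array([0.0606248, 0.99901026], dtype=np.float32)   # es_model.predict(test_data)
np.testing.assert_allclose(observed, expected, rtol=1e-4)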
24 changes: 12 additions & 12 deletions eland/ndframe.py
@@ -214,7 +214,7 @@ def mean(self, numeric_only: Optional[bool] = None) -> pd.Series:
Examples
--------
>>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
>>> df.mean()
>>> df.mean() # doctest: +SKIP
AvgTicketPrice 628.254
Cancelled 0.128494
dayOfWeek 2.83598
@@ -227,7 +227,7 @@ def mean(self, numeric_only: Optional[bool] = None) -> pd.Series:
dayOfWeek 2.835975
dtype: float64

>>> df.mean(numeric_only=False)
>>> df.mean(numeric_only=False) # doctest: +SKIP
AvgTicketPrice 628.254
Cancelled 0.128494
dayOfWeek 2.83598
@@ -263,7 +263,7 @@ def sum(self, numeric_only: Optional[bool] = None) -> pd.Series:
Examples
--------
>>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
>>> df.sum()
>>> df.sum() # doctest: +SKIP
AvgTicketPrice 8.20436e+06
Cancelled 1678
dayOfWeek 37035
@@ -275,7 +275,7 @@ def sum(self, numeric_only: Optional[bool] = None) -> pd.Series:
dayOfWeek 3.703500e+04
dtype: float64

>>> df.sum(numeric_only=False)
>>> df.sum(numeric_only=False) # doctest: +SKIP
AvgTicketPrice 8.20436e+06
Cancelled 1678
dayOfWeek 37035
@@ -311,7 +311,7 @@ def min(self, numeric_only: Optional[bool] = None) -> pd.Series:
Examples
--------
>>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
>>> df.min()
>>> df.min() # doctest: +SKIP
AvgTicketPrice 100.021
Cancelled False
dayOfWeek 0
@@ -324,7 +324,7 @@ def min(self, numeric_only: Optional[bool] = None) -> pd.Series:
dayOfWeek 0.000000
dtype: float64

>>> df.min(numeric_only=False)
>>> df.min(numeric_only=False) # doctest: +SKIP
AvgTicketPrice 100.021
Cancelled False
dayOfWeek 0
@@ -358,7 +358,7 @@ def var(self, numeric_only: Optional[bool] = None) -> pd.Series:
Examples
--------
>>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
>>> df.var()
>>> df.var() # doctest: +SKIP
AvgTicketPrice 70964.570234
Cancelled 0.111987
dayOfWeek 3.761279
@@ -370,7 +370,7 @@ def var(self, numeric_only: Optional[bool] = None) -> pd.Series:
dayOfWeek 3.761279
dtype: float64

>>> df.var(numeric_only=False)
>>> df.var(numeric_only=False) # doctest: +SKIP
AvgTicketPrice 70964.6
Cancelled 0.111987
dayOfWeek 3.76128
@@ -404,7 +404,7 @@ def std(self, numeric_only: Optional[bool] = None) -> pd.Series:
Examples
--------
>>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
>>> df.std()
>>> df.std() # doctest: +SKIP
AvgTicketPrice 266.407061
Cancelled 0.334664
dayOfWeek 1.939513
@@ -416,7 +416,7 @@ def std(self, numeric_only: Optional[bool] = None) -> pd.Series:
dayOfWeek 1.939513
dtype: float64

>>> df.std(numeric_only=False)
>>> df.std(numeric_only=False) # doctest: +SKIP
AvgTicketPrice 266.407
Cancelled 0.334664
dayOfWeek 1.93951
@@ -499,7 +499,7 @@ def max(self, numeric_only: Optional[bool] = None) -> pd.Series:
Examples
--------
>>> df = ed.DataFrame('localhost', 'flights', columns=["AvgTicketPrice", "Cancelled", "dayOfWeek", "timestamp", "DestCountry"])
>>> df.max()
>>> df.max() # doctest: +SKIP
AvgTicketPrice 1199.73
Cancelled True
dayOfWeek 6
@@ -512,7 +512,7 @@ def max(self, numeric_only: Optional[bool] = None) -> pd.Series:
dayOfWeek 6.000000
dtype: float64

>>> df.max(numeric_only=False)
>>> df.max(numeric_only=False) # doctest: +SKIP
AvgTicketPrice 1199.73
Cancelled True
dayOfWeek 6
22 changes: 17 additions & 5 deletions eland/plotting/_matplotlib/hist.py
@@ -18,7 +18,19 @@
import numpy as np
from pandas.core.dtypes.generic import ABCIndexClass
from pandas.plotting._matplotlib import converter
from pandas.plotting._matplotlib.tools import _flatten, _set_ticks_props, _subplots

try: # pandas>=1.2.0
from pandas.plotting._matplotlib.tools import (
create_subplots,
flatten_axes,
set_ticks_props,
)
except ImportError: # pandas<1.2.0
from pandas.plotting._matplotlib.tools import (
_flatten as flatten_axes,
_set_ticks_props as set_ticks_props,
_subplots as create_subplots,
)

from eland.utils import try_sort

@@ -63,7 +75,7 @@ def hist_series(
ax.grid(grid)
axes = np.array([ax])

_set_ticks_props(
set_ticks_props(
axes, xlabelsize=xlabelsize, xrot=xrot, ylabelsize=ylabelsize, yrot=yrot
)

@@ -110,7 +122,7 @@ def hist_frame(
if naxes == 0:
raise ValueError("hist method requires numerical columns, " "nothing to plot.")

fig, axes = _subplots(
fig, axes = create_subplots(
naxes=naxes,
ax=ax,
squeeze=False,
@@ -119,7 +131,7 @@
figsize=figsize,
layout=layout,
)
_axes = _flatten(axes)
_axes = flatten_axes(axes)

for i, col in enumerate(try_sort(data.columns)):
ax = _axes[i]
@@ -132,7 +144,7 @@
ax.set_title(col)
ax.grid(grid)

_set_ticks_props(
set_ticks_props(
axes, xlabelsize=xlabelsize, xrot=xrot, ylabelsize=ylabelsize, yrot=yrot
)
fig.subplots_adjust(wspace=0.3, hspace=0.3)
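The try/except shim above handles the pandas 1.2.0 rename of these private plotting helpers by catching the ImportError itself. An equivalent approach, sketched here on the assumption that an explicit version gate is preferred and that the packaging library is available, would be:

from packaging.version import parse as parse_version
import pandas as pd

if parse_version(pd.__version__) >= parse_version("1.2.0"):
    from pandas.plotting._matplotlib.tools import (
        create_subplots,
        flatten_axes,
        set_ticks_props,
    )
else:  # older pandas keeps the underscore-prefixed names
    from pandas.plotting._matplotlib.tools import (
        _flatten as flatten_axes,
        _set_ticks_props as set_ticks_props,
        _subplots as create_subplots,
    )

Feature detection via ImportError, as in the diff, avoids the extra dependency and keeps working if the helpers are ever backported, which is why it is usually preferred over version sniffing.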
35 changes: 22 additions & 13 deletions noxfile.py
@@ -89,11 +89,18 @@ def lint(session):
session.error("\n" + "\n".join(sorted(set(errors))))


@nox.session(python=["3.6", "3.7", "3.8"])
@nox.session(python=["3.6", "3.7", "3.8", "3.9"])
def test(session):
session.install("-r", "requirements-dev.txt")
session.run("python", "-m", "tests.setup_tests")
session.install(".")

# Notebooks are only run on Python 3.7+ due to pandas 1.2.0
if session.python == "3.6":
nbval = ()
else:
nbval = ("--nbval",)

session.run(
"python",
"-m",
@@ -102,21 +109,23 @@
"term-missing",
"--cov=eland/",
"--doctest-modules",
"--nbval",
*nbval,
*(session.posargs or ("eland/", "tests/")),
)

session.run(
"python",
"-m",
"pip",
"uninstall",
"--yes",
"scikit-learn",
"xgboost",
"lightgbm",
)
session.run("pytest", "tests/ml/")
# Only run during default test execution
if not session.posargs:
session.run(
"python",
"-m",
"pip",
"uninstall",
"--yes",
"scikit-learn",
"xgboost",
"lightgbm",
)
session.run("pytest", "tests/ml/")


@nox.session(reuse_venv=True)
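Two behaviors change here: --nbval is only added on Python 3.7+ (pandas 1.2.0 dropped Python 3.6, per the comment in the session), and the scikit-learn/xgboost/lightgbm uninstall pass now runs only when no positional arguments are given. In practice a targeted run such as nox -rs test-3.8 -- tests/dataframe/ skips the slow ML-uninstall step, while the default nox -rs test-3.8 still exercises it. A minimal standalone sketch of the optional-flag pattern (names here are illustrative, not part of the noxfile):

import sys

# Build the optional flag tuple once, then unpack it into the final command.
nbval = () if sys.version_info < (3, 7) else ("--nbval",)
cmd = ["python", "-m", "pytest", "--doctest-modules", *nbval, "eland/", "tests/"]
print(" ".join(cmd))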
28 changes: 28 additions & 0 deletions tests/common.py
@@ -16,6 +16,7 @@
# under the License.

import os
from datetime import timedelta

import pandas as pd
from pandas.testing import assert_frame_equal, assert_series_equal
@@ -106,3 +107,30 @@ def assert_pandas_eland_series_equal(left, right, **kwargs):

# Use pandas tests to check similarity
assert_series_equal(left, right.to_pandas(), **kwargs)


def assert_almost_equal(left, right, **kwargs):
"""Asserts left and right are almost equal. Left and right
can be scalars, series, dataframes, etc
"""
if isinstance(left, (ed.DataFrame, ed.Series)):
left = left.to_pandas()
if isinstance(right, (ed.DataFrame, ed.Series)):
right = right.to_pandas()

if isinstance(right, pd.DataFrame):
kwargs.setdefault("check_exact", True)
assert_frame_equal(left, right, **kwargs)
elif isinstance(right, pd.Series):
kwargs.setdefault("check_exact", True)
assert_series_equal(left, right, **kwargs)
elif isinstance(right, float):
assert right * 0.99 <= left <= right * 1.01
elif isinstance(right, pd.Timestamp):
assert isinstance(left, pd.Timestamp) and right - timedelta(
seconds=0.1
) < left < right + timedelta(seconds=0.1)
elif right is pd.NaT:
assert left is pd.NaT
else:
assert left == right, f"{left} != {right}"
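The new helper compares floats within a 1% relative window and Timestamps within 0.1 seconds, while frames and series fall back to pandas' comparisons with check_exact defaulting to True. A usage sketch with illustrative values (run from the repository root, as the test suite does):

import pandas as pd
from tests.common import assert_almost_equal

# Floats: passes because 628.25 is within 1% of 628.2537
assert_almost_equal(628.25, 628.2537)

# Timestamps: passes because the two values differ by well under 0.1 s
assert_almost_equal(
    pd.Timestamp("2018-01-21 19:20:45.564"),
    pd.Timestamp("2018-01-21 19:20:45.600"),
)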
12 changes: 7 additions & 5 deletions tests/dataframe/test_metrics_pytest.py
@@ -22,7 +22,7 @@
import pytest
from pandas.testing import assert_frame_equal, assert_series_equal

from tests.common import TestData
from tests.common import TestData, assert_almost_equal


class TestDataFrameMetrics(TestData):
@@ -181,7 +181,9 @@ def test_flights_datetime_metrics_agg(self):
)
ed_metrics_dict = ed_metrics["timestamp"].to_dict()
ed_metrics_dict.pop("median") # Median is tested below.
assert ed_metrics_dict == expected_values

for key, expected_value in expected_values.items():
assert_almost_equal(ed_metrics_dict[key], expected_value)

@pytest.mark.parametrize("agg", ["mean", "min", "max", "nunique"])
def test_flights_datetime_metrics_single_agg(self, agg):
@@ -200,7 +202,7 @@ def test_flights_datetime_metrics_single_agg(self, agg):
else:
# df with timestamp column should return datetime64[ns]
assert ed_metric.dtypes["timestamp"] == np.dtype("datetime64[ns]")
assert ed_metric["timestamp"][0] == expected_values[agg]
assert_almost_equal(ed_metric["timestamp"][0], expected_values[agg])

@pytest.mark.parametrize("agg", ["mean", "min", "max"])
def test_flights_datetime_metrics_agg_func(self, agg):
@@ -213,7 +215,7 @@ def test_flights_datetime_metrics_agg_func(self, agg):
ed_metric = getattr(ed_timestamps, agg)(numeric_only=False)

assert ed_metric.dtype == np.dtype("datetime64[ns]")
assert ed_metric[0] == expected_values[agg]
assert_almost_equal(ed_metric[0], expected_values[agg])

def test_flights_datetime_metrics_median(self):
ed_df = self.ed_flights_small()[["timestamp"]]
@@ -283,7 +285,7 @@ def test_flights_numeric_only(self):
else:
assert_series_equal(
agg_data[agg].rename(None),
getattr(pd_flights, agg)(numeric_only=True),
getattr(pd_flights, agg)(numeric_only=True).astype(float),
check_exact=False,
rtol=True,
)
6 changes: 3 additions & 3 deletions tests/notebook/test_demo_notebook.ipynb
@@ -2816,7 +2816,7 @@
" 25 dayOfWeek 13059 non-null int64 \n",
" 26 timestamp 13059 non-null datetime64[ns]\n",
"dtypes: bool(2), datetime64[ns](1), float64(5), int64(2), object(17)\n",
"memory usage: 3.2+ MB\n"
"memory usage: 3.1+ MB\n"
]
}
],
@@ -2924,7 +2924,7 @@
}
],
"source": [
"pd_flights.max(numeric_only=True)"
"pd_flights.max(numeric_only=True).astype(float)"
]
},
{
@@ -3004,7 +3004,7 @@
}
],
"source": [
"pd_flights.min(numeric_only=True)"
"pd_flights.min(numeric_only=True).astype(float)"
]
},
{