From 6a5caef7f3bbb391dddd7e3e1f3c85da40e5684d Mon Sep 17 00:00:00 2001 From: "P. Sai Vinay" Date: Mon, 26 Jul 2021 22:04:05 +0530 Subject: [PATCH] Add support to Pandas and lightgbm latest version --- eland/common.py | 4 ++- eland/etl.py | 41 ++++++++++++++++++++++++++---- eland/plotting/_matplotlib/hist.py | 10 ++++++-- requirements-dev.txt | 4 +-- setup.py | 2 +- 5 files changed, 50 insertions(+), 11 deletions(-) diff --git a/eland/common.py b/eland/common.py index a5a2afd4..c14a4945 100644 --- a/eland/common.py +++ b/eland/common.py @@ -26,12 +26,14 @@ # Default number of rows displayed (different to pandas where ALL could be displayed) DEFAULT_NUM_ROWS_DISPLAYED = 60 - DEFAULT_CHUNK_SIZE = 10000 DEFAULT_CSV_BATCH_OUTPUT_SIZE = 10000 DEFAULT_PROGRESS_REPORTING_NUM_ROWS = 10000 DEFAULT_ES_MAX_RESULT_WINDOW = 10000 # index.max_result_window DEFAULT_PAGINATION_SIZE = 5000 # for composite aggregations +PANDAS_VERSION: Tuple[int, ...] = tuple( + int(part) for part in pd.__version__.split(".") if part.isdigit() +)[:2] with warnings.catch_warnings(): diff --git a/eland/etl.py b/eland/etl.py index 8dc3fee2..31c88392 100644 --- a/eland/etl.py +++ b/eland/etl.py @@ -22,12 +22,18 @@ import pandas as pd # type: ignore from elasticsearch import Elasticsearch from elasticsearch.helpers import parallel_bulk -from pandas.io.parsers import _c_parser_defaults # type: ignore from eland import DataFrame -from eland.common import DEFAULT_CHUNK_SIZE, ensure_es_client +from eland.common import DEFAULT_CHUNK_SIZE, PANDAS_VERSION, ensure_es_client from eland.field_mappings import FieldMappings, verify_mapping_compatibility +try: + from pandas.io.parsers import _c_parser_defaults # type: ignore +except ImportError: + from pandas.io.parsers.readers import _c_parser_defaults # type: ignore + +_DEFAULT_LOW_MEMORY: bool = _c_parser_defaults["low_memory"] + def pandas_to_eland( pd_df: pd.DataFrame, @@ -339,11 +345,12 @@ def csv_to_eland( # type: ignore encoding=None, dialect=None, # Error 
Handling - error_bad_lines=True, - warn_bad_lines=True, + warn_bad_lines: bool = True, + error_bad_lines: bool = True, + on_bad_lines: str = "error", # Internal delim_whitespace=False, - low_memory=_c_parser_defaults["low_memory"], + low_memory: bool = _DEFAULT_LOW_MEMORY, memory_map=False, float_precision=None, ) -> "DataFrame": @@ -481,6 +488,7 @@ def csv_to_eland( # type: ignore "delim_whitespace": delim_whitespace, "warn_bad_lines": warn_bad_lines, "error_bad_lines": error_bad_lines, + "on_bad_lines": on_bad_lines, "low_memory": low_memory, "mangle_dupe_cols": mangle_dupe_cols, "infer_datetime_format": infer_datetime_format, @@ -490,6 +498,29 @@ def csv_to_eland( # type: ignore if chunksize is None: kwargs["chunksize"] = DEFAULT_CHUNK_SIZE + if PANDAS_VERSION >= (1, 3): + # Bug in Pandas v1.3.0 + # If names and prefix are both passed as None, pandas treats them as explicitly specified values and raises ValueError + # Ref: https://github.com/pandas-dev/pandas/issues/42387 + if kwargs["names"] is None and kwargs["prefix"] is None: + kwargs.pop("prefix") + + if kwargs["warn_bad_lines"] is True: + kwargs["on_bad_lines"] = "warn" + if kwargs["error_bad_lines"] is True: + kwargs["on_bad_lines"] = "error" + + kwargs.pop("warn_bad_lines") + kwargs.pop("error_bad_lines") + + else: + if on_bad_lines == "warn": + kwargs["warn_bad_lines"] = True + if on_bad_lines == "error": + kwargs["error_bad_lines"] = True + + kwargs.pop("on_bad_lines") + # read csv in chunks to pandas DataFrame and dump to eland DataFrame (and Elasticsearch) reader = pd.read_csv(filepath_or_buffer, **kwargs) diff --git a/eland/plotting/_matplotlib/hist.py b/eland/plotting/_matplotlib/hist.py index d6ca9b3c..5769710e 100644 --- a/eland/plotting/_matplotlib/hist.py +++ b/eland/plotting/_matplotlib/hist.py @@ -16,9 +16,15 @@ # under the License. 
import numpy as np -from pandas.core.dtypes.generic import ABCIndexClass from pandas.plotting._matplotlib import converter +try: + # pandas<1.3.0 + from pandas.core.dtypes.generic import ABCIndexClass as ABCIndex +except ImportError: + # pandas>=1.3.0 + from pandas.core.dtypes.generic import ABCIndex + try: # pandas>=1.2.0 from pandas.plotting._matplotlib.tools import ( create_subplots, @@ -113,7 +119,7 @@ def hist_frame( raise NotImplementedError("TODO") if column is not None: - if not isinstance(column, (list, np.ndarray, ABCIndexClass)): + if not isinstance(column, (list, np.ndarray, ABCIndex)): column = [column] ed_df_bins = ed_df_bins[column] ed_df_weights = ed_df_weights[column] diff --git a/requirements-dev.txt b/requirements-dev.txt index 553c7393..07ffb122 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,5 +1,5 @@ elasticsearch>=7.7 -pandas>=1 +pandas>=1.2.0 matplotlib pytest>=5.2.1 pytest-mock @@ -8,5 +8,5 @@ numpydoc>=0.9.0 scikit-learn>=0.22.1 xgboost>=1 nox -lightgbm>=2.3.0 +lightgbm pytest-cov diff --git a/setup.py b/setup.py index fe4deee9..14951540 100644 --- a/setup.py +++ b/setup.py @@ -78,6 +78,6 @@ extras_require={ "xgboost": ["xgboost>=0.90,<2"], "scikit-learn": ["scikit-learn>=0.22.1,<1"], - "lightgbm": ["lightgbm>=2,<2.4"], + "lightgbm": ["lightgbm>=2,<4"], }, )