Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Document DataFrame.groupby() and rename Field.index -> .column #291

Merged
merged 2 commits into from
Oct 15, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ currently using a minimum version of PyCharm 2019.2.4.
- Run `pytest --nbval` to validate install
- To test specific versions of Python use `nox -s test-3.8`
- To run the automatic formatter and check for lint issues
run `nox -s blacken`
run `nox -s format`


### Documentation
Expand Down
8 changes: 4 additions & 4 deletions docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,13 @@

# -- Path setup --------------------------------------------------------------

import datetime
import os
import sys

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import sys
import datetime

sys.path.insert(0, os.path.abspath("../sphinxext"))
sys.path.extend(
Expand Down
6 changes: 6 additions & 0 deletions docs/source/reference/api/eland.DataFrame.groupby.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
eland.DataFrame.groupby
=======================

.. currentmodule:: eland

.. automethod:: DataFrame.groupby
1 change: 1 addition & 0 deletions docs/source/reference/dataframe.rst
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ Function Application, GroupBy & Window

DataFrame.agg
DataFrame.aggregate
DataFrame.groupby

.. _api.dataframe.stats:

Expand Down
22 changes: 11 additions & 11 deletions eland/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,22 +15,22 @@
# specific language governing permissions and limitations
# under the License.

from eland._version import ( # noqa: F401
__title__,
__description__,
__url__,
__version__,
from ._version import ( # noqa: F401
__author__,
__author_email__,
__description__,
__maintainer__,
__maintainer_email__,
__title__,
__url__,
__version__,
)
from eland.common import SortOrder
from eland.index import Index
from eland.ndframe import NDFrame
from eland.series import Series
from eland.dataframe import DataFrame
from eland.etl import pandas_to_eland, eland_to_pandas, read_es, read_csv, csv_to_eland
from .common import SortOrder
from .dataframe import DataFrame
from .etl import csv_to_eland, eland_to_pandas, pandas_to_eland, read_csv, read_es
from .index import Index
from .ndframe import NDFrame
from .series import Series

__all__ = [
"DataFrame",
Expand Down
4 changes: 2 additions & 2 deletions eland/actions.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,9 @@
# under the License.

from abc import ABC, abstractmethod
from typing import List, Optional, TYPE_CHECKING, Union
from eland import SortOrder
from typing import TYPE_CHECKING, List, Optional, Union

from eland import SortOrder

if TYPE_CHECKING:
import pandas as pd # type: ignore
Expand Down
2 changes: 1 addition & 1 deletion eland/arithmetics.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

from abc import ABC, abstractmethod
from io import StringIO
from typing import Union, List, TYPE_CHECKING, Any
from typing import TYPE_CHECKING, Any, List, Union

import numpy as np # type: ignore

Expand Down
2 changes: 1 addition & 1 deletion eland/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
import re
import warnings
from enum import Enum
from typing import Union, List, Tuple, cast, Callable, Any, Optional, Dict
from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast

import numpy as np # type: ignore
import pandas as pd # type: ignore
Expand Down
21 changes: 9 additions & 12 deletions eland/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,11 @@
# specific language governing permissions and limitations
# under the License.

import re
import sys
import warnings
from io import StringIO
import re
from typing import List, Optional, Sequence, Union, Tuple
from typing import List, Optional, Sequence, Tuple, Union

import numpy as np
import pandas as pd
Expand All @@ -34,12 +34,12 @@
from pandas.util._validators import validate_bool_kwarg

import eland.plotting as gfx
from eland.ndframe import NDFrame
from eland.series import Series
from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, docstring_parameter
from eland.filter import BooleanFilter
from eland.utils import deprecated_api, is_valid_attr_name
from eland.groupby import GroupByDataFrame
from eland.ndframe import NDFrame
from eland.series import Series
from eland.utils import deprecated_api, is_valid_attr_name


class DataFrame(NDFrame):
Expand Down Expand Up @@ -1442,13 +1442,10 @@ def groupby(
by:
Column name, or list of column names, to group by.
Currently only a single column or a list of columns is accepted.
TODO Implement other combinations of by similar to pandas

dropna: default True
If True, and if group keys contain NA values, the rows/columns holding those NA keys are dropped from the result.
TODO Implement False

TODO Implement remainder of pandas arguments
Returns
-------
GroupByDataFrame
Expand Down Expand Up @@ -1495,18 +1492,18 @@ def groupby(
[63 rows x 2 columns]
"""
if by is None:
raise TypeError("by parameter should be specified to groupby")
raise ValueError("by parameter should be specified to groupby")
if isinstance(by, str):
by = [by]
if isinstance(by, (list, tuple)):
remaining_columns = set(by) - set(self._query_compiler.columns)
remaining_columns = sorted(set(by) - set(self._query_compiler.columns))
if remaining_columns:
raise KeyError(
f"Requested columns {remaining_columns} not in the DataFrame."
f"Requested columns {repr(remaining_columns)[1:-1]} not in the DataFrame"
)

return GroupByDataFrame(
by=by, query_compiler=self._query_compiler, dropna=dropna
by=by, query_compiler=self._query_compiler.copy(), dropna=dropna
)

def query(self, expr) -> "DataFrame":
Expand Down
9 changes: 5 additions & 4 deletions eland/etl.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,18 @@
# under the License.

import csv
from typing import Generator, Union, List, Tuple, Optional, Mapping, Dict, Any
from collections import deque
from typing import Any, Dict, Generator, List, Mapping, Optional, Tuple, Union

import pandas as pd # type: ignore
from elasticsearch import Elasticsearch # type: ignore
from elasticsearch.helpers import parallel_bulk # type: ignore
from pandas.io.parsers import _c_parser_defaults # type: ignore

from eland import DataFrame
from eland.common import DEFAULT_CHUNK_SIZE, ensure_es_client
from eland.field_mappings import FieldMappings, verify_mapping_compatibility
from eland.common import ensure_es_client, DEFAULT_CHUNK_SIZE
from eland.utils import deprecated_api
from elasticsearch import Elasticsearch # type: ignore
from elasticsearch.helpers import parallel_bulk # type: ignore


@deprecated_api("eland.DataFrame()")
Expand Down
51 changes: 26 additions & 25 deletions eland/field_mappings.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,31 +16,32 @@
# under the License.

import warnings
from typing import (
TYPE_CHECKING,
Any,
Dict,
List,
Mapping,
NamedTuple,
Optional,
Set,
Tuple,
)

import numpy as np
import pandas as pd
from pandas.core.dtypes.common import (
is_float_dtype,
is_bool_dtype,
is_integer_dtype,
is_datetime_or_timedelta_dtype,
is_float_dtype,
is_integer_dtype,
is_string_dtype,
)
from pandas.core.dtypes.inference import is_list_like
from typing import (
NamedTuple,
Optional,
Mapping,
Dict,
Any,
Tuple,
TYPE_CHECKING,
List,
Set,
)

if TYPE_CHECKING:
from elasticsearch import Elasticsearch

from eland import DataFrame


Expand All @@ -64,7 +65,7 @@
class Field(NamedTuple):
"""Holds all information on a particular field in the mapping"""

index: str
column: str
es_field_name: str
is_source: bool
es_dtype: str
Expand Down Expand Up @@ -129,7 +130,7 @@ class FieldMappings:
_mappings_capabilities: pandas.DataFrame
A data frame summarising the capabilities of the index mapping

index - the eland display name
column (index) - the eland display name

es_field_name - the Elasticsearch field name
is_source - is top level field (i.e. not a multi-field sub-field)
Expand Down Expand Up @@ -537,13 +538,13 @@ def _generate_es_mappings(
"""

mapping_props = {}
for field_name_name, dtype in dataframe.dtypes.iteritems():
if es_type_overrides is not None and field_name_name in es_type_overrides:
es_dtype = es_type_overrides[field_name_name]
for column, dtype in dataframe.dtypes.iteritems():
if es_type_overrides is not None and column in es_type_overrides:
es_dtype = es_type_overrides[column]
else:
es_dtype = FieldMappings._pd_dtype_to_es_dtype(dtype)

mapping_props[field_name_name] = {"type": es_dtype}
mapping_props[column] = {"type": es_dtype}

return {"mappings": {"properties": mapping_props}}

Expand Down Expand Up @@ -708,9 +709,9 @@ def all_source_fields(self) -> List[Field]:

"""
source_fields: List[Field] = []
for index, row in self._mappings_capabilities.iterrows():
for column, row in self._mappings_capabilities.iterrows():
row = row.to_dict()
row["index"] = index
row["column"] = column
source_fields.append(Field(**row))
return source_fields

Expand All @@ -731,13 +732,13 @@ def groupby_source_fields(self, by: List[str]) -> Tuple[List[Field], List[Field]
groupby_fields: Dict[str, Field] = {}
# groupby_fields: Union[List[Field], List[None]] = [None] * len(by)
aggregatable_fields: List[Field] = []
for index_name, row in self._mappings_capabilities.iterrows():
for column, row in self._mappings_capabilities.iterrows():
row = row.to_dict()
row["index"] = index_name
if index_name not in by:
row["column"] = column
if column not in by:
aggregatable_fields.append(Field(**row))
else:
groupby_fields[index_name] = Field(**row)
groupby_fields[column] = Field(**row)

# Maintain groupby order as given input
return [groupby_fields[column] for column in by], aggregatable_fields
Expand Down
2 changes: 1 addition & 1 deletion eland/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

# Originally based on code in MIT-licensed pandasticsearch filters

from typing import Dict, Any, List, Optional, Union, cast
from typing import Any, Dict, List, Optional, Union, cast


class BooleanFilter:
Expand Down
Loading