elastic · sethmlarson · Oct 15, 2020 · Oct 15, 2020 · Oct 15, 2020
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -160,7 +160,7 @@ currently using a minimum version of PyCharm 2019.2.4.
 -   Run `pytest --nbval` to validate install
 -   To test specific versions of Python use `nox -s test-3.8`
 -   To run the automatic formatter and check for lint issues
-    run `nox -s blacken`
+    run `nox -s format`
 
 
 ### Documentation

diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -23,13 +23,13 @@
 
 # -- Path setup --------------------------------------------------------------
 
+import datetime
+import os
+import sys
+
 # If extensions (or modules to document with autodoc) are in another directory,
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
-#
-import os
-import sys
-import datetime
 
 sys.path.insert(0, os.path.abspath("../sphinxext"))
 sys.path.extend(

diff --git a/docs/source/reference/api/eland.DataFrame.groupby.rst b/docs/source/reference/api/eland.DataFrame.groupby.rst
@@ -0,0 +1,6 @@
+eland.DataFrame.groupby
+=======================
+
+.. currentmodule:: eland
+
+.. automethod:: DataFrame.groupby
diff --git a/docs/source/reference/dataframe.rst b/docs/source/reference/dataframe.rst
@@ -46,6 +46,7 @@ Function Application, GroupBy & Window
 
    DataFrame.agg
    DataFrame.aggregate
+   DataFrame.groupby
 
 .. _api.dataframe.stats:
 

diff --git a/eland/__init__.py b/eland/__init__.py
@@ -15,22 +15,22 @@
 #  specific language governing permissions and limitations
 #  under the License.
 
-from eland._version import (  # noqa: F401
-    __title__,
-    __description__,
-    __url__,
-    __version__,
+from ._version import (  # noqa: F401
     __author__,
     __author_email__,
+    __description__,
     __maintainer__,
     __maintainer_email__,
+    __title__,
+    __url__,
+    __version__,
 )
-from eland.common import SortOrder
-from eland.index import Index
-from eland.ndframe import NDFrame
-from eland.series import Series
-from eland.dataframe import DataFrame
-from eland.etl import pandas_to_eland, eland_to_pandas, read_es, read_csv, csv_to_eland
+from .common import SortOrder
+from .dataframe import DataFrame
+from .etl import csv_to_eland, eland_to_pandas, pandas_to_eland, read_csv, read_es
+from .index import Index
+from .ndframe import NDFrame
+from .series import Series
 
 __all__ = [
     "DataFrame",

diff --git a/eland/actions.py b/eland/actions.py
@@ -16,9 +16,9 @@
 #  under the License.
 
 from abc import ABC, abstractmethod
-from typing import List, Optional, TYPE_CHECKING, Union
-from eland import SortOrder
+from typing import TYPE_CHECKING, List, Optional, Union
 
+from eland import SortOrder
 
 if TYPE_CHECKING:
     import pandas as pd  # type: ignore

diff --git a/eland/arithmetics.py b/eland/arithmetics.py
@@ -17,7 +17,7 @@
 
 from abc import ABC, abstractmethod
 from io import StringIO
-from typing import Union, List, TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, List, Union
 
 import numpy as np  # type: ignore
 

diff --git a/eland/common.py b/eland/common.py
@@ -18,7 +18,7 @@
 import re
 import warnings
 from enum import Enum
-from typing import Union, List, Tuple, cast, Callable, Any, Optional, Dict
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
 
 import numpy as np  # type: ignore
 import pandas as pd  # type: ignore

diff --git a/eland/dataframe.py b/eland/dataframe.py
@@ -15,11 +15,11 @@
 #  specific language governing permissions and limitations
 #  under the License.
 
+import re
 import sys
 import warnings
 from io import StringIO
-import re
-from typing import List, Optional, Sequence, Union, Tuple
+from typing import List, Optional, Sequence, Tuple, Union
 
 import numpy as np
 import pandas as pd
@@ -34,12 +34,12 @@
 from pandas.util._validators import validate_bool_kwarg
 
 import eland.plotting as gfx
-from eland.ndframe import NDFrame
-from eland.series import Series
 from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, docstring_parameter
 from eland.filter import BooleanFilter
-from eland.utils import deprecated_api, is_valid_attr_name
 from eland.groupby import GroupByDataFrame
+from eland.ndframe import NDFrame
+from eland.series import Series
+from eland.utils import deprecated_api, is_valid_attr_name
 
 
 class DataFrame(NDFrame):
@@ -1442,13 +1442,10 @@ def groupby(
         by:
             column or list of columns used to groupby
             Currently accepts column or list of columns
-            TODO Implement other combinations of by similar to pandas
 
         dropna: default True
             If True, and if group keys contain NA values, NA values together with row/column will be dropped.
-            TODO Implement False
 
-        TODO Implement remainder of pandas arguments
         Returns
         -------
         GroupByDataFrame
@@ -1495,18 +1492,18 @@ def groupby(
         [63 rows x 2 columns]
         """
         if by is None:
-            raise TypeError("by parameter should be specified to groupby")
+            raise ValueError("by parameter should be specified to groupby")
         if isinstance(by, str):
             by = [by]
         if isinstance(by, (list, tuple)):
-            remaining_columns = set(by) - set(self._query_compiler.columns)
+            remaining_columns = sorted(set(by) - set(self._query_compiler.columns))
             if remaining_columns:
                 raise KeyError(
-                    f"Requested columns {remaining_columns} not in the DataFrame."
+                    f"Requested columns {repr(remaining_columns)[1:-1]} not in the DataFrame"
                 )
 
         return GroupByDataFrame(
-            by=by, query_compiler=self._query_compiler, dropna=dropna
+            by=by, query_compiler=self._query_compiler.copy(), dropna=dropna
         )
 
     def query(self, expr) -> "DataFrame":

diff --git a/eland/etl.py b/eland/etl.py
@@ -16,17 +16,18 @@
 #  under the License.
 
 import csv
-from typing import Generator, Union, List, Tuple, Optional, Mapping, Dict, Any
 from collections import deque
+from typing import Any, Dict, Generator, List, Mapping, Optional, Tuple, Union
+
 import pandas as pd  # type: ignore
+from elasticsearch import Elasticsearch  # type: ignore
+from elasticsearch.helpers import parallel_bulk  # type: ignore
 from pandas.io.parsers import _c_parser_defaults  # type: ignore
 
 from eland import DataFrame
+from eland.common import DEFAULT_CHUNK_SIZE, ensure_es_client
 from eland.field_mappings import FieldMappings, verify_mapping_compatibility
-from eland.common import ensure_es_client, DEFAULT_CHUNK_SIZE
 from eland.utils import deprecated_api
-from elasticsearch import Elasticsearch  # type: ignore
-from elasticsearch.helpers import parallel_bulk  # type: ignore
 
 
 @deprecated_api("eland.DataFrame()")

diff --git a/eland/field_mappings.py b/eland/field_mappings.py
@@ -16,31 +16,32 @@
 #  under the License.
 
 import warnings
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Dict,
+    List,
+    Mapping,
+    NamedTuple,
+    Optional,
+    Set,
+    Tuple,
+)
 
 import numpy as np
 import pandas as pd
 from pandas.core.dtypes.common import (
-    is_float_dtype,
     is_bool_dtype,
-    is_integer_dtype,
     is_datetime_or_timedelta_dtype,
+    is_float_dtype,
+    is_integer_dtype,
     is_string_dtype,
 )
 from pandas.core.dtypes.inference import is_list_like
-from typing import (
-    NamedTuple,
-    Optional,
-    Mapping,
-    Dict,
-    Any,
-    Tuple,
-    TYPE_CHECKING,
-    List,
-    Set,
-)
 
 if TYPE_CHECKING:
     from elasticsearch import Elasticsearch
+
     from eland import DataFrame
 
 
@@ -64,7 +65,7 @@
 class Field(NamedTuple):
     """Holds all information on a particular field in the mapping"""
 
-    index: str
+    column: str
     es_field_name: str
     is_source: bool
     es_dtype: str
@@ -129,7 +130,7 @@ class FieldMappings:
     _mappings_capabilities: pandas.DataFrame
         A data frame summarising the capabilities of the index mapping
 
-        index                       - the eland display name
+        column (index)              - the eland display name
 
         es_field_name               - the Elasticsearch field name
         is_source                   - is top level field (i.e. not a multi-field sub-field)
@@ -537,13 +538,13 @@ def _generate_es_mappings(
         """
 
         mapping_props = {}
-        for field_name_name, dtype in dataframe.dtypes.iteritems():
-            if es_type_overrides is not None and field_name_name in es_type_overrides:
-                es_dtype = es_type_overrides[field_name_name]
+        for column, dtype in dataframe.dtypes.iteritems():
+            if es_type_overrides is not None and column in es_type_overrides:
+                es_dtype = es_type_overrides[column]
             else:
                 es_dtype = FieldMappings._pd_dtype_to_es_dtype(dtype)
 
-            mapping_props[field_name_name] = {"type": es_dtype}
+            mapping_props[column] = {"type": es_dtype}
 
         return {"mappings": {"properties": mapping_props}}
 
@@ -708,9 +709,9 @@ def all_source_fields(self) -> List[Field]:
 
         """
         source_fields: List[Field] = []
-        for index, row in self._mappings_capabilities.iterrows():
+        for column, row in self._mappings_capabilities.iterrows():
             row = row.to_dict()
-            row["index"] = index
+            row["column"] = column
             source_fields.append(Field(**row))
         return source_fields
 
@@ -731,13 +732,13 @@ def groupby_source_fields(self, by: List[str]) -> Tuple[List[Field], List[Field]
         groupby_fields: Dict[str, Field] = {}
         # groupby_fields: Union[List[Field], List[None]] = [None] * len(by)
         aggregatable_fields: List[Field] = []
-        for index_name, row in self._mappings_capabilities.iterrows():
+        for column, row in self._mappings_capabilities.iterrows():
             row = row.to_dict()
-            row["index"] = index_name
-            if index_name not in by:
+            row["column"] = column
+            if column not in by:
                 aggregatable_fields.append(Field(**row))
             else:
-                groupby_fields[index_name] = Field(**row)
+                groupby_fields[column] = Field(**row)
 
         # Maintain groupby order as given input
         return [groupby_fields[column] for column in by], aggregatable_fields

diff --git a/eland/filter.py b/eland/filter.py
@@ -17,7 +17,7 @@
 
 # Originally based on code in MIT-licensed pandasticsearch filters
 
-from typing import Dict, Any, List, Optional, Union, cast
+from typing import Any, Dict, List, Optional, Union, cast
 
 
 class BooleanFilter: