Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update errors #602

Merged
merged 2 commits into from
Jan 4, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 16 additions & 4 deletions rdt/errors.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,25 @@
"""RDT Exceptions."""


class NotFittedError(Exception):
"""Error to raise when ``transform`` or ``reverse_transform`` are used before fitting."""
class ConfigNotSetError(Exception):
"""Error to use when no config has been set or detected."""


class InvalidConfigError(Exception):
"""Error to raise when something is incorrect about the config."""


class InvalidDataError(Exception):
"""Error to raise when the data is ill-formed in some way."""

class Error(Exception):
"""Error to raise when ``HyperTransformer`` produces a controlled error message."""

class NotFittedError(Exception):
"""Error to raise when ``transform`` or ``reverse_transform`` are used before fitting."""


class TransformerInputError(Exception):
"""Error to raise when ``HyperTransformer`` receives an incorrect input."""


class TransformerProcessingError(Exception):
"""Error to raise when transformer fails to complete some process (ie. anonymization)."""
60 changes: 33 additions & 27 deletions rdt/hyper_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@

import pandas as pd

from rdt.errors import Error, NotFittedError, TransformerInputError
from rdt.errors import (
ConfigNotSetError, InvalidConfigError, InvalidDataError, NotFittedError, TransformerInputError,
TransformerProcessingError)
from rdt.transformers import (
BaseTransformer, get_class_by_transformer_name, get_default_transformer,
get_transformers_by_type)
Expand Down Expand Up @@ -166,7 +168,7 @@ def _validate_transformers(column_name_to_transformer):
invalid_transformers_columns.append(column_name)

if invalid_transformers_columns:
raise Error(
raise InvalidConfigError(
f'Invalid transformers for columns: {invalid_transformers_columns}. '
'Please assign an rdt transformer instance to each column name.'
)
Expand All @@ -190,23 +192,23 @@ def _validate_sdtypes(sdtypes):
unsupported_sdtypes.append(sdtype)

if unsupported_sdtypes:
raise Error(
raise InvalidConfigError(
f'Invalid sdtypes: {unsupported_sdtypes}. If you are trying to use a '
'premium sdtype, contact [email protected] about RDT Add-Ons.'
)

@staticmethod
def _validate_config(config):
if set(config.keys()) != {'sdtypes', 'transformers'}:
raise Error(
raise InvalidConfigError(
'Error: Invalid config. Please provide 2 dictionaries '
"named 'sdtypes' and 'transformers'."
)

sdtypes = config['sdtypes']
transformers = config['transformers']
if set(sdtypes.keys()) != set(transformers.keys()):
raise Error(
raise InvalidConfigError(
"The column names in the 'sdtypes' dictionary must match the "
"column names in the 'transformers' dictionary."
)
Expand All @@ -222,15 +224,15 @@ def _validate_config(config):
mismatched_columns.append(column_name)

if mismatched_columns:
raise Error(
raise InvalidConfigError(
"Some transformers you've assigned are not compatible with the sdtypes. "
f'Please change the following columns: {mismatched_columns}'
)

def _validate_update_columns(self, update_columns):
unknown_columns = self._subset(update_columns, self.field_sdtypes.keys(), not_in=True)
if unknown_columns:
raise Error(
raise InvalidConfigError(
f'Invalid column names: {unknown_columns}. These columns do not exist in the '
"config. Use 'set_config()' to write and set your entire config at once."
)
Expand All @@ -257,33 +259,37 @@ def set_config(self, config):
def _validate_update_transformers_by_sdtype(
self, sdtype, transformer, transformer_name, transformer_parameters):
if not self.field_sdtypes:
raise Error(
raise ConfigNotSetError(
'Nothing to update. Use the `detect_initial_config` method to '
'pre-populate all the sdtypes and transformers from your dataset.'
)

if transformer_name is None:
if transformer is None:
raise Error("Missing required parameter 'transformer_name'.")
raise InvalidConfigError("Missing required parameter 'transformer_name'.")

if not isinstance(transformer, BaseTransformer):
raise Error('Invalid transformer. Please input an rdt transformer object.')
raise InvalidConfigError(
'Invalid transformer. Please input an rdt transformer object.'
)

if sdtype not in transformer.get_supported_sdtypes():
raise Error("The transformer you've assigned is incompatible with the sdtype.")
raise InvalidConfigError(
"The transformer you've assigned is incompatible with the sdtype."
)

else:
if transformer_name not in get_class_by_transformer_name() or sdtype not in \
get_class_by_transformer_name()[transformer_name].get_supported_sdtypes():
raise Error(
raise InvalidConfigError(
f"Invalid transformer name '{transformer_name}' for the '{sdtype}' sdtype.")

if transformer_parameters is not None:
transformer = get_class_by_transformer_name()[transformer_name]
valid = inspect.signature(transformer).parameters
invalid_parameters = {arg for arg in transformer_parameters if arg not in valid}
if invalid_parameters:
raise Error(
raise TransformerInputError(
f'Invalid parameters {tuple(sorted(invalid_parameters))} '
f"for the '{transformer_name}'."
)
Expand Down Expand Up @@ -355,7 +361,7 @@ def update_sdtypes(self, column_name_to_sdtype):
Dict mapping column names to ``sdtypes`` for that column.
"""
if len(self.field_sdtypes) == 0:
raise Error(self._DETECT_CONFIG_MESSAGE)
raise ConfigNotSetError(self._DETECT_CONFIG_MESSAGE)

update_columns = column_name_to_sdtype.keys()
self._validate_update_columns(update_columns)
Expand Down Expand Up @@ -394,7 +400,7 @@ def update_transformers(self, column_name_to_transformer):
warnings.warn(self._REFIT_MESSAGE)

if len(self.field_transformers) == 0:
raise Error(self._DETECT_CONFIG_MESSAGE)
raise ConfigNotSetError(self._DETECT_CONFIG_MESSAGE)

update_columns = column_name_to_transformer.keys()
self._validate_update_columns(update_columns)
Expand All @@ -404,7 +410,7 @@ def update_transformers(self, column_name_to_transformer):
if transformer is not None:
current_sdtype = self.field_sdtypes.get(column_name)
if current_sdtype and current_sdtype not in transformer.get_supported_sdtypes():
raise TransformerInputError(
raise InvalidConfigError(
f"Column '{column_name}' is a {current_sdtype} column, which is "
f"incompatible with the '{transformer.get_name()}' transformer."
)
Expand All @@ -429,7 +435,7 @@ def remove_transformers(self, column_names):
unknown_columns.append(column_name)

if unknown_columns:
raise Error(
raise InvalidConfigError(
f'Invalid column names: {unknown_columns}. These columns do not exist in the '
"config. Use 'get_config()' to see the expected values."
)
Expand All @@ -451,7 +457,7 @@ def remove_transformers_by_sdtype(self, sdtype):
Semantic data type for the transformers to be removed.
"""
if sdtype not in self._get_supported_sdtypes():
raise Error(
raise InvalidConfigError(
f"Invalid sdtype '{sdtype}'. If you are trying to use a premium sdtype, "
'contact [email protected] about RDT Add-Ons.'
)
Expand Down Expand Up @@ -569,7 +575,7 @@ def _validate_all_fields_fitted(self):

def _validate_config_exists(self):
if len(self.field_sdtypes) == 0 and len(self.field_transformers) == 0:
raise Error(
raise ConfigNotSetError(
"No config detected. Set the config using 'set_config' or pre-populate "
"it automatically from your data using 'detect_initial_config' prior to "
'fitting your data.'
Expand All @@ -583,7 +589,7 @@ def _validate_detect_config_called(self, data):
unknown_columns = self._subset(data.columns, fields, not_in=True)
if unknown_columns or missing:
unknown_text = f' (unknown columns: {unknown_columns})' if unknown_columns else ''
raise Error(
raise InvalidDataError(
'The data you are trying to fit has different columns than the original '
f'detected data{unknown_text}. Column names and their '
"sdtypes must be the same. Use the method 'get_config()' to see the expected "
Expand Down Expand Up @@ -620,14 +626,14 @@ def _transform(self, data, prevent_subset):
contained = all(column in self._input_columns for column in data.columns)
is_subset = contained and len(data.columns) < len(self._input_columns)
if unknown_columns or is_subset:
raise Error(
raise InvalidDataError(
'The data you are trying to transform has different columns than the original '
'data. Column names and their sdtypes must be the same. Use the method '
"'get_config()' to see the expected values."
)

elif unknown_columns:
raise Error(
raise InvalidDataError(
'Unexpected column names in the data you are trying to transform: '
f"{unknown_columns}. Use 'get_config()' to see the acceptable column names."
)
Expand Down Expand Up @@ -702,11 +708,11 @@ def create_anonymized_columns(self, num_rows, column_names):
self._validate_fitted()

if not isinstance(num_rows, int) or num_rows <= 0:
raise Error("Parameter 'num_rows' must be an integer greater than 0.")
raise ValueError("Parameter 'num_rows' must be an integer greater than 0.")

unknown_columns = self._subset(column_names, self._input_columns, not_in=True)
if unknown_columns:
raise Error(
raise InvalidConfigError(
f"Unknown column name {unknown_columns}. Use 'get_config()' to see a "
'list of valid column names.'
)
Expand All @@ -715,7 +721,7 @@ def create_anonymized_columns(self, num_rows, column_names):
for column_name in column_names:
transformer = self.field_transformers.get(column_name)
if not transformer.is_generator():
raise Error(
raise TransformerProcessingError(
f"Column '{column_name}' cannot be anonymized. All columns must be assigned "
"to 'AnonymizedFaker', 'RegexGenerator' or other ``generator``. Use "
"'get_config()' to see the current transformer assignments."
Expand All @@ -735,7 +741,7 @@ def _reverse_transform(self, data, prevent_subset):

unknown_columns = self._subset(data.columns, self._output_columns, not_in=True)
if unknown_columns:
raise Error(
raise InvalidDataError(
'There are unexpected column names in the data you are trying to transform. '
f'A reverse transform is not defined for {unknown_columns}.'
)
Expand All @@ -744,7 +750,7 @@ def _reverse_transform(self, data, prevent_subset):
contained = all(column in self._output_columns for column in data.columns)
is_subset = contained and len(data.columns) < len(self._output_columns)
if is_subset:
raise Error(
raise InvalidDataError(
'You must provide a transformed dataset with all the columns from the '
'original data.'
)
Expand Down
14 changes: 9 additions & 5 deletions rdt/transformers/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import psutil
from scipy.stats import norm

from rdt.errors import Error
from rdt.errors import TransformerInputError
from rdt.transformers.base import BaseTransformer


Expand Down Expand Up @@ -444,7 +444,7 @@ def __init__(self, add_noise=False, order_by=None):
super().__init__()
self.add_noise = add_noise
if order_by not in [None, 'alphabetical', 'numerical_value']:
raise Error(
raise TransformerInputError(
"order_by must be one of the following values: None, 'numerical_value' or "
"'alphabetical'"
)
Expand All @@ -454,11 +454,15 @@ def __init__(self, add_noise=False, order_by=None):
def _order_categories(self, unique_data):
if self.order_by == 'alphabetical':
if unique_data.dtype.type not in [np.str_, np.object_]:
raise Error("The data must be of type string if order_by is 'alphabetical'.")
raise TransformerInputError(
"The data must be of type string if order_by is 'alphabetical'."
)

elif self.order_by == 'numerical_value':
if not np.issubdtype(unique_data.dtype.type, np.number):
raise Error("The data must be numerical if order_by is 'numerical_value'.")
raise TransformerInputError(
"The data must be numerical if order_by is 'numerical_value'."
)

if self.order_by is not None:
nans = pd.isna(unique_data)
Expand Down Expand Up @@ -576,7 +580,7 @@ def _fit(self, data):
data = data.fillna(np.nan)
missing = list(data[~data.isin(self.order)].unique())
if len(missing) > 0:
raise Error(
raise TransformerInputError(
f"Unknown categories '{missing}'. All possible categories must be defined in the "
"'order' parameter."
)
Expand Down
16 changes: 8 additions & 8 deletions rdt/transformers/pii/anonymizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import faker
import numpy as np

from rdt.errors import Error
from rdt.errors import TransformerInputError, TransformerProcessingError
from rdt.transformers.base import BaseTransformer
from rdt.transformers.categorical import LabelEncoder

Expand All @@ -32,7 +32,7 @@ class AnonymizedFaker(BaseTransformer):
List of localized providers to use instead of the global provider.
enforce_uniqueness (bool):
Whether or not to ensure that the new anonymized data is all unique. If it isn't
possible to create the requested number of rows, then an ``Error`` will be raised.
possible to create the requested number of rows, then an error will be raised.
Defaults to ``False``.
"""

Expand Down Expand Up @@ -60,7 +60,7 @@ def check_provider_function(provider_name, function_name):
getattr(provider, function_name)

except AttributeError as exception:
raise Error(
raise TransformerProcessingError(
f"The '{provider_name}' module does not contain a function named "
f"'{function_name}'.\nRefer to the Faker docs to find the correct function: "
'https://faker.readthedocs.io/en/master/providers.html'
Expand Down Expand Up @@ -90,7 +90,7 @@ def __init__(self, provider_name=None, function_name=None, function_kwargs=None,
self.enforce_uniqueness = enforce_uniqueness
self.provider_name = provider_name if provider_name else 'BaseProvider'
if self.provider_name != 'BaseProvider' and function_name is None:
raise Error(
raise TransformerInputError(
'Please specify the function name to use from the '
f"'{self.provider_name}' provider."
)
Expand Down Expand Up @@ -153,7 +153,7 @@ def _reverse_transform(self, data):
for _ in range(sample_size)
], dtype=object)
except faker.exceptions.UniquenessException as exception:
raise Error(
raise TransformerProcessingError(
f'The Faker function you specified is not able to generate {sample_size} unique '
'values. Please use a different Faker function for column '
f"('{self.get_input_column()}')."
Expand Down Expand Up @@ -249,7 +249,7 @@ def _fit(self, columns_data):
try:
generated_values = [self._function() for _ in range(unique_data_length)]
except faker.exceptions.UniquenessException as exception:
raise Error(
raise TransformerProcessingError(
'The Faker function you specified is not able to generate '
f'{unique_data_length} unique values. Please use a different '
'Faker function for this column.'
Expand All @@ -263,7 +263,7 @@ def _transform(self, columns_data):
"""Replace each category with a numerical representation.

Map the input ``columns_data`` using the previously generated values for each one.
If the ``columns_data`` contain unknown values, a ``Error`` will be raised with the
If the ``columns_data`` contain unknown values, an error will be raised with the
unknown categories.

Args:
Expand Down Expand Up @@ -293,7 +293,7 @@ def _transform(self, columns_data):
'using this new data.'
)

raise Error(error_msg)
raise TransformerProcessingError(error_msg)

mapped_data = columns_data.map(self._mapping_dict)
return mapped_data
Expand Down
Loading