This repository was archived by the owner on Sep 12, 2024. It is now read-only.

DeepDrills Decoupling and Restructuring #967

Merged
7 commits merged on May 31, 2022
18 changes: 11 additions & 7 deletions .env
@@ -16,6 +16,10 @@ TIMEZONE=UTC
# Synctime for your metadata
METADATA_SYNC_TIME=03:00

### KPI Configuration
# Sets the maximum number of rows allowed for a KPI to be added.
MAX_ROWS_IN_KPI=10000000

### Anomaly Configuration
# Enables the generation of multi-dimensional subgroups.
MULTIDIM_ANALYSIS_FOR_ANOMALY=False
@@ -32,19 +36,19 @@ MAX_FILTER_SUBGROUPS_ANOMALY=250
# Sets the maximum number of days for which we can have no data and still consider the KPI for Anomaly Detection.
MAX_ANOMALY_SLACK_DAYS=14

### DeepDrills Configuration
# Sets the maximum number of rows allowed for a KPI to be added.
MAX_ROWS_FOR_DEEPDRILLS=10000000
# Sets the maximum number of days for which we can have no data and still consider the KPI for DeepDrills.
MAX_DEEPDRILLS_SLACK_DAYS=14
### Summary and DeepDrills Configuration
# Sets the maximum number of days for which we can have no data and still consider the KPI for Summary and DeepDrills.
MAX_SUMMARY_DEEPDRILLS_SLACK_DAYS=14
# Sets the enabled time ranges for which Summary and DeepDrills is computed as comma separated values.
SUMMARY_DEEPDRILLS_ENABLED_TIME_RANGES=last_30_days,last_7_days,previous_day,month_on_month,month_to_date,week_on_week,week_to_date
# Enables or disables DeepDrills.
DEEPDRILLS_ENABLED=False
# Sets the maximum number of rows in the first level of the DeepDrills' drilldowns.
DEEPDRILLS_HTABLE_MAX_PARENTS=5
# Sets the maximum number of rows in the subsequent levels of the DeepDrills' drilldowns.
DEEPDRILLS_HTABLE_MAX_CHILDREN=5
# Sets the maximum depth of the drilldowns in DeepDrills.
DEEPDRILLS_HTABLE_MAX_DEPTH=3
# Sets the enabled time ranges for which DeepDrills is computed as comma separated values.
DEEPDRILLS_ENABLED_TIME_RANGES=last_30_days,last_7_days,previous_day,month_on_month,month_to_date,week_on_week,week_to_date

## Sentry Logging (leave empty to disable backend telemetry)
SENTRY_DSN=
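
Taken together, these .env changes rename the row limit to a KPI-level setting (MAX_ROWS_IN_KPI), fold the slack-days and time-range settings under a shared Summary/DeepDrills prefix, and introduce DEEPDRILLS_ENABLED as a global switch. Below is a minimal sketch of how such keys could be parsed at startup; the real parsing lives in chaos_genius/settings.py, which this diff does not show, so the helper names here are hypothetical:

```python
import os


def _bool_env(name: str, default: str = "False") -> bool:
    """Read a boolean flag such as DEEPDRILLS_ENABLED from the environment."""
    return os.getenv(name, default).strip().lower() in ("true", "1", "yes")


def _csv_env(name: str, default: str = "") -> list:
    """Read a comma-separated list such as SUMMARY_DEEPDRILLS_ENABLED_TIME_RANGES."""
    return [part.strip() for part in os.getenv(name, default).split(",") if part.strip()]


MAX_ROWS_IN_KPI = int(os.getenv("MAX_ROWS_IN_KPI", "10000000"))
MAX_SUMMARY_DEEPDRILLS_SLACK_DAYS = int(
    os.getenv("MAX_SUMMARY_DEEPDRILLS_SLACK_DAYS", "14")
)
DEEPDRILLS_ENABLED = _bool_env("DEEPDRILLS_ENABLED")
SUMMARY_DEEPDRILLS_ENABLED_TIME_RANGES = _csv_env(
    "SUMMARY_DEEPDRILLS_ENABLED_TIME_RANGES",
    "last_30_days,last_7_days,previous_day",
)
```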
7 changes: 4 additions & 3 deletions .env.local.template
@@ -42,13 +42,14 @@ MAX_SUBDIM_CARDINALITY=1000
TOP_DIMENSIONS_FOR_ANOMALY_DRILLDOWN=10
MIN_DATA_IN_SUBGROUP=30
TOP_SUBDIMENSIONS_FOR_ANOMALY=10
MAX_ROWS_FOR_DEEPDRILLS=10000000
MAX_ROWS_IN_KPI=10000000
MAX_FILTER_SUBGROUPS_ANOMALY=250
MAX_DEEPDRILLS_SLACK_DAYS=14
MAX_SUMMARY_DEEPDRILLS_SLACK_DAYS=14
MAX_ANOMALY_SLACK_DAYS=14
DAYS_OFFSET_FOR_ANALTYICS=2

SUMMARY_DEEPDRILLS_ENABLED_TIME_RANGES=last_30_days,last_7_days,previous_day
DEEPDRILLS_ENABLED=False
DEEPDRILLS_HTABLE_MAX_PARENTS=5
DEEPDRILLS_HTABLE_MAX_CHILDREN=5
DEEPDRILLS_HTABLE_MAX_DEPTH=3
DEEPDRILLS_ENABLED_TIME_RANGES=last_30_days,last_7_days,previous_day
2 changes: 2 additions & 0 deletions chaos_genius/app.py
@@ -22,6 +22,7 @@
status_view,
digest_view,
rca_view,
summary_view
)
from chaos_genius.extensions import (
bcrypt,
@@ -91,6 +92,7 @@ def register_blueprints(app):
app.register_blueprint(meta_view.blueprint, url_prefix='/api/meta')
app.register_blueprint(digest_view.blueprint, url_prefix='/api/digest')
app.register_blueprint(rca_view.blueprint, url_prefix='/api/rca')
app.register_blueprint(summary_view.blueprint, url_prefix='/api/summary')
return None
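
app.py wires the new blueprint under /api/summary. The summary_view module itself is not part of this diff, so the sketch below is only a hypothetical, minimally compatible blueprint; the actual routes and payloads may differ:

```python
from flask import Blueprint, jsonify

blueprint = Blueprint("summary", __name__)


@blueprint.route("/<int:kpi_id>", methods=["GET"])
def kpi_summary(kpi_id: int):
    """Return summary data for a single KPI (placeholder payload)."""
    return jsonify({"kpi_id": kpi_id, "status": "ok"})
```

With the registration above, this route would be served at GET /api/summary/<kpi_id>.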


62 changes: 46 additions & 16 deletions chaos_genius/controllers/kpi_controller.py
@@ -4,25 +4,34 @@

from sqlalchemy import delete

from chaos_genius.controllers.task_monitor import checkpoint_failure, checkpoint_success
from chaos_genius.controllers.task_monitor import (
checkpoint_failure,
checkpoint_success,
)
from chaos_genius.core.anomaly.controller import AnomalyDetectionController
from chaos_genius.core.rca.constants import TIME_RANGES_BY_KEY
from chaos_genius.core.rca.rca_controller import RootCauseAnalysisController
from chaos_genius.core.utils.data_loader import DataLoader
from chaos_genius.core.utils.round import round_number
from chaos_genius.databases.models.anomaly_data_model import AnomalyDataOutput
from chaos_genius.databases.models.kpi_model import Kpi
from chaos_genius.databases.models.rca_data_model import RcaData
from chaos_genius.extensions import db
from chaos_genius.settings import DAYS_OFFSET_FOR_ANALTYICS, MAX_DEEPDRILLS_SLACK_DAYS
from chaos_genius.settings import (
DAYS_OFFSET_FOR_ANALTYICS,
MAX_SUMMARY_DEEPDRILLS_SLACK_DAYS,
)

logger = logging.getLogger(__name__)


def _is_data_present_for_end_date(kpi_info: dict, end_date: date = None) -> bool:
def _is_data_present_for_end_date(
kpi_info: dict, end_date: date = None
) -> bool:
if end_date is None:
end_date = datetime.now().date()
df_count = DataLoader(kpi_info, end_date=end_date, days_before=0).get_count()
df_count = DataLoader(
kpi_info, end_date=end_date, days_before=0
).get_count()
return df_count != 0


@@ -56,16 +65,22 @@ def run_anomaly_for_kpi(

logger.info("(KPI ID: {kpi_id}) Selecting end date.")

if end_date is None and kpi_info["scheduler_params"]["scheduler_frequency"] == "D":
if (
end_date is None
and kpi_info["scheduler_params"]["scheduler_frequency"] == "D"
):
# by default we always calculate for n-days_offset_for_analytics
end_date = datetime.today().date() - timedelta(days=(DAYS_OFFSET_FOR_ANALTYICS))
end_date = datetime.today().date() - timedelta(
days=(DAYS_OFFSET_FOR_ANALTYICS)
)
# Check if data is available or not then try for n-days_offset_for_analytics-1
if not _is_data_present_for_end_date(kpi_info, end_date):
end_date = end_date - timedelta(days=1)
logger.info("(KPI ID: {kpi_id}) Decreasing end date by 1.")

elif (
end_date is None and kpi_info["scheduler_params"]["scheduler_frequency"] == "H"
end_date is None
and kpi_info["scheduler_params"]["scheduler_frequency"] == "H"
):
end_date = datetime.today().date()
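
A worked example of the end-date selection above, assuming the default DAYS_OFFSET_FOR_ANALTYICS=2 from the templates (the typo is in the real variable name); hourly KPIs skip the offset and use today's date:

```python
from datetime import date, timedelta

DAYS_OFFSET_FOR_ANALTYICS = 2  # default from the .env templates

today = date(2022, 5, 31)
end_date = today - timedelta(days=DAYS_OFFSET_FOR_ANALTYICS)  # date(2022, 5, 29)
# If 2022-05-29 has no data yet, the controller retries one day earlier:
fallback = end_date - timedelta(days=1)                       # date(2022, 5, 28)
```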

@@ -79,16 +94,19 @@
def _get_end_date_for_rca_kpi(kpi_info: dict, end_date: date = None) -> date:
# by default we always calculate for n-1
if end_date is None:
end_date = datetime.today().date() - timedelta(days=(DAYS_OFFSET_FOR_ANALTYICS))
end_date = datetime.today().date() - timedelta(
days=(DAYS_OFFSET_FOR_ANALTYICS)
)

count = 0
while not _is_data_present_for_end_date(kpi_info, end_date):
logger.info(f"Checking for end date: {end_date}.")
end_date = end_date - timedelta(days=1)
count += 1
if count > MAX_DEEPDRILLS_SLACK_DAYS:
if count > MAX_SUMMARY_DEEPDRILLS_SLACK_DAYS:
raise ValueError(
f"KPI has no data for the last {MAX_DEEPDRILLS_SLACK_DAYS} days."
f"KPI has no data for the last "
f"{MAX_SUMMARY_DEEPDRILLS_SLACK_DAYS} days."
)

return end_date
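
The fallback loop above walks the end date back one day at a time until data is found, giving up after MAX_SUMMARY_DEEPDRILLS_SLACK_DAYS. A standalone sketch of the same logic, with the DataLoader lookup replaced by an injected predicate so it runs outside the app:

```python
from datetime import date, timedelta
from typing import Callable

MAX_SUMMARY_DEEPDRILLS_SLACK_DAYS = 14


def walk_back_to_data(end_date: date, has_data: Callable[[date], bool]) -> date:
    """Step end_date back one day at a time until has_data(end_date) is True."""
    count = 0
    while not has_data(end_date):
        end_date -= timedelta(days=1)
        count += 1
        if count > MAX_SUMMARY_DEEPDRILLS_SLACK_DAYS:
            raise ValueError(
                f"KPI has no data for the last "
                f"{MAX_SUMMARY_DEEPDRILLS_SLACK_DAYS} days."
            )
    return end_date


# Example: data exists only up to 2022-05-27, so a start of 2022-05-29
# resolves after two steps back and prints 2022-05-27.
print(walk_back_to_data(date(2022, 5, 29), lambda d: d <= date(2022, 5, 27)))
```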
@@ -113,10 +131,16 @@ def run_rca_for_kpi(
f"(Task: {task_id}, KPI: {kpi_id}) DeepDrills - Data Loader and Validation - Success",
)
except Exception as e: # noqa: B902
logger.error(f"Getting end date failed for KPI: {kpi_id}.", exc_info=e)
logger.error(
f"Getting end date failed for KPI: {kpi_id}.", exc_info=e
)
if task_id is not None:
checkpoint_failure(
task_id, kpi_id, "DeepDrills", "Data Loader and Validation", e
task_id,
kpi_id,
"DeepDrills",
"Data Loader and Validation",
e,
)
logger.error(
f"(Task: {task_id}, KPI: {kpi_id}) DeepDrills - Data Loader and Validation - Exception occured.",
Expand All @@ -131,13 +155,19 @@ def run_rca_for_kpi(
logger.info(f"Completed RCA for KPI ID: {kpi_id}.")

except Exception as e: # noqa: B902
logger.error(f"RCA encountered an error for KPI ID: {kpi_id}", exc_info=e)
logger.error(
f"RCA encountered an error for KPI ID: {kpi_id}", exc_info=e
)
if task_id is not None:
checkpoint_failure(task_id, kpi_id, "DeepDrills", "DeepDrills complete", e)
checkpoint_failure(
task_id, kpi_id, "DeepDrills", "DeepDrills complete", e
)
return False

if task_id is not None:
checkpoint_success(task_id, kpi_id, "DeepDrills", "DeepDrills complete")
checkpoint_success(
task_id, kpi_id, "DeepDrills", "DeepDrills complete"
)

return True

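
Throughout run_rca_for_kpi, each stage is bracketed by checkpoint_success/checkpoint_failure so the task monitor can pinpoint where a run failed. A reduced sketch of that pattern; the checkpoint functions here are stand-ins for the real ones in chaos_genius.controllers.task_monitor:

```python
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def checkpoint_success(task_id, kpi_id, workflow, stage):
    # Stand-in for chaos_genius.controllers.task_monitor.checkpoint_success.
    logger.info("(Task: %s, KPI: %s) %s - %s - Success", task_id, kpi_id, workflow, stage)


def checkpoint_failure(task_id, kpi_id, workflow, stage, exc):
    # Stand-in for chaos_genius.controllers.task_monitor.checkpoint_failure.
    logger.error(
        "(Task: %s, KPI: %s) %s - %s - Failure",
        task_id, kpi_id, workflow, stage, exc_info=exc,
    )


def run_stage(task_id, kpi_id, stage, fn):
    """Run one pipeline stage, recording the outcome against the task monitor."""
    try:
        result = fn()
    except Exception as e:  # noqa: B902
        if task_id is not None:
            checkpoint_failure(task_id, kpi_id, "DeepDrills", stage, e)
        return None
    if task_id is not None:
        checkpoint_success(task_id, kpi_id, "DeepDrills", stage)
    return result
```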
26 changes: 19 additions & 7 deletions chaos_genius/core/rca/rca_controller.py
@@ -9,7 +9,10 @@
import pandas as pd
from numpyencoder import NumpyEncoder

from chaos_genius.controllers.task_monitor import checkpoint_failure, checkpoint_success
from chaos_genius.controllers.task_monitor import (
checkpoint_failure,
checkpoint_success,
)
from chaos_genius.core.rca.constants import (
LINE_DATA_TIMESTAMP_FORMAT,
TIME_RANGES_BY_KEY,
@@ -21,10 +24,11 @@
from chaos_genius.databases.models.data_source_model import DataSource
from chaos_genius.databases.models.rca_data_model import RcaData, db
from chaos_genius.settings import (
DEEPDRILLS_ENABLED_TIME_RANGES,
DEEPDRILLS_ENABLED,
DEEPDRILLS_HTABLE_MAX_CHILDREN,
DEEPDRILLS_HTABLE_MAX_DEPTH,
DEEPDRILLS_HTABLE_MAX_PARENTS,
SUMMARY_DEEPDRILLS_ENABLED_TIME_RANGES,
)

logger = logging.getLogger(__name__)
@@ -53,9 +57,9 @@ def __init__(
self.kpi_info = kpi_info

# TODO: Make this connection type agnostic.
conn_type = DataSource.get_by_id(
kpi_info["data_source"]
).as_dict["connection_type"]
conn_type = DataSource.get_by_id(kpi_info["data_source"]).as_dict[
"connection_type"
]
self._preaggregated = conn_type == "Druid"
self._preaggregated_count_col = "count"

@@ -359,7 +363,7 @@ def compute(self):
raise e
logger.info("Line Data for KPI completed.")

for timeline in DEEPDRILLS_ENABLED_TIME_RANGES:
for timeline in SUMMARY_DEEPDRILLS_ENABLED_TIME_RANGES:
logger.info(f"Running RCA for timeline: {timeline}.")
try:
rca = self._load_rca_obj(timeline)
@@ -390,6 +394,14 @@ def compute(self):
self._checkpoint_failure(f"{timeline} Card Metrics", e)
continue

# Do not calculate DeepDrills if DEEPDRILLS_ENABLED is false.
if not DEEPDRILLS_ENABLED:
logger.info(
"DEEPDRILLS_ENABLED is False. Skipping DeepDrills."
)
self._checkpoint_success(f"{timeline} DeepDrills Calculation")
continue

# Do not calculate further if no dimensions are present
if not self.kpi_info.get("dimensions"):
logger.info(
@@ -457,7 +469,7 @@ def compute(self):
db.engine,
if_exists="append",
index=False,
chunksize=RcaData.__chunksize__
chunksize=RcaData.__chunksize__,
)
self._checkpoint_success("Output Storage")
except Exception as e: # noqa E722
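
The behavioural core of the decoupling is in compute(): line data and per-timeline card metrics (the Summary) are always produced, while the drilldown computation (DeepDrills) now runs only when DEEPDRILLS_ENABLED is true, and the skip path still records a success checkpoint so the task monitor stays green. A toy reduction of that control flow, with illustrative function names and messages:

```python
DEEPDRILLS_ENABLED = False  # mirrors the new settings flag


def compute_timeline(timeline: str) -> None:
    """Always compute summary card metrics; run drilldowns only when enabled."""
    print(f"{timeline}: card metrics computed")            # summary path, always
    if not DEEPDRILLS_ENABLED:
        print(f"{timeline}: DeepDrills skipped (flag off)")
        return
    print(f"{timeline}: drilldowns computed")              # DeepDrills path


for timeline in ["last_30_days", "last_7_days", "previous_day"]:
    compute_timeline(timeline)
```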
18 changes: 17 additions & 1 deletion chaos_genius/core/rca/rca_utils/api_utils.py
@@ -1,7 +1,8 @@
"""Utility functions for RCA API endpoints."""
import logging
from datetime import date, datetime
from datetime import date, datetime, timedelta
from typing import List
from chaos_genius.databases.models.anomaly_data_model import AnomalyDataOutput

from chaos_genius.extensions import db
from chaos_genius.controllers.kpi_controller import get_kpi_data_from_id
@@ -38,6 +39,16 @@ def kpi_aggregation(kpi_id, timeline="last_30_days"):
.first()
)

rca_end_date = data_point.end_date

anomaly_data_point = AnomalyDataOutput.query.filter(
(AnomalyDataOutput.kpi_id == kpi_id)
& (AnomalyDataOutput.anomaly_type == "overall")
& (AnomalyDataOutput.is_anomaly != 0)
& (AnomalyDataOutput.data_datetime <= rca_end_date + timedelta(days=1))
& (AnomalyDataOutput.data_datetime >= rca_end_date - timedelta(days=7))
).count()

if data_point:
analysis_date = get_analysis_date(kpi_id, end_date)
final_data = {
@@ -58,12 +69,17 @@ def kpi_aggregation(kpi_id, timeline="last_30_days"):
"label": "perc_change",
"value": data_point.data["perc_change"],
},
{
"label": "anomalous_points",
"value": anomaly_data_point,
},
],
"analysis_date": get_datetime_string_with_tz(analysis_date),
"timecuts_date": get_timecuts_dates(analysis_date, timeline),
"last_run_time_rca": get_lastscan_string_with_tz(
kpi_info["scheduler_params"]["last_scheduled_time_rca"]
),
"anomalous_points_str": "Last 7 Days",
}
else:
raise ValueError("No data found")
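
The new aggregation payload counts overall anomalous points whose data_datetime falls within roughly the week before the RCA end date (from rca_end_date - 7 days up to rca_end_date + 1 day, the extra day covering sub-daily points on the end date itself), surfaced alongside the "Last 7 Days" label. One ordering subtlety in the hunk above: rca_end_date is read from data_point before the `if data_point:` check, so a KPI with no RCA rows would raise AttributeError rather than reach the ValueError branch. A guarded variant as an illustrative sketch, following the model and filter names in the diff but not the merged code:

```python
from datetime import timedelta

from chaos_genius.databases.models.anomaly_data_model import AnomalyDataOutput


def count_recent_anomalies(kpi_id, data_point):
    """Count overall anomalous points in the week up to the RCA end date."""
    if data_point is None:
        raise ValueError("No data found")
    rca_end_date = data_point.end_date
    return AnomalyDataOutput.query.filter(
        (AnomalyDataOutput.kpi_id == kpi_id)
        & (AnomalyDataOutput.anomaly_type == "overall")
        & (AnomalyDataOutput.is_anomaly != 0)
        & (AnomalyDataOutput.data_datetime <= rca_end_date + timedelta(days=1))
        & (AnomalyDataOutput.data_datetime >= rca_end_date - timedelta(days=7))
    ).count()
```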