-
Notifications
You must be signed in to change notification settings - Fork 16
Reduce # of clients connected to the application db [#810] #944
Changes from all commits
70d0f79
41a6800
feca30f
cf6d79b
c9e2ec4
88504cd
d38ef2b
b39c363
4f7f5f2
bf06332
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,8 @@ | ||
import logging | ||
from datetime import datetime, timedelta | ||
from typing import Dict, List, Optional, Set | ||
from typing import ContextManager, Dict, List, Optional, Set | ||
|
||
from celery import Task | ||
from celery.utils.log import get_task_logger | ||
from fideslib.db.session import get_db_session | ||
from pydantic import ValidationError | ||
|
@@ -150,8 +151,22 @@ def queue_privacy_request( | |
return task.task_id | ||
|
||
|
||
@celery_app.task() | ||
class DatabaseTask(Task): # pylint: disable=W0223 | ||
_session = None | ||
|
||
@property | ||
def session(self) -> ContextManager[Session]: | ||
"""Creates Session once per process""" | ||
if self._session is None: | ||
SessionLocal = get_db_session(config) | ||
self._session = SessionLocal() | ||
|
||
return self._session | ||
|
||
|
||
@celery_app.task(base=DatabaseTask, bind=True) | ||
def run_privacy_request( | ||
self: DatabaseTask, | ||
privacy_request_id: str, | ||
from_webhook_id: Optional[str] = None, | ||
from_step: Optional[str] = None, | ||
|
@@ -169,8 +184,7 @@ def run_privacy_request( | |
# can't be passed into and between tasks | ||
from_step = PausedStep(from_step) # type: ignore | ||
|
||
SessionLocal = get_db_session(config) | ||
with SessionLocal() as session: | ||
with self.session as session: | ||
|
||
privacy_request = PrivacyRequest.get(db=session, object_id=privacy_request_id) | ||
if privacy_request.status == PrivacyRequestStatus.canceled: | ||
|
@@ -190,7 +204,6 @@ def run_privacy_request( | |
after_webhook_id=from_webhook_id, | ||
) | ||
if not proceed: | ||
session.close() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These |
||
return | ||
|
||
policy = privacy_request.policy | ||
|
@@ -217,6 +230,7 @@ def run_privacy_request( | |
graph=dataset_graph, | ||
connection_configs=connection_configs, | ||
identity=identity_data, | ||
session=session, | ||
) | ||
|
||
upload_access_results( | ||
|
@@ -238,19 +252,18 @@ def run_privacy_request( | |
access_request_data=get_cached_data_for_erasures( | ||
privacy_request.id | ||
), | ||
session=session, | ||
) | ||
|
||
except PrivacyRequestPaused as exc: | ||
privacy_request.pause_processing(session) | ||
_log_warning(exc, config.dev_mode) | ||
session.close() | ||
return | ||
|
||
except BaseException as exc: # pylint: disable=broad-except | ||
privacy_request.error_processing(db=session) | ||
# If dev mode, log traceback | ||
_log_exception(exc, config.dev_mode) | ||
session.close() | ||
return | ||
|
||
# Run post-execution webhooks | ||
|
@@ -260,14 +273,12 @@ def run_privacy_request( | |
webhook_cls=PolicyPostWebhook, # type: ignore | ||
) | ||
if not proceed: | ||
session.close() | ||
return | ||
|
||
privacy_request.finished_processing_at = datetime.utcnow() | ||
privacy_request.status = PrivacyRequestStatus.complete | ||
privacy_request.save(db=session) | ||
logging.info(f"Privacy request {privacy_request.id} run completed.") | ||
session.close() | ||
|
||
|
||
def initiate_paused_privacy_request_followup(privacy_request: PrivacyRequest) -> None: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,6 +8,7 @@ | |
|
||
import dask | ||
from dask.threaded import get | ||
from sqlalchemy.orm import Session | ||
|
||
from fidesops.common_exceptions import CollectionDisabled, PrivacyRequestPaused | ||
from fidesops.core.config import config | ||
|
@@ -568,10 +569,13 @@ def run_access_request( | |
graph: DatasetGraph, | ||
connection_configs: List[ConnectionConfig], | ||
identity: Dict[str, Any], | ||
session: Session, | ||
) -> Dict[str, List[Row]]: | ||
"""Run the access request""" | ||
traversal: Traversal = Traversal(graph, identity) | ||
with TaskResources(privacy_request, policy, connection_configs) as resources: | ||
with TaskResources( | ||
privacy_request, policy, connection_configs, session | ||
) as resources: | ||
Comment on lines
+576
to
+578
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We're passing in the Session here to TaskResources so we can use it to create the ExecutionLogs |
||
|
||
def collect_tasks_fn( | ||
tn: TraversalNode, data: Dict[CollectionAddress, GraphTask] | ||
|
@@ -636,17 +640,20 @@ def update_erasure_mapping_from_cache( | |
) | ||
|
||
|
||
def run_erasure( # pylint: disable = too-many-arguments | ||
def run_erasure( # pylint: disable = too-many-arguments, too-many-locals | ||
privacy_request: PrivacyRequest, | ||
policy: Policy, | ||
graph: DatasetGraph, | ||
connection_configs: List[ConnectionConfig], | ||
identity: Dict[str, Any], | ||
access_request_data: Dict[str, List[Row]], | ||
session: Session, | ||
) -> Dict[str, int]: | ||
"""Run an erasure request""" | ||
traversal: Traversal = Traversal(graph, identity) | ||
with TaskResources(privacy_request, policy, connection_configs) as resources: | ||
with TaskResources( | ||
privacy_request, policy, connection_configs, session | ||
) as resources: | ||
|
||
def collect_tasks_fn( | ||
tn: TraversalNode, data: Dict[CollectionAddress, GraphTask] | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,10 +1,9 @@ | ||
import logging | ||
from typing import Any, Dict, List, Optional | ||
|
||
from fideslib.db.session import get_db_session | ||
from sqlalchemy.orm import Session | ||
|
||
from fidesops.common_exceptions import ConnectorNotFoundException | ||
from fidesops.core.config import config | ||
from fidesops.graph.config import CollectionAddress | ||
from fidesops.models.connectionconfig import ConnectionConfig, ConnectionType | ||
from fidesops.models.policy import ActionType, Policy | ||
|
@@ -97,6 +96,7 @@ def __init__( | |
request: PrivacyRequest, | ||
policy: Policy, | ||
connection_configs: List[ConnectionConfig], | ||
session: Session, | ||
): | ||
self.request = request | ||
self.policy = policy | ||
|
@@ -106,6 +106,7 @@ def __init__( | |
c.key: c for c in connection_configs | ||
} | ||
self.connections = Connections() | ||
self.session = session | ||
|
||
def __enter__(self) -> "TaskResources": | ||
"""Support 'with' usage for closing resources""" | ||
|
@@ -157,8 +158,7 @@ def write_execution_log( # pylint: disable=too-many-arguments | |
message: str = None, | ||
) -> Any: | ||
"""Store in application db. Return the created or written-to id field value.""" | ||
SessionLocal = get_db_session(config) | ||
db = SessionLocal() | ||
db = self.session | ||
Comment on lines
-160
to
+161
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We're using the same Session that is being used for the PrivacyRequest, not creating a new one, which reduces the number of connections we're opening. It also has a side effect of causing other resources bound to the Session for the PrivacyRequest to get the most recent state when the ExecutionLog is saved, because we commit the Session. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think the main implication here is with request cancellation? ie. if There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, I think this is a good example of where we can better make decisions mid-traversal based on the resources's current state in the future. |
||
|
||
ExecutionLog.create( | ||
db=db, | ||
|
@@ -172,7 +172,6 @@ def write_execution_log( # pylint: disable=too-many-arguments | |
"message": message, | ||
}, | ||
) | ||
db.close() | ||
|
||
def get_connector(self, key: FidesOpsKey) -> Any: | ||
"""Create or return the client corresponding to the given ConnectionConfig key""" | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -48,7 +48,7 @@ def _create_celery(config_path: str = config.execution.celery_config_path) -> Ce | |
|
||
def start_worker() -> None: | ||
logger.info("Running Celery worker...") | ||
celery_app.worker_main(argv=["worker", "--loglevel=info"]) | ||
celery_app.worker_main(argv=["worker", "--loglevel=info", "--concurrency=2"]) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This could be configurable in the future, but this matches our state before the move to Celery. The default is the number of cores on your machine, but limiting the number of PrivacyRequests that can be run simultaneously per worker has a lot of positive effects. This includes reducing the number of simultaneous connections on the application database, as well as the customers' owned databases, and reduces simultaneous requests against their connected API's. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks, I've added a ticket to be addressed as a follow-up. |
||
|
||
|
||
if __name__ == "__main__": | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This allows us to share the Session across a Celery process.
This also limits the number of times
get_db_session
is being called per process which was creating new Engines which were opening up new Connection Pools whose connections weren't being reused.