Better error message when experiment is terminated by user #10158

Open · wants to merge 1 commit into base: main
46 changes: 32 additions & 14 deletions src/ert/run_models/base_run_model.py
@@ -94,6 +94,10 @@ def delete_runpath(run_path: str) -> None:
shutil.rmtree(run_path)


class UserCancelled(Exception):
pass


class _LogAggregration(logging.Handler):
def __init__(self, messages: MutableSequence[str]) -> None:
self.messages = messages
@@ -568,10 +572,11 @@ async def run_monitor(
logger.debug(
"observed evaluation cancelled event, exit drainer"
)
# Allow track() to emit an EndEvent.
return False
raise UserCancelled(
"Experiment cancelled by user during evaluation"
)
elif type(event) is EETerminated:
logger.debug("got terminator event")
logger.debug("got terminated event")

if not self._end_queue.empty():
logger.debug("Run model canceled - during evaluation")
@@ -580,7 +585,9 @@
logger.debug(
"Run model canceled - during evaluation - cancel sent"
)
except BaseException as e:
except UserCancelled:
raise
except Exception as e:
logger.exception(f"unexpected error: {e}")
# We really don't know what happened... shut down
# the thread and get out of here. The monitor has
@@ -598,7 +605,8 @@ async def run_ensemble_evaluator_async(
if not self._end_queue.empty():
logger.debug("Run model canceled - pre evaluation")
self._end_queue.get()
return []
raise UserCancelled("Experiment cancelled by user in pre evaluation")
Review comment (Contributor):
Should we have tests for these different scenarios? (cancelling pre evaluation, during evaluation, and post evaluation)
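
A minimal, self-contained sketch of what the pre-evaluation case could look like. `FakeRunModel` and the queue wiring are hypothetical stand-ins for the real `BaseRunModel` test fixtures, not ert's actual test setup:

```python
import asyncio
import queue

import pytest


class UserCancelled(Exception):
    pass


class FakeRunModel:
    """Hypothetical stand-in for BaseRunModel, reduced to the pre-evaluation check."""

    def __init__(self) -> None:
        self._end_queue: queue.SimpleQueue = queue.SimpleQueue()

    async def run_ensemble_evaluator_async(self) -> list[int]:
        # Mirrors the check added in this PR: a queued end event means the
        # user asked to terminate before evaluation started.
        if not self._end_queue.empty():
            self._end_queue.get()
            raise UserCancelled("Experiment cancelled by user in pre evaluation")
        return []


def test_cancel_before_evaluation_raises_user_cancelled() -> None:
    model = FakeRunModel()
    model._end_queue.put("END")  # simulate the user pressing terminate
    with pytest.raises(UserCancelled, match="pre evaluation"):
        asyncio.run(model.run_ensemble_evaluator_async())
```

The during- and post-evaluation scenarios would presumably follow the same pattern, putting an item on `_end_queue` only after the evaluator task has started.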


ee_ensemble = self._build_ensemble(run_args, ensemble.experiment_id)
evaluator = EnsembleEvaluator(
ee_ensemble,
@@ -619,8 +627,14 @@ async def run_ensemble_evaluator_async(
if not self._end_queue.empty():
logger.debug("Run model canceled - post evaluation")
self._end_queue.get()
await evaluator_task
return []
try:
await evaluator_task
except Exception as e:
raise Exception(
"Exception occurred during user-initiated termination of experiment"
) from e
raise UserCancelled("Experiment cancelled by user in post evaluation")

await evaluator_task
ensemble.refresh_ensemble_state()

@@ -634,10 +648,9 @@ def run_ensemble_evaluator(
ensemble: Ensemble,
ee_config: EvaluatorServerConfig,
) -> list[int]:
successful_realizations = asyncio.run(
return asyncio.run(
self.run_ensemble_evaluator_async(run_args, ensemble, ee_config)
)
return successful_realizations

def _build_ensemble(
self,
@@ -757,11 +770,16 @@ def _evaluate_and_postprocess(
"run_paths": self.run_paths,
},
)
successful_realizations = self.run_ensemble_evaluator(
run_args,
ensemble,
evaluator_server_config,
)
try:
successful_realizations = self.run_ensemble_evaluator(
run_args,
ensemble,
evaluator_server_config,
)
except UserCancelled:
self.active_realizations = [False for _ in self.active_realizations]
raise

starting_realizations = [real.iens for real in run_args if real.active]
failed_realizations = list(
set(starting_realizations) - set(successful_realizations)
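
For context on how the new exception might be consumed downstream, a sketch of the caller side. This is not ert's actual CLI or GUI code; `drive` and `run_experiment` are hypothetical placeholders. Catching `UserCancelled` lets a front end report a deliberate stop instead of logging it as an unexpected error:

```python
# Sketch only: hypothetical caller showing how UserCancelled could be handled.
from ert.run_models.base_run_model import UserCancelled  # class added by this PR


def drive(run_model) -> int:
    try:
        # Hypothetical placeholder for whatever actually starts the experiment.
        run_model.run_experiment()
    except UserCancelled as err:
        # Deliberate stop by the user: short message, no traceback, clean exit.
        print(f"Experiment stopped: {err}")
        return 0
    except Exception as err:
        # Anything else is still treated as a real failure.
        print(f"Experiment failed: {err}")
        return 1
    return 0
```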