Fix two likely causes of infinitely running evaluations #1840

Merged · 10 commits · Jul 5, 2024
@@ -33,6 +33,7 @@ class EvaluationStatusEnum(str, Enum):
     EVALUATION_FINISHED = "EVALUATION_FINISHED"
     EVALUATION_FINISHED_WITH_ERRORS = "EVALUATION_FINISHED_WITH_ERRORS"
     EVALUATION_FAILED = "EVALUATION_FAILED"
+    EVALUATION_AGGREGATION_FAILED = "EVALUATION_AGGREGATION_FAILED"


 class EvaluationScenarioStatusEnum(str, Enum):
49 changes: 28 additions & 21 deletions agenta-backend/agenta_backend/services/aggregation_service.py
@@ -15,26 +15,33 @@ def aggregate_ai_critique(results: List[Result]) -> Result:
         Result: aggregated result
     """

-    numeric_scores = []
-    for result in results:
-        # Extract the first number found in the result value
-        match = re.search(r"\d+", result.value)
-        if match:
-            try:
-                score = int(match.group())
-                numeric_scores.append(score)
-            except ValueError:
-                # Ignore if the extracted value is not an integer
-                continue
-
-    # Calculate the average of numeric scores if any are present
-    average_value = (
-        sum(numeric_scores) / len(numeric_scores) if numeric_scores else None
-    )
-    return Result(
-        type="number",
-        value=average_value,
-    )
+    try:
+        numeric_scores = []
+        for result in results:
+            # Extract the first number found in the result value
+            match = re.search(r"\d+", result.value)
+            if match:
+                try:
+                    score = int(match.group())
+                    numeric_scores.append(score)
+                except ValueError:
+                    # Ignore if the extracted value is not an integer
+                    continue
+
+        # Calculate the average of numeric scores if any are present
+        average_value = (
+            sum(numeric_scores) / len(numeric_scores) if numeric_scores else None
+        )
+        return Result(
+            type="number",
+            value=average_value,
+        )
+    except Exception as exc:
+        return Result(
+            type="error",
+            value=None,
+            error=Error(message=str(exc), stacktrace=str(traceback.format_exc())),
+        )


 def aggregate_binary(results: List[Result]) -> Result:
@@ -71,7 +78,7 @@ def aggregate_float(results: List[Result]) -> Result:
         return Result(
             type="error",
             value=None,
-            error=Error(message="Failed", stacktrace=str(traceback.format_exc())),
+            error=Error(message=str(exc), stacktrace=str(traceback.format_exc())),
         )

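With the try/except in place, aggregate_ai_critique degrades to a typed error Result instead of raising, so a single malformed critique value can no longer crash the Celery task mid-aggregation. A minimal self-contained sketch of the same contract, using stand-in dataclasses for Result and Error (field names inferred from this diff, not agenta's real models):

```python
# Stand-in models and a simplified aggregator mirroring the change above.
# These dataclasses are illustrative substitutes for agenta's Result/Error.
import re
import traceback
from dataclasses import dataclass
from typing import Any, List, Optional


@dataclass
class Error:
    message: str
    stacktrace: Optional[str] = None


@dataclass
class Result:
    type: str
    value: Any = None
    error: Optional[Error] = None


def aggregate_ai_critique(results: List[Result]) -> Result:
    try:
        # Pull the first integer out of each critique string, e.g. "Score: 8".
        scores = [
            int(m.group()) for r in results if (m := re.search(r"\d+", r.value))
        ]
        average = sum(scores) / len(scores) if scores else None
        return Result(type="number", value=average)
    except Exception as exc:
        # Malformed input (e.g. value=None) now yields an error Result
        # instead of an unhandled exception inside the worker.
        return Result(
            type="error",
            value=None,
            error=Error(message=str(exc), stacktrace=traceback.format_exc()),
        )


print(aggregate_ai_critique([Result("text", "8/10"), Result("text", "Score: 7")]).value)  # 7.5
print(aggregate_ai_critique([Result("text", None)]).type)  # error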
70 changes: 48 additions & 22 deletions agenta-backend/agenta_backend/tasks/evaluations.py
@@ -329,7 +329,7 @@ def evaluate(
                     type="status",
                     value="EVALUATION_FAILED",
                     error=Error(
-                        message="Evaluation Failed !!!",
+                        message="Evaluation Failed",
                         stacktrace=str(traceback.format_exc()),
                     ),
                 )
@@ -339,35 +339,61 @@ def evaluate(
             self.update_state(state=states.FAILURE)
             return

-    aggregated_results = loop.run_until_complete(
-        aggregate_evaluator_results(app, evaluators_aggregated_data)
-    )
-    loop.run_until_complete(
-        update_evaluation_with_aggregated_results(
-            new_evaluation_db.id, aggregated_results
-        )
-    )
-
-    failed_evaluation_scenarios = loop.run_until_complete(
-        check_if_evaluation_contains_failed_evaluation_scenarios(new_evaluation_db.id)
-    )
-
-    evaluation_status = Result(
-        type="status", value=EvaluationStatusEnum.EVALUATION_FINISHED, error=None
-    )
-
-    if failed_evaluation_scenarios:
-        evaluation_status = Result(
-            type="status",
-            value=EvaluationStatusEnum.EVALUATION_FINISHED_WITH_ERRORS,
-            error=None,
-        )
-
-    loop.run_until_complete(
-        update_evaluation(
-            evaluation_id=new_evaluation_db.id, updates={"status": evaluation_status}
-        )
-    )
+    try:
+        aggregated_results = loop.run_until_complete(
+            aggregate_evaluator_results(app, evaluators_aggregated_data)
+        )
+
+        loop.run_until_complete(
+            update_evaluation_with_aggregated_results(
+                new_evaluation_db.id, aggregated_results
+            )
+        )
+
+        failed_evaluation_scenarios = loop.run_until_complete(
+            check_if_evaluation_contains_failed_evaluation_scenarios(
+                new_evaluation_db.id
+            )
+        )
+
+        evaluation_status = Result(
+            type="status", value=EvaluationStatusEnum.EVALUATION_FINISHED, error=None
+        )
+
+        if failed_evaluation_scenarios:
+            evaluation_status = Result(
+                type="status",
+                value=EvaluationStatusEnum.EVALUATION_FINISHED_WITH_ERRORS,
+                error=None,
+            )
+
+        loop.run_until_complete(
+            update_evaluation(
+                evaluation_id=new_evaluation_db.id,
+                updates={"status": evaluation_status},
+            )
+        )
+
+    except Exception as e:
+        logger.error(f"An error occurred during evaluation aggregation: {e}")
+        traceback.print_exc()
+        loop.run_until_complete(
+            update_evaluation(
+                evaluation_id,
+                {
+                    "status": Result(
+                        type="status",
+                        value="EVALUATION_AGGREGATION_FAILED",
+                        error=Error(
+                            message="Evaluation Aggregation Failed",
+                            stacktrace=str(traceback.format_exc()),
+                        ),
+                    )
+                },
+            )
+        )
+        self.update_state(state=states.FAILURE)
+        return


 async def aggregate_evaluator_results(
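This is the core of the fix named in the PR title: previously an exception anywhere in the aggregation phase escaped the task, the evaluation's stored status never left its running value, and the frontend polled forever. Now the task records the new EVALUATION_AGGREGATION_FAILED status and marks the Celery task failed before returning. A small sketch of the terminal-status guarantee this creates (status strings come from the diff; the helper and poller framing are a hypothetical illustration, not agenta code):

```python
# Hypothetical helper illustrating why a terminal status matters for polling.
TERMINAL_STATUSES = {
    "EVALUATION_FINISHED",
    "EVALUATION_FINISHED_WITH_ERRORS",
    "EVALUATION_FAILED",
    "EVALUATION_AGGREGATION_FAILED",  # new terminal state added by this PR
}


def is_still_running(status: str) -> bool:
    # Pre-PR, an aggregation crash left the stored status at a running value,
    # so a poller calling this would return True indefinitely.
    return status not in TERMINAL_STATUSES
```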
@@ -171,28 +171,42 @@ export const ResultRenderer = React.memo(
 )

 export const runningStatuses = [EvaluationStatus.INITIALIZED, EvaluationStatus.STARTED]
-export const statusMapper = (token: GlobalToken) => ({
-    [EvaluationStatus.INITIALIZED]: {
-        label: "Queued",
-        color: token.colorTextSecondary,
-    },
-    [EvaluationStatus.STARTED]: {
-        label: "Running",
-        color: token.colorWarning,
-    },
-    [EvaluationStatus.FINISHED]: {
-        label: "Completed",
-        color: token.colorSuccess,
-    },
-    [EvaluationStatus.ERROR]: {
-        label: "Failed",
-        color: token.colorError,
-    },
-    [EvaluationStatus.FINISHED_WITH_ERRORS]: {
-        label: "Completed with Errors",
-        color: token.colorWarning,
-    },
-})
+export const statusMapper = (token: GlobalToken) => (status: EvaluationStatus) => {
+    const statusMap = {
+        [EvaluationStatus.INITIALIZED]: {
+            label: "Queued",
+            color: token.colorTextSecondary,
+        },
+        [EvaluationStatus.STARTED]: {
+            label: "Running",
+            color: token.colorWarning,
+        },
+        [EvaluationStatus.FINISHED]: {
+            label: "Completed",
+            color: token.colorSuccess,
+        },
+        [EvaluationStatus.ERROR]: {
+            label: "Failed",
+            color: token.colorError,
+        },
+        [EvaluationStatus.FINISHED_WITH_ERRORS]: {
+            label: "Completed with Errors",
+            color: token.colorWarning,
+        },
+        [EvaluationStatus.AGGREGATION_FAILED]: {
+            label: "Result Aggregation Failed",
+            color: token.colorWarning,
+        },
+    }
+
+    return (
+        statusMap[status] || {
+            label: "Unknown",
+            color: "purple",
+        }
+    )
+}

 export const StatusRenderer = React.memo(
     (params: ICellRendererParams<_Evaluation>) => {
         const classes = useStyles()
@@ -201,7 +215,7 @@ export const StatusRenderer = React.memo(
             params.data?.duration || 0,
             runningStatuses.includes(params.value),
         )
-        const {label, color} = statusMapper(token)[params.data?.status.value as EvaluationStatus]
+        const {label, color} = statusMapper(token)(params.data?.status.value as EvaluationStatus)
         const errorMsg = params.data?.status.error?.message
         const errorStacktrace = params.data?.status.error?.stacktrace

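statusMapper is now curried and returns a lookup with a default, so call sites change from indexing into an object to calling a function. The practical win: an unrecognized status (such as a backend value an older frontend build doesn't know) now renders as "Unknown" instead of throwing when destructuring undefined. A hypothetical usage sketch:

```tsx
// Hypothetical usage of the curried statusMapper; `token` is assumed to come
// from antd's theme hook, as it does elsewhere in this file.
const mapStatus = statusMapper(token)

mapStatus(EvaluationStatus.FINISHED) // {label: "Completed", color: token.colorSuccess}
mapStatus("NOT_A_REAL_STATUS" as EvaluationStatus) // {label: "Unknown", color: "purple"}
```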
@@ -308,10 +308,10 @@ const EvaluationResults: React.FC<Props> = () => {
             pinned: "right",
             ...getFilterParams("text"),
             filterValueGetter: (params) =>
-                statusMapper(token)[params.data?.status.value as EvaluationStatus].label,
+                statusMapper(token)(params.data?.status.value as EvaluationStatus).label,
             cellRenderer: StatusRenderer,
             valueGetter: (params) =>
-                statusMapper(token)[params.data?.status.value as EvaluationStatus].label,
+                statusMapper(token)(params.data?.status.value as EvaluationStatus).label,
         },
         {
             flex: 1,
@@ -406,7 +406,7 @@ const EvaluationResults: React.FC<Props> = () => {
                 "Avg. Latency": getTypedValue(item.average_latency),
                 "Total Cost": getTypedValue(item.average_cost),
                 Created: formatDate24(item.created_at),
-                Status: statusMapper(token)[item.status.value as EvaluationStatus].label,
+                Status: statusMapper(token)(item.status.value as EvaluationStatus).label,
             })),
             colDefs.map((col) => col.headerName!),
         )
@@ -500,6 +500,8 @@ const EvaluationResults: React.FC<Props> = () => {
                             return
                         ;(EvaluationStatus.FINISHED === params.data?.status.value ||
                             EvaluationStatus.FINISHED_WITH_ERRORS ===
-                                params.data?.status.value) &&
+                                params.data?.status.value ||
+                            EvaluationStatus.AGGREGATION_FAILED ===
+                                params.data?.status.value) &&
                             router.push(
                                 `/apps/${appId}/evaluations/results/${params.data?.id}`,
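Rows now open the results view for AGGREGATION_FAILED evaluations too, so users can inspect the recorded error. An equivalent guard written with a Set, shown only as a readability sketch (not part of the PR):

```tsx
// Hypothetical refactor of the click guard above; behavior is unchanged.
const clickableStatuses = new Set([
    EvaluationStatus.FINISHED,
    EvaluationStatus.FINISHED_WITH_ERRORS,
    EvaluationStatus.AGGREGATION_FAILED,
])
if (clickableStatuses.has(params.data?.status.value as EvaluationStatus)) {
    router.push(`/apps/${appId}/evaluations/results/${params.data?.id}`)
}
```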
1 change: 1 addition & 0 deletions agenta-web/src/lib/Types.ts
@@ -377,6 +377,7 @@ export enum EvaluationStatus {
     FINISHED = "EVALUATION_FINISHED",
     FINISHED_WITH_ERRORS = "EVALUATION_FINISHED_WITH_ERRORS",
     ERROR = "EVALUATION_FAILED",
+    AGGREGATION_FAILED = "EVALUATION_AGGREGATION_FAILED",
 }

 export enum EvaluationStatusType {