Skip to content

Commit

Permalink
RC/FC: fixed potential deadlock
Browse files Browse the repository at this point in the history
- in some cases the reply to FC_HARD_REQ could not be sent immediately due
  to a lack of HW resources; in this case the request is pushed into the
  arbiter. But if the peer has fallen into the same situation, this could
  cause a deadlock.
- fix: add the FC grant request with high priority so that it is sent out-of-order
  • Loading branch information
Sergey Oblomov committed Apr 7, 2019
1 parent 3c65f18 commit 0e3f71a
Show file tree
Hide file tree
Showing 5 changed files with 35 additions and 12 deletions.
9 changes: 8 additions & 1 deletion src/ucs/datastruct/arbiter.c
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,14 @@ void ucs_arbiter_group_push_head_elem_always(ucs_arbiter_t *arbiter,

if (head->list.next != NULL) {
ucs_assert(arbiter != NULL);
ucs_arbiter_group_head_replaced(arbiter, head, elem);
if (ucs_list_prev(&head->list, ucs_arbiter_elem_t, list) == head) {
arbiter->current = elem;
ucs_list_head_init(&elem->list);
} else {
ucs_arbiter_group_head_replaced(arbiter, head, elem);
}
} else {
elem->list.next = NULL; /* Mark the new head as un-scheduled */
}
}

Expand Down
2 changes: 1 addition & 1 deletion src/uct/api/uct.h
Original file line number Diff line number Diff line change
Expand Up @@ -913,7 +913,7 @@ struct uct_completion {
* @ingroup UCT_RESOURCE
* @brief Pending request.
*
* This structure should be passed to uct_pending_add() and is used to signal
* This structure should be passed to @ref uct_ep_pending_add() and is used to signal
* new available resources back to user.
*/
struct uct_pending_req {
Expand Down
11 changes: 11 additions & 0 deletions src/uct/base/uct_iface.h
Original file line number Diff line number Diff line change
Expand Up @@ -400,6 +400,17 @@ uct_pending_req_priv_arb_elem(uct_pending_req_t *req)
} while (0)


/**
* Add a pending request to the head of group in arbiter.
*
* The arbiter element is obtained from the request with
* uct_pending_req_priv_arb_elem(), initialized with ucs_arbiter_elem_init(),
* and then passed to ucs_arbiter_group_push_head_elem().
*
* @param _arbiter        Arbiter passed through to
*                        ucs_arbiter_group_push_head_elem().
* @param _arbiter_group  Arbiter group that receives the element at its head.
* @param _req            Pending request whose arbiter element is pushed.
*
* @note _req is expanded twice, so the argument must not have side effects.
* @note NOTE(review): this macro only pushes the element; whether the group
*       also needs an explicit ucs_arbiter_group_schedule() call afterwards
*       depends on the arbiter implementation - confirm at each call site.
*/
#define uct_pending_req_arb_group_push_head(_arbiter, _arbiter_group, _req) \
do { \
ucs_arbiter_elem_init(uct_pending_req_priv_arb_elem(_req)); \
ucs_arbiter_group_push_head_elem(_arbiter, _arbiter_group, \
uct_pending_req_priv_arb_elem(_req)); \
} while (0)


/**
* Base structure for private data held inside a pending request for TLs
* which use ucs_queue_t to progress pending requests.
Expand Down
14 changes: 7 additions & 7 deletions src/uct/ib/rc/base/rc_ep.c
Original file line number Diff line number Diff line change
Expand Up @@ -310,15 +310,15 @@ ucs_arbiter_cb_result_t uct_rc_ep_process_pending(ucs_arbiter_t *arbiter,
return UCS_ARBITER_CB_RESULT_NEXT_GROUP;
} else {
ep = ucs_container_of(ucs_arbiter_elem_group(elem), uct_rc_ep_t, arb_group);
if (! uct_rc_ep_has_tx_resources(ep)) {
/* No ep resources */
return UCS_ARBITER_CB_RESULT_DESCHED_GROUP;
} else {
/* No iface resources */
iface = ucs_derived_of(ep->super.super.iface, uct_rc_iface_t);
ucs_assertv(!uct_rc_iface_has_tx_resources(iface),
"pending callback returned error but send resources are available");
if (!uct_rc_iface_has_tx_resources(iface)) {
/* No iface resources */
return UCS_ARBITER_CB_RESULT_STOP;
} else {
/* No ep resources */
ucs_assertv(!uct_rc_ep_has_tx_resources(ep),
"pending callback returned error but send resources are available");
return UCS_ARBITER_CB_RESULT_DESCHED_GROUP;
}
}
}
Expand Down
11 changes: 8 additions & 3 deletions src/uct/ib/rc/base/rc_iface.c
Original file line number Diff line number Diff line change
Expand Up @@ -429,10 +429,15 @@ ucs_status_t uct_rc_iface_fc_handler(uct_rc_iface_t *iface, unsigned qp_num,
status = uct_rc_ep_fc_grant(&fc_req->super);

if (status == UCS_ERR_NO_RESOURCE){
status = uct_ep_pending_add(&ep->super.super, &fc_req->super, 0);
/* force add request to group & schedule group to eliminate
* FC deadlock */
uct_pending_req_arb_group_push_head(&iface->tx.arbiter,
&ep->arb_group, &fc_req->super);
ucs_arbiter_group_schedule(&iface->tx.arbiter, &ep->arb_group);
} else {
ucs_assertv_always(status == UCS_OK, "Failed to send FC grant msg: %s",
ucs_status_string(status));
}
ucs_assertv_always(status == UCS_OK, "Failed to send FC grant msg: %s",
ucs_status_string(status));
}

return uct_iface_invoke_am(&iface->super.super,
Expand Down

0 comments on commit 0e3f71a

Please sign in to comment.