Skip to content

Commit fc09fbe

Browse files
authored
DAOS-14598 object: correct epoch for parity migration (#13453)
Use the stable epoch for partial parity updates to make sure these partial updates are not below the stable epoch boundary; otherwise both EC and VOS aggregation might operate on the same recxs at the same time, which can corrupt the data during rebuild. During EC aggregation, the un-aggregated epoch on non-leader parity shards should be considered as well. Otherwise, if the leader parity fails, it is immediately excluded from the global EC stable epoch calculation; then, before the leader parity is rebuilt, the global stable epoch might pass the un-aggregated epoch on the failed target, and these partial updates on the data shard might be aggregated before EC aggregation runs, which might cause data corruption. It should also choose the shard with the smallest fseq among all parity shards as the aggregation leader, in case the last parity shard cannot be rebuilt in time. Signed-off-by: Di Wang <[email protected]>
1 parent a24ca12 commit fc09fbe

File tree

7 files changed

+109
-56
lines changed

7 files changed

+109
-56
lines changed

src/container/srv_target.c

+12
Original file line numberDiff line numberDiff line change
@@ -850,6 +850,18 @@ ds_cont_child_stop_all(struct ds_pool_child *pool_child)
850850
}
851851
}
852852

853+
void
854+
ds_cont_child_reset_ec_agg_eph_all(struct ds_pool_child *pool_child)
855+
{
856+
struct ds_cont_child *cont_child;
857+
858+
D_DEBUG(DB_MD, DF_UUID"[%d]: reset all containers EC aggregate epoch.\n",
859+
DP_UUID(pool_child->spc_uuid), dss_get_module_info()->dmi_tgt_id);
860+
861+
d_list_for_each_entry(cont_child, &pool_child->spc_cont_list, sc_link)
862+
cont_child->sc_ec_agg_eph = cont_child->sc_ec_agg_eph_boundary;
863+
}
864+
853865
static int
854866
cont_child_start(struct ds_pool_child *pool_child, const uuid_t co_uuid,
855867
bool *started, struct ds_cont_child **cont_out)

src/include/daos_srv/container.h

+2-1
Original file line numberDiff line numberDiff line change
@@ -190,7 +190,8 @@ void ds_cont_child_stop_all(struct ds_pool_child *pool_child);
190190

191191
int ds_cont_child_lookup(uuid_t pool_uuid, uuid_t cont_uuid,
192192
struct ds_cont_child **ds_cont);
193-
193+
void
194+
ds_cont_child_reset_ec_agg_eph_all(struct ds_pool_child *pool_child);
194195
/** initialize a csummer based on container properties. Will retrieve the
195196
* checksum related properties from IV
196197
*/

src/object/srv_ec_aggregate.c

+82-51
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/**
2-
* (C) Copyright 2020-2023 Intel Corporation.
2+
* (C) Copyright 2020-2024 Intel Corporation.
33
*
44
* SPDX-License-Identifier: BSD-2-Clause-Patent
55
*/
@@ -89,6 +89,7 @@ struct ec_agg_par_extent {
8989
struct ec_agg_stripe {
9090
daos_off_t as_stripenum; /* ordinal of stripe, offset/(k*len) */
9191
daos_epoch_t as_hi_epoch; /* highest epoch in stripe */
92+
daos_epoch_t as_lo_epoch; /* lowest epoch in stripe */
9293
d_list_t as_dextents; /* list of stripe's data extents */
9394
daos_off_t as_stripe_fill; /* amount of stripe covered by data */
9495
uint64_t as_offset; /* start offset in stripe */
@@ -114,6 +115,7 @@ struct ec_agg_entry {
114115
struct pl_obj_layout *ae_obj_layout;
115116
struct daos_shard_loc ae_peer_pshards[OBJ_EC_MAX_P];
116117
uint32_t ae_grp_idx;
118+
uint32_t ae_is_leader:1;
117119
};
118120

119121
/* Parameters used to drive iterate all.
@@ -123,13 +125,13 @@ struct ec_agg_param {
123125
struct ec_agg_entry ap_agg_entry; /* entry used for each OID */
124126
daos_epoch_range_t ap_epr; /* hi/lo extent threshold */
125127
daos_epoch_t ap_filter_eph; /* Aggregatable filter epoch */
128+
daos_epoch_t ap_min_unagg_eph; /* minimum unaggregate epoch */
126129
daos_handle_t ap_cont_handle; /* VOS container handle */
127130
int (*ap_yield_func)(void *arg); /* yield function*/
128131
void *ap_yield_arg; /* yield argument */
129132
uint32_t ap_credits_max; /* # of tight loops to yield */
130133
uint32_t ap_credits; /* # of tight loops */
131-
uint32_t ap_initialized:1, /* initialized flag */
132-
ap_obj_skipped:1; /* skipped obj during aggregation */
134+
uint32_t ap_initialized:1; /* initialized flag */
133135
};
134136

135137
/* Struct used to drive offloaded stripe update.
@@ -324,6 +326,7 @@ agg_clear_extents(struct ec_agg_entry *entry)
324326
D_ASSERT(entry->ae_cur_stripe.as_extent_cnt == 0);
325327
}
326328
entry->ae_cur_stripe.as_hi_epoch = 0UL;
329+
entry->ae_cur_stripe.as_lo_epoch = 0UL;
327330
entry->ae_cur_stripe.as_stripe_fill = 0;
328331
entry->ae_cur_stripe.as_has_holes = carry_is_hole ? true : false;
329332
}
@@ -1858,7 +1861,13 @@ agg_process_stripe(struct ec_agg_param *agg_param, struct ec_agg_entry *entry)
18581861
* and all replica extents are newer than parity.
18591862
*/
18601863
if (ec_age_stripe_full(entry, ec_age_with_parity(entry))) {
1861-
rc = agg_encode_local_parity(entry);
1864+
if (entry->ae_is_leader) {
1865+
rc = agg_encode_local_parity(entry);
1866+
} else {
1867+
update_vos = false;
1868+
agg_param->ap_min_unagg_eph = min(agg_param->ap_min_unagg_eph,
1869+
entry->ae_cur_stripe.as_lo_epoch);
1870+
}
18621871
goto out;
18631872
}
18641873

@@ -1868,6 +1877,13 @@ agg_process_stripe(struct ec_agg_param *agg_param, struct ec_agg_entry *entry)
18681877
goto out;
18691878
}
18701879

1880+
if (!entry->ae_is_leader) {
1881+
update_vos = false;
1882+
agg_param->ap_min_unagg_eph = min(agg_param->ap_min_unagg_eph,
1883+
entry->ae_cur_stripe.as_lo_epoch);
1884+
goto out;
1885+
}
1886+
18711887
/* With parity and some newer partial replicas, possibly holes */
18721888
if (ec_age_with_hole(entry))
18731889
process_holes = true;
@@ -1951,13 +1967,19 @@ agg_extent_add(struct ec_agg_entry *agg_entry, vos_iter_entry_t *entry,
19511967
agg_in_stripe(agg_entry, recx);
19521968
}
19531969

1970+
if (agg_entry->ae_cur_stripe.as_lo_epoch == 0 ||
1971+
extent->ae_epoch < agg_entry->ae_cur_stripe.as_lo_epoch)
1972+
agg_entry->ae_cur_stripe.as_lo_epoch = extent->ae_epoch;
1973+
19541974
if (extent->ae_epoch > agg_entry->ae_cur_stripe.as_hi_epoch)
19551975
agg_entry->ae_cur_stripe.as_hi_epoch = extent->ae_epoch;
19561976

1957-
D_DEBUG(DB_TRACE, "adding extent "DF_RECX", to stripe %lu, shard: %u\n",
1977+
D_DEBUG(DB_TRACE, "adding extent "DF_RECX", to stripe %lu, shard: %u"
1978+
"max/min "DF_X64"/"DF_X64"\n",
19581979
DP_RECX(extent->ae_recx),
19591980
agg_stripenum(agg_entry, extent->ae_recx.rx_idx),
1960-
agg_entry->ae_oid.id_shard);
1981+
agg_entry->ae_oid.id_shard, agg_entry->ae_cur_stripe.as_hi_epoch,
1982+
agg_entry->ae_cur_stripe.as_lo_epoch);
19611983
out:
19621984
return rc;
19631985
}
@@ -1973,9 +1995,9 @@ agg_data_extent(struct ec_agg_param *agg_param, vos_iter_entry_t *entry,
19731995

19741996
D_ASSERT(!(entry->ie_recx.rx_idx & PARITY_INDICATOR));
19751997

1976-
D_DEBUG(DB_IO, DF_UOID" get recx "DF_RECX", %u\n",
1998+
D_DEBUG(DB_IO, DF_UOID" get recx "DF_RECX", "DF_X64"/%u leader %s\n",
19771999
DP_UOID(agg_entry->ae_oid), DP_RECX(entry->ie_recx),
1978-
entry->ie_minor_epc);
2000+
entry->ie_epoch, entry->ie_minor_epc, agg_entry->ae_is_leader ? "yes" : "no");
19792001

19802002
while (offset < end) {
19812003
daos_off_t this_stripenum;
@@ -2038,6 +2060,7 @@ agg_akey_post(daos_handle_t ih, struct ec_agg_param *agg_param,
20382060

20392061
agg_entry->ae_cur_stripe.as_stripenum = 0UL;
20402062
agg_entry->ae_cur_stripe.as_hi_epoch = 0UL;
2063+
agg_entry->ae_cur_stripe.as_lo_epoch = 0UL;
20412064
agg_entry->ae_cur_stripe.as_stripe_fill = 0UL;
20422065
agg_entry->ae_cur_stripe.as_offset = 0U;
20432066
}
@@ -2073,39 +2096,57 @@ agg_reset_pos(vos_iter_type_t type, struct ec_agg_entry *agg_entry)
20732096
}
20742097
}
20752098

2076-
static int
2077-
agg_shard_is_leader(struct ds_pool *pool, struct ec_agg_entry *agg_entry)
2099+
static bool
2100+
agg_shard_is_parity(struct ds_pool *pool, struct ec_agg_entry *agg_entry)
20782101
{
2079-
struct pl_obj_shard *shard;
20802102
struct daos_oclass_attr *oca;
20812103
uint32_t grp_idx;
20822104
uint32_t grp_start;
2083-
uint32_t ec_tgt_idx;
2084-
int shard_idx;
2085-
int rc;
2105+
uint32_t min_fseq = -1;
2106+
int leader_shard = -1;
2107+
int i;
20862108

20872109
oca = &agg_entry->ae_oca;
2110+
if (is_ec_data_shard_by_layout_ver(agg_entry->ae_oid.id_layout_ver,
2111+
agg_entry->ae_dkey_hash, oca,
2112+
agg_entry->ae_oid.id_shard)) {
2113+
agg_entry->ae_is_leader = 0;
2114+
return false;
2115+
}
2116+
20882117
grp_idx = agg_entry->ae_oid.id_shard / daos_oclass_grp_size(oca);
2089-
grp_start = grp_idx * daos_oclass_grp_size(oca);
2090-
ec_tgt_idx = obj_ec_shard_idx_by_layout_ver(agg_entry->ae_oid.id_layout_ver,
2091-
agg_entry->ae_dkey_hash, oca,
2092-
daos_oclass_grp_size(oca) - 1);
2093-
/**
2094-
* FIXME: only the last parity shard can be the EC agg leader. What about
2095-
* Degraded mode?
2096-
*/
2097-
if (agg_entry->ae_oid.id_shard != ec_tgt_idx + grp_start)
2098-
return 0;
2118+
grp_start = grp_idx * agg_entry->ae_obj_layout->ol_grp_size;
2119+
for (i = 0; i < obj_ec_parity_tgt_nr(oca); i++) {
2120+
uint32_t ec_tgt_idx;
2121+
uint32_t shard_idx;
2122+
struct pl_obj_shard *shard;
2123+
2124+
ec_tgt_idx = obj_ec_shard_idx_by_layout_ver(agg_entry->ae_oid.id_layout_ver,
2125+
agg_entry->ae_dkey_hash, oca,
2126+
daos_oclass_grp_size(oca) - i - 1);
2127+
2128+
shard_idx = grp_start + ec_tgt_idx;
2129+
shard = pl_obj_get_shard(agg_entry->ae_obj_layout, shard_idx);
20992130

2100-
/* If last parity unavailable, then skip the object via returning -DER_STALE. */
2101-
shard_idx = grp_idx * agg_entry->ae_obj_layout->ol_grp_size + ec_tgt_idx;
2102-
shard = pl_obj_get_shard(agg_entry->ae_obj_layout, shard_idx);
2103-
if (shard->po_target != -1 && shard->po_shard != -1 && !shard->po_rebuilding)
2104-
rc = (agg_entry->ae_oid.id_shard == shard->po_shard) ? 1 : 0;
2131+
if (shard->po_target == -1 || shard->po_shard == -1 || shard->po_rebuilding)
2132+
continue;
2133+
2134+
if (min_fseq == -1 || min_fseq > shard->po_fseq) {
2135+
leader_shard = shard_idx;
2136+
min_fseq = shard->po_fseq;
2137+
}
2138+
}
2139+
2140+
/* No parity shard is available */
2141+
if (leader_shard == -1)
2142+
return false;
2143+
2144+
if (agg_entry->ae_oid.id_shard == leader_shard)
2145+
agg_entry->ae_is_leader = 1;
21052146
else
2106-
rc = -DER_STALE;
2147+
agg_entry->ae_is_leader = 0;
21072148

2108-
return rc;
2149+
return true;
21092150
}
21102151

21112152
/* Initializes the struct holding the iteration state (ec_agg_entry). */
@@ -2129,8 +2170,6 @@ agg_dkey(daos_handle_t ih, vos_iter_entry_t *entry,
21292170
struct ec_agg_param *agg_param, struct ec_agg_entry *agg_entry,
21302171
unsigned int *acts)
21312172
{
2132-
int rc;
2133-
21342173
if (!agg_key_compare(agg_entry->ae_dkey, entry->ie_key)) {
21352174
D_DEBUG(DB_EPC, "Skip dkey: "DF_KEY" ec agg on re-probe\n",
21362175
DP_KEY(&entry->ie_key));
@@ -2144,24 +2183,16 @@ agg_dkey(daos_handle_t ih, vos_iter_entry_t *entry,
21442183
agg_entry->ae_dkey_hash = obj_dkey2hash(agg_entry->ae_oid.id_pub,
21452184
&agg_entry->ae_dkey);
21462185
agg_reset_pos(VOS_ITER_AKEY, agg_entry);
2147-
rc = agg_shard_is_leader(agg_param->ap_pool_info.api_pool, agg_entry);
2148-
if (rc == 1) {
2149-
D_DEBUG(DB_EPC, "oid:"DF_UOID":"DF_KEY" ec agg starting\n",
2150-
DP_UOID(agg_entry->ae_oid), DP_KEY(&agg_entry->ae_dkey));
2186+
if(agg_shard_is_parity(agg_param->ap_pool_info.api_pool, agg_entry)) {
2187+
D_DEBUG(DB_EPC, "oid:"DF_UOID":"DF_KEY" ec agg starting leader %s\n",
2188+
DP_UOID(agg_entry->ae_oid), DP_KEY(&agg_entry->ae_dkey),
2189+
agg_entry->ae_is_leader ? "yes" : "no");
21512190
agg_reset_dkey_entry(&agg_param->ap_agg_entry, entry);
2152-
rc = 0;
21532191
} else {
2154-
if (rc < 0) {
2155-
D_ERROR("oid:"DF_UOID" ds_pool_check_leader failed "
2156-
DF_RC"\n", DP_UOID(entry->ie_oid), DP_RC(rc));
2157-
if (rc == -DER_STALE)
2158-
agg_param->ap_obj_skipped = 1;
2159-
rc = 0;
2160-
}
21612192
*acts |= VOS_ITER_CB_SKIP;
21622193
}
21632194

2164-
return rc;
2195+
return 0;
21652196
}
21662197

21672198
/* Handles akeys returned by the iterator. */
@@ -2625,7 +2656,7 @@ cont_ec_aggregate_cb(struct ds_cont_child *cont, daos_epoch_range_t *epr,
26252656

26262657
agg_reset_entry(&ec_agg_param->ap_agg_entry, NULL, NULL);
26272658

2628-
ec_agg_param->ap_obj_skipped = 0;
2659+
ec_agg_param->ap_min_unagg_eph = DAOS_EPOCH_MAX;
26292660
rc = vos_iterate(&iter_param, VOS_ITER_OBJ, true, &anchors,
26302661
agg_iterate_pre_cb, agg_iterate_post_cb, ec_agg_param, NULL);
26312662

@@ -2637,8 +2668,7 @@ cont_ec_aggregate_cb(struct ds_cont_child *cont, daos_epoch_range_t *epr,
26372668
ec_agg_param->ap_agg_entry.ae_obj_hdl = DAOS_HDL_INVAL;
26382669
}
26392670

2640-
if (ec_agg_param->ap_obj_skipped && !cont->sc_stopping) {
2641-
D_DEBUG(DB_EPC, "with skipped obj during aggregation.\n");
2671+
if (cont->sc_pool->spc_pool->sp_rebuilding > 0 && !cont->sc_stopping) {
26422672
/* There is rebuild going on, and we can't proceed EC aggregate boundary,
26432673
* Let's wait for 5 seconds for another EC aggregation.
26442674
*/
@@ -2649,7 +2679,7 @@ cont_ec_aggregate_cb(struct ds_cont_child *cont, daos_epoch_range_t *epr,
26492679
vos_aggregate_exit(cont->sc_hdl);
26502680

26512681
update_hae:
2652-
if (rc == 0 && ec_agg_param->ap_obj_skipped == 0) {
2682+
if (rc == 0) {
26532683
cont->sc_ec_agg_eph = max(cont->sc_ec_agg_eph, epr->epr_hi);
26542684
if (!cont->sc_stopping && cont->sc_ec_query_agg_eph) {
26552685
uint64_t orig, cur;
@@ -2662,7 +2692,8 @@ cont_ec_aggregate_cb(struct ds_cont_child *cont, daos_epoch_range_t *epr,
26622692
DP_CONT(cont->sc_pool_uuid, cont->sc_uuid),
26632693
orig, cur, cur - orig);
26642694

2665-
*cont->sc_ec_query_agg_eph = cont->sc_ec_agg_eph;
2695+
*cont->sc_ec_query_agg_eph = min(ec_agg_param->ap_min_unagg_eph,
2696+
cont->sc_ec_agg_eph);
26662697
}
26672698
}
26682699

src/object/srv_obj_migrate.c

+6-1
Original file line numberDiff line numberDiff line change
@@ -996,7 +996,12 @@ __migrate_fetch_update_parity(struct migrate_one *mrone, daos_handle_t oh,
996996

997997
offset = iods[i].iod_recxs[0].rx_idx;
998998
size = iods[i].iod_recxs[0].rx_nr;
999-
parity_eph = ephs[i][0];
999+
/* Use stable epoch for partial parity update to make sure
1000+
* these partial updates are not below stable epoch boundary,
1001+
* otherwise both EC and VOS aggregation might operate on
1002+
* the same recxs.
1003+
*/
1004+
parity_eph = encode ? ephs[i][0] : mrone->mo_epoch;
10001005
tmp_iod = iods[i];
10011006
ptr = iov[i].iov_buf;
10021007
for (j = 1; j < iods[i].iod_nr; j++) {

src/pool/srv_target.c

+1
Original file line numberDiff line numberDiff line change
@@ -1614,6 +1614,7 @@ update_child_map(void *data)
16141614
return 0;
16151615
}
16161616

1617+
ds_cont_child_reset_ec_agg_eph_all(child);
16171618
child->spc_map_version = pool->sp_map_version;
16181619
ds_pool_child_put(child);
16191620
return 0;

src/tests/ftest/daos_test/suite.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ timeouts:
2727
test_daos_extend_simple: 3600
2828
test_daos_oid_allocator: 640
2929
test_daos_checksum: 500
30-
test_daos_rebuild_ec: 6400
30+
test_daos_rebuild_ec: 7200
3131
test_daos_aggregate_ec: 200
3232
test_daos_degraded_ec: 1900
3333
test_daos_dedup: 220

src/tests/suite/daos_rebuild_ec.c

+5-2
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/**
2-
* (C) Copyright 2016-2023 Intel Corporation.
2+
* (C) Copyright 2016-2024 Intel Corporation.
33
*
44
* SPDX-License-Identifier: BSD-2-Clause-Patent
55
*/
@@ -1304,6 +1304,9 @@ rebuild_ec_parity_overwrite_fail_parity_internal(void **state, int *kill_shards,
13041304
parity_rank = get_rank_by_oid_shard(arg, oid, shard_idx);
13051305
rebuild_single_pool_rank(arg, parity_rank, true);
13061306

1307+
print_message("sleep 60 seconds for aggregation\n");
1308+
sleep(60);
1309+
13071310
/* fail data shard */
13081311
for (i = 0; i < nr; i++) {
13091312
shard_idx = (dkey_hash % 6 + kill_shards[i]) % 6;
@@ -1487,7 +1490,7 @@ static const struct CMUnitTest rebuild_tests[] = {
14871490
{"REBUILD46: fail parity shard and data shards after overwrite",
14881491
rebuild_ec_overwrite_fail_parity_data, rebuild_ec_8nodes_setup,
14891492
test_teardown},
1490-
{"REBUILD46: fail parity shard and data shards after overwrite with aggregation",
1493+
{"REBUILD47: fail parity shard and data shards after overwrite with aggregation",
14911494
rebuild_ec_overwrite_fail_parity_data_with_parity, rebuild_ec_8nodes_setup,
14921495
test_teardown},
14931496
};

0 commit comments

Comments
 (0)