
Commit a8a5381

wangdi1 authored and jolivier23 committed
DAOS-14598 object: correct epoch for parity migration (#13453)
Use the stable epoch for partial parity updates to make sure these partial updates do not fall below the stable epoch boundary; otherwise EC and VOS aggregation might operate on the same recxs at the same time, which can corrupt the data during rebuild.

During EC aggregation, the un-aggregated epoch on non-leader parity shards must be considered as well. Otherwise, if the leader parity shard fails, it is immediately excluded from the global EC stable epoch calculation, and before it is rebuilt the global stable epoch might pass the un-aggregated epoch on the failed target; the partial updates on the data shards could then be aggregated by VOS before EC aggregation, which might cause data corruption.

Also choose the parity shard with the lowest fseq among all parity shards as the aggregation leader, in case the last parity shard cannot be rebuilt in time.

Signed-off-by: Di Wang <[email protected]>
1 parent 1372e4f commit a8a5381
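
The leader-selection rule described above (pick the surviving parity shard with the lowest fseq) can be illustrated with a small standalone C sketch. The struct and helper below are simplified placeholders, not the real pl_obj_shard layout or the DAOS implementation in agg_shard_is_parity():

/* Toy sketch only: assumed field names, not the real DAOS structures. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct toy_shard {
	int      target;      /* -1 means the shard is unavailable */
	bool     rebuilding;  /* shard is currently being rebuilt */
	uint32_t fseq;        /* failure sequence of the shard's target */
};

/* Return the index of the chosen aggregation leader among the parity
 * shards, or -1 if no parity shard is available. */
static int
pick_parity_leader(const struct toy_shard *parity, int nr_parity)
{
	uint32_t min_fseq = UINT32_MAX;
	int      leader   = -1;
	int      i;

	for (i = 0; i < nr_parity; i++) {
		/* Skip parity shards that are down or still rebuilding. */
		if (parity[i].target == -1 || parity[i].rebuilding)
			continue;
		/* Prefer the shard with the lowest failure sequence. */
		if (parity[i].fseq < min_fseq) {
			min_fseq = parity[i].fseq;
			leader   = i;
		}
	}
	return leader;
}

int
main(void)
{
	/* The last parity shard is down, so shard 0 becomes the leader. */
	struct toy_shard parity[2] = {
		{ .target = 0,  .rebuilding = false, .fseq = 3 },
		{ .target = -1, .rebuilding = true,  .fseq = 7 },
	};

	printf("leader parity index: %d\n", pick_parity_leader(parity, 2));
	return 0;
}

In this toy layout the last parity shard is unavailable, so leadership falls back to the first available parity shard rather than always requiring the last one, which is the behavioral change made in srv_ec_aggregate.c below.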

File tree

7 files changed: +109 -56 lines changed

src/container/srv_target.c

+12

@@ -851,6 +851,18 @@ ds_cont_child_stop_all(struct ds_pool_child *pool_child)
 	}
 }
 
+void
+ds_cont_child_reset_ec_agg_eph_all(struct ds_pool_child *pool_child)
+{
+	struct ds_cont_child *cont_child;
+
+	D_DEBUG(DB_MD, DF_UUID"[%d]: reset all containers EC aggregate epoch.\n",
+		DP_UUID(pool_child->spc_uuid), dss_get_module_info()->dmi_tgt_id);
+
+	d_list_for_each_entry(cont_child, &pool_child->spc_cont_list, sc_link)
+		cont_child->sc_ec_agg_eph = cont_child->sc_ec_agg_eph_boundary;
+}
+
 static int
 cont_child_start(struct ds_pool_child *pool_child, const uuid_t co_uuid,
 		 bool *started, struct ds_cont_child **cont_out)

src/include/daos_srv/container.h

+2 -1

@@ -185,7 +185,8 @@ void ds_cont_child_stop_all(struct ds_pool_child *pool_child);
 
 int ds_cont_child_lookup(uuid_t pool_uuid, uuid_t cont_uuid,
 			 struct ds_cont_child **ds_cont);
-
+void
+ds_cont_child_reset_ec_agg_eph_all(struct ds_pool_child *pool_child);
 /** initialize a csummer based on container properties. Will retrieve the
  * checksum related properties from IV
  */

src/object/srv_ec_aggregate.c

+82 -51

@@ -1,5 +1,5 @@
 /**
- * (C) Copyright 2020-2023 Intel Corporation.
+ * (C) Copyright 2020-2024 Intel Corporation.
  *
  * SPDX-License-Identifier: BSD-2-Clause-Patent
  */
@@ -89,6 +89,7 @@ struct ec_agg_par_extent {
 struct ec_agg_stripe {
 	daos_off_t	as_stripenum;   /* ordinal of stripe, offset/(k*len) */
 	daos_epoch_t	as_hi_epoch;    /* highest epoch in stripe */
+	daos_epoch_t	as_lo_epoch;    /* lowest epoch in stripe */
 	d_list_t	as_dextents;    /* list of stripe's data extents */
 	daos_off_t	as_stripe_fill; /* amount of stripe covered by data */
 	uint64_t	as_offset;      /* start offset in stripe */
@@ -114,6 +115,7 @@ struct ec_agg_entry {
 	struct pl_obj_layout	*ae_obj_layout;
 	struct daos_shard_loc	 ae_peer_pshards[OBJ_EC_MAX_P];
 	uint32_t		 ae_grp_idx;
+	uint32_t		 ae_is_leader:1;
 };
 
 /* Parameters used to drive iterate all.
@@ -123,13 +125,13 @@ struct ec_agg_param {
 	struct ec_agg_entry	 ap_agg_entry;	 /* entry used for each OID */
 	daos_epoch_range_t	 ap_epr;	 /* hi/lo extent threshold */
 	daos_epoch_t		 ap_filter_eph;	 /* Aggregatable filter epoch */
+	daos_epoch_t		 ap_min_unagg_eph; /* minimum unaggregate epoch */
 	daos_handle_t		 ap_cont_handle; /* VOS container handle */
 	int			(*ap_yield_func)(void *arg); /* yield function*/
 	void			*ap_yield_arg;	 /* yield argument */
 	uint32_t		 ap_credits_max; /* # of tight loops to yield */
 	uint32_t		 ap_credits;	 /* # of tight loops */
-	uint32_t		 ap_initialized:1, /* initialized flag */
-				 ap_obj_skipped:1; /* skipped obj during aggregation */
+	uint32_t		 ap_initialized:1; /* initialized flag */
 };
 
 /* Struct used to drive offloaded stripe update.
@@ -324,6 +326,7 @@ agg_clear_extents(struct ec_agg_entry *entry)
 		D_ASSERT(entry->ae_cur_stripe.as_extent_cnt == 0);
 	}
 	entry->ae_cur_stripe.as_hi_epoch = 0UL;
+	entry->ae_cur_stripe.as_lo_epoch = 0UL;
 	entry->ae_cur_stripe.as_stripe_fill = 0;
 	entry->ae_cur_stripe.as_has_holes = carry_is_hole ? true : false;
 }
@@ -1835,7 +1838,13 @@ agg_process_stripe(struct ec_agg_param *agg_param, struct ec_agg_entry *entry)
 	 * and all replica extents are newer than parity.
 	 */
 	if (ec_age_stripe_full(entry, ec_age_with_parity(entry))) {
-		rc = agg_encode_local_parity(entry);
+		if (entry->ae_is_leader) {
+			rc = agg_encode_local_parity(entry);
+		} else {
+			update_vos = false;
+			agg_param->ap_min_unagg_eph = min(agg_param->ap_min_unagg_eph,
+							  entry->ae_cur_stripe.as_lo_epoch);
+		}
 		goto out;
 	}
 
@@ -1845,6 +1854,13 @@ agg_process_stripe(struct ec_agg_param *agg_param, struct ec_agg_entry *entry)
 		goto out;
 	}
 
+	if (!entry->ae_is_leader) {
+		update_vos = false;
+		agg_param->ap_min_unagg_eph = min(agg_param->ap_min_unagg_eph,
+						  entry->ae_cur_stripe.as_lo_epoch);
+		goto out;
+	}
+
 	/* With parity and some newer partial replicas, possibly holes */
 	if (ec_age_with_hole(entry))
 		process_holes = true;
@@ -1928,13 +1944,19 @@ agg_extent_add(struct ec_agg_entry *agg_entry, vos_iter_entry_t *entry,
 		agg_in_stripe(agg_entry, recx);
 	}
 
+	if (agg_entry->ae_cur_stripe.as_lo_epoch == 0 ||
+	    extent->ae_epoch < agg_entry->ae_cur_stripe.as_lo_epoch)
+		agg_entry->ae_cur_stripe.as_lo_epoch = extent->ae_epoch;
+
 	if (extent->ae_epoch > agg_entry->ae_cur_stripe.as_hi_epoch)
 		agg_entry->ae_cur_stripe.as_hi_epoch = extent->ae_epoch;
 
-	D_DEBUG(DB_TRACE, "adding extent "DF_RECX", to stripe %lu, shard: %u\n",
+	D_DEBUG(DB_TRACE, "adding extent "DF_RECX", to stripe %lu, shard: %u"
+		"max/min "DF_X64"/"DF_X64"\n",
 		DP_RECX(extent->ae_recx),
 		agg_stripenum(agg_entry, extent->ae_recx.rx_idx),
-		agg_entry->ae_oid.id_shard);
+		agg_entry->ae_oid.id_shard, agg_entry->ae_cur_stripe.as_hi_epoch,
+		agg_entry->ae_cur_stripe.as_lo_epoch);
 out:
 	return rc;
 }
@@ -1950,9 +1972,9 @@ agg_data_extent(struct ec_agg_param *agg_param, vos_iter_entry_t *entry,
 
 	D_ASSERT(!(entry->ie_recx.rx_idx & PARITY_INDICATOR));
 
-	D_DEBUG(DB_IO, DF_UOID" get recx "DF_RECX", %u\n",
+	D_DEBUG(DB_IO, DF_UOID" get recx "DF_RECX", "DF_X64"/%u leader %s\n",
 		DP_UOID(agg_entry->ae_oid), DP_RECX(entry->ie_recx),
-		entry->ie_minor_epc);
+		entry->ie_epoch, entry->ie_minor_epc, agg_entry->ae_is_leader ? "yes" : "no");
 
 	while (offset < end) {
 		daos_off_t this_stripenum;
@@ -2015,6 +2037,7 @@ agg_akey_post(daos_handle_t ih, struct ec_agg_param *agg_param,
 
 	agg_entry->ae_cur_stripe.as_stripenum = 0UL;
 	agg_entry->ae_cur_stripe.as_hi_epoch = 0UL;
+	agg_entry->ae_cur_stripe.as_lo_epoch = 0UL;
 	agg_entry->ae_cur_stripe.as_stripe_fill = 0UL;
 	agg_entry->ae_cur_stripe.as_offset = 0U;
 }
@@ -2050,39 +2073,57 @@ agg_reset_pos(vos_iter_type_t type, struct ec_agg_entry *agg_entry)
 	}
 }
 
-static int
-agg_shard_is_leader(struct ds_pool *pool, struct ec_agg_entry *agg_entry)
+static bool
+agg_shard_is_parity(struct ds_pool *pool, struct ec_agg_entry *agg_entry)
 {
-	struct pl_obj_shard	*shard;
 	struct daos_oclass_attr	*oca;
 	uint32_t		 grp_idx;
 	uint32_t		 grp_start;
-	uint32_t		 ec_tgt_idx;
-	int			 shard_idx;
-	int			 rc;
+	uint32_t		 min_fseq = -1;
+	int			 leader_shard = -1;
+	int			 i;
 
 	oca = &agg_entry->ae_oca;
+	if (is_ec_data_shard_by_layout_ver(agg_entry->ae_oid.id_layout_ver,
+					   agg_entry->ae_dkey_hash, oca,
+					   agg_entry->ae_oid.id_shard)) {
+		agg_entry->ae_is_leader = 0;
+		return false;
+	}
+
 	grp_idx = agg_entry->ae_oid.id_shard / daos_oclass_grp_size(oca);
-	grp_start = grp_idx * daos_oclass_grp_size(oca);
-	ec_tgt_idx = obj_ec_shard_idx_by_layout_ver(agg_entry->ae_oid.id_layout_ver,
-						    agg_entry->ae_dkey_hash, oca,
-						    daos_oclass_grp_size(oca) - 1);
-	/**
-	 * FIXME: only the last parity shard can be the EC agg leader. What about
-	 * Degraded mode?
-	 */
-	if (agg_entry->ae_oid.id_shard != ec_tgt_idx + grp_start)
-		return 0;
+	grp_start = grp_idx * agg_entry->ae_obj_layout->ol_grp_size;
+	for (i = 0; i < obj_ec_parity_tgt_nr(oca); i++) {
+		uint32_t		 ec_tgt_idx;
+		uint32_t		 shard_idx;
+		struct pl_obj_shard	*shard;
+
+		ec_tgt_idx = obj_ec_shard_idx_by_layout_ver(agg_entry->ae_oid.id_layout_ver,
+							    agg_entry->ae_dkey_hash, oca,
+							    daos_oclass_grp_size(oca) - i - 1);
+
+		shard_idx = grp_start + ec_tgt_idx;
+		shard = pl_obj_get_shard(agg_entry->ae_obj_layout, shard_idx);
 
-	/* If last parity unavailable, then skip the object via returning -DER_STALE. */
-	shard_idx = grp_idx * agg_entry->ae_obj_layout->ol_grp_size + ec_tgt_idx;
-	shard = pl_obj_get_shard(agg_entry->ae_obj_layout, shard_idx);
-	if (shard->po_target != -1 && shard->po_shard != -1 && !shard->po_rebuilding)
-		rc = (agg_entry->ae_oid.id_shard == shard->po_shard) ? 1 : 0;
+		if (shard->po_target == -1 || shard->po_shard == -1 || shard->po_rebuilding)
+			continue;
+
+		if (min_fseq == -1 || min_fseq > shard->po_fseq) {
+			leader_shard = shard_idx;
+			min_fseq = shard->po_fseq;
+		}
+	}
+
+	/* No parity shard is available */
+	if (leader_shard == -1)
+		return false;
+
+	if (agg_entry->ae_oid.id_shard == leader_shard)
+		agg_entry->ae_is_leader = 1;
 	else
-		rc = -DER_STALE;
+		agg_entry->ae_is_leader = 0;
 
-	return rc;
+	return true;
 }
 
 /* Initializes the struct holding the iteration state (ec_agg_entry). */
@@ -2106,8 +2147,6 @@ agg_dkey(daos_handle_t ih, vos_iter_entry_t *entry,
 	 struct ec_agg_param *agg_param, struct ec_agg_entry *agg_entry,
 	 unsigned int *acts)
 {
-	int rc;
-
 	if (!agg_key_compare(agg_entry->ae_dkey, entry->ie_key)) {
 		D_DEBUG(DB_EPC, "Skip dkey: "DF_KEY" ec agg on re-probe\n",
 			DP_KEY(&entry->ie_key));
@@ -2121,24 +2160,16 @@ agg_dkey(daos_handle_t ih, vos_iter_entry_t *entry,
 	agg_entry->ae_dkey_hash = obj_dkey2hash(agg_entry->ae_oid.id_pub,
 						&agg_entry->ae_dkey);
 	agg_reset_pos(VOS_ITER_AKEY, agg_entry);
-	rc = agg_shard_is_leader(agg_param->ap_pool_info.api_pool, agg_entry);
-	if (rc == 1) {
-		D_DEBUG(DB_EPC, "oid:"DF_UOID":"DF_KEY" ec agg starting\n",
-			DP_UOID(agg_entry->ae_oid), DP_KEY(&agg_entry->ae_dkey));
+	if (agg_shard_is_parity(agg_param->ap_pool_info.api_pool, agg_entry)) {
+		D_DEBUG(DB_EPC, "oid:"DF_UOID":"DF_KEY" ec agg starting leader %s\n",
+			DP_UOID(agg_entry->ae_oid), DP_KEY(&agg_entry->ae_dkey),
+			agg_entry->ae_is_leader ? "yes" : "no");
 		agg_reset_dkey_entry(&agg_param->ap_agg_entry, entry);
-		rc = 0;
 	} else {
-		if (rc < 0) {
-			D_ERROR("oid:"DF_UOID" ds_pool_check_leader failed "
-				DF_RC"\n", DP_UOID(entry->ie_oid), DP_RC(rc));
-			if (rc == -DER_STALE)
-				agg_param->ap_obj_skipped = 1;
-			rc = 0;
-		}
 		*acts |= VOS_ITER_CB_SKIP;
 	}
 
-	return rc;
+	return 0;
 }
 
 /* Handles akeys returned by the iterator. */
@@ -2599,7 +2630,7 @@ cont_ec_aggregate_cb(struct ds_cont_child *cont, daos_epoch_range_t *epr,
 
 	agg_reset_entry(&ec_agg_param->ap_agg_entry, NULL, NULL);
 
-	ec_agg_param->ap_obj_skipped = 0;
+	ec_agg_param->ap_min_unagg_eph = DAOS_EPOCH_MAX;
 	rc = vos_iterate(&iter_param, VOS_ITER_OBJ, true, &anchors,
 			 agg_iterate_pre_cb, agg_iterate_post_cb, ec_agg_param, NULL);
 
@@ -2611,8 +2642,7 @@ cont_ec_aggregate_cb(struct ds_cont_child *cont, daos_epoch_range_t *epr,
 		ec_agg_param->ap_agg_entry.ae_obj_hdl = DAOS_HDL_INVAL;
 	}
 
-	if (ec_agg_param->ap_obj_skipped && !cont->sc_stopping) {
-		D_DEBUG(DB_EPC, "with skipped obj during aggregation.\n");
+	if (cont->sc_pool->spc_pool->sp_rebuilding > 0 && !cont->sc_stopping) {
 		/* There is rebuild going on, and we can't proceed EC aggregate boundary,
 		 * Let's wait for 5 seconds for another EC aggregation.
 		 */
@@ -2623,7 +2653,7 @@ cont_ec_aggregate_cb(struct ds_cont_child *cont, daos_epoch_range_t *epr,
 	vos_aggregate_exit(cont->sc_hdl);
 
 update_hae:
-	if (rc == 0 && ec_agg_param->ap_obj_skipped == 0) {
+	if (rc == 0) {
 		cont->sc_ec_agg_eph = max(cont->sc_ec_agg_eph, epr->epr_hi);
 		if (!cont->sc_stopping && cont->sc_ec_query_agg_eph) {
 			uint64_t orig, cur;
@@ -2636,7 +2666,8 @@ cont_ec_aggregate_cb(struct ds_cont_child *cont, daos_epoch_range_t *epr,
 				DP_CONT(cont->sc_pool_uuid, cont->sc_uuid),
 				orig, cur, cur - orig);
 
-			*cont->sc_ec_query_agg_eph = cont->sc_ec_agg_eph;
+			*cont->sc_ec_query_agg_eph = min(ec_agg_param->ap_min_unagg_eph,
+							 cont->sc_ec_agg_eph);
 		}
 	}
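
As a rough illustration of the epoch bookkeeping added in this file (a minimal sketch with assumed names, not the real ec_agg_param or ds_cont_child structures): a non-leader parity shard records the lowest epoch of any stripe it could not aggregate, and the epoch published for EC queries is clamped to the minimum of that value and the locally aggregated epoch, so VOS aggregation cannot run ahead of stripes that still await EC aggregation.

#include <stdint.h>
#include <stdio.h>

#define TOY_EPOCH_MAX UINT64_MAX	/* stand-in for DAOS_EPOCH_MAX */

static uint64_t
toy_min(uint64_t a, uint64_t b)
{
	return a < b ? a : b;
}

int
main(void)
{
	uint64_t min_unagg_eph = TOY_EPOCH_MAX;	  /* ap_min_unagg_eph analogue */
	uint64_t agg_eph = 1000;		  /* sc_ec_agg_eph analogue */
	uint64_t stripe_lo_eph[2] = { 700, 850 }; /* un-aggregated stripes */
	int	 i;

	/* A non-leader parity shard folds each skipped stripe's lowest epoch
	 * into the running minimum instead of aggregating it locally. */
	for (i = 0; i < 2; i++)
		min_unagg_eph = toy_min(min_unagg_eph, stripe_lo_eph[i]);

	/* Published boundary is 700 even though local aggregation hit 1000. */
	printf("query aggregate epoch = %llu\n",
	       (unsigned long long)toy_min(min_unagg_eph, agg_eph));
	return 0;
}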

src/object/srv_obj_migrate.c

+6 -1

@@ -977,7 +977,12 @@ __migrate_fetch_update_parity(struct migrate_one *mrone, daos_handle_t oh,
 
 		offset = iods[i].iod_recxs[0].rx_idx;
 		size = iods[i].iod_recxs[0].rx_nr;
-		parity_eph = ephs[i][0];
+		/* Use stable epoch for partial parity update to make sure
+		 * these partial updates are not below stable epoch boundary,
+		 * otherwise both EC and VOS aggregation might operate on
+		 * the same recxs.
+		 */
+		parity_eph = encode ? ephs[i][0] : mrone->mo_epoch;
 		tmp_iod = iods[i];
 		ptr = iov[i].iov_buf;
 		for (j = 1; j < iods[i].iod_nr; j++) {
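
The one-line change above can be read as a small decision function. This is a hedged sketch with hypothetical names (the real code sets parity_eph inline in __migrate_fetch_update_parity): when parity is re-encoded from data, the extent's own epoch is kept, but a partial parity copy is written at the migration's stable epoch so it never lands below the stable-epoch boundary that VOS aggregation works against.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical helper mirroring "encode ? ephs[i][0] : mrone->mo_epoch". */
static uint64_t
choose_parity_epoch(bool encode, uint64_t extent_eph, uint64_t stable_eph)
{
	return encode ? extent_eph : stable_eph;
}

int
main(void)
{
	printf("re-encoded parity epoch: %llu\n",
	       (unsigned long long)choose_parity_epoch(true, 500, 900));
	printf("partial parity epoch:    %llu\n",
	       (unsigned long long)choose_parity_epoch(false, 500, 900));
	return 0;
}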

src/pool/srv_target.c

+1

@@ -1473,6 +1473,7 @@ update_child_map(void *data)
 	if (child == NULL)
 		return -DER_NONEXIST;
 
+	ds_cont_child_reset_ec_agg_eph_all(child);
 	child->spc_map_version = pool->sp_map_version;
 	ds_pool_child_put(child);
 	return 0;

src/tests/ftest/daos_test/suite.yaml

+1 -1

@@ -27,7 +27,7 @@ timeouts:
   test_daos_extend_simple: 3600
   test_daos_oid_allocator: 640
   test_daos_checksum: 500
-  test_daos_rebuild_ec: 6400
+  test_daos_rebuild_ec: 7200
   test_daos_aggregate_ec: 200
   test_daos_degraded_ec: 1900
   test_daos_dedup: 220

src/tests/suite/daos_rebuild_ec.c

+5 -2

@@ -1,5 +1,5 @@
 /**
- * (C) Copyright 2016-2023 Intel Corporation.
+ * (C) Copyright 2016-2024 Intel Corporation.
  *
  * SPDX-License-Identifier: BSD-2-Clause-Patent
  */
@@ -1304,6 +1304,9 @@ rebuild_ec_parity_overwrite_fail_parity_internal(void **state, int *kill_shards,
 	parity_rank = get_rank_by_oid_shard(arg, oid, shard_idx);
 	rebuild_single_pool_rank(arg, parity_rank, true);
 
+	print_message("sleep 60 seconds for aggregation\n");
+	sleep(60);
+
 	/* fail data shard */
 	for (i = 0; i < nr; i++) {
 		shard_idx = (dkey_hash % 6 + kill_shards[i]) % 6;
@@ -1487,7 +1490,7 @@ static const struct CMUnitTest rebuild_tests[] = {
 	{"REBUILD46: fail parity shard and data shards after overwrite",
 	 rebuild_ec_overwrite_fail_parity_data, rebuild_ec_8nodes_setup,
 	 test_teardown},
-	{"REBUILD46: fail parity shard and data shards after overwrite with aggregation",
+	{"REBUILD47: fail parity shard and data shards after overwrite with aggregation",
 	 rebuild_ec_overwrite_fail_parity_data_with_parity, rebuild_ec_8nodes_setup,
 	 test_teardown},
 };
