From c5ce565c4ccd689efeff1bf44c858c9229e57c96 Mon Sep 17 00:00:00 2001 From: Xuezhao Liu Date: Thu, 15 Jun 2023 10:00:21 +0000 Subject: [PATCH 1/4] DAOS-13380 engine: refine tgt_nr check 1. for non-DAOS_TARGET_OVERSUBSCRIBE case fail to start engine if #cores is not enough 2. for DAOS_TARGET_OVERSUBSCRIBE case allow to force start engine The #nr_xs_helpers possibly be reduced for either case. Required-githooks: true Signed-off-by: Xuezhao Liu --- src/engine/init.c | 91 ++++++++++--------- .../control/dmg_server_set_logmasks.yaml | 1 + src/tests/ftest/harness/core_files.yaml | 1 + src/tests/ftest/pool/create_all_vm.yaml | 2 +- utils/nlt_server.yaml | 1 + 5 files changed, 50 insertions(+), 46 deletions(-) diff --git a/src/engine/init.c b/src/engine/init.c index 1e892e50828..f972bdb4658 100644 --- a/src/engine/init.c +++ b/src/engine/init.c @@ -36,8 +36,7 @@ static char modules[MAX_MODULE_OPTIONS + 1]; /** - * Number of target threads the user would like to start - * 0 means default value, see dss_tgt_nr_get(); + * Number of target threads the user would like to start. */ static unsigned int nr_threads; @@ -246,56 +245,61 @@ modules_load(void) return rc; } +static unsigned int +ncores_needed(unsigned int tgt_nr, unsigned int nr_helpers) +{ + return DAOS_TGT0_OFFSET + tgt_nr + nr_helpers; +} + /** - * Get the appropriate number of main XS based on the number of cores and - * passed in preferred number of threads. + * Check if the #targets and #nr_xs_helpers is valid to start server, the #nr_xs_helpers possibly + * be reduced. */ static int -dss_tgt_nr_get(unsigned int ncores, unsigned int nr, bool oversubscribe) +dss_tgt_nr_check(unsigned int ncores, unsigned int tgt_nr, bool oversubscribe) { - int tgt_nr; - D_ASSERT(ncores >= 1); /* at most 2 helper XS per target */ - if (dss_tgt_offload_xs_nr > 2 * nr) - dss_tgt_offload_xs_nr = 2 * nr; - else if (dss_tgt_offload_xs_nr == 0) + if (dss_tgt_offload_xs_nr > 2 * tgt_nr) { + D_PRINT("#nr_xs_helpers(%d) cannot exceed 2 times #targets (2 x %d = %d).\n", + dss_tgt_offload_xs_nr, tgt_nr, 2 * tgt_nr); + dss_tgt_offload_xs_nr = 2 * tgt_nr; + } else if (dss_tgt_offload_xs_nr == 0) { D_WARN("Suggest to config at least 1 helper XS per DAOS engine\n"); + } - /* Each system XS uses one core, and with dss_tgt_offload_xs_nr - * offload XS. Calculate the tgt_nr as the number of main XS based - * on number of cores. - */ -retry: - tgt_nr = ncores - DAOS_TGT0_OFFSET - dss_tgt_offload_xs_nr; - if (tgt_nr <= 0) - tgt_nr = 1; - - /* If user requires less target threads then set it as dss_tgt_nr, - * if user oversubscribes, then: - * . if oversubscribe is enabled, use the required number - * . if oversubscribe is disabled(default), - * use the number calculated above - * Note: oversubscribing may hurt performance. - */ - if (nr >= 1 && ((nr < tgt_nr) || oversubscribe)) { - tgt_nr = nr; - if (dss_tgt_offload_xs_nr > 2 * tgt_nr) - dss_tgt_offload_xs_nr = 2 * tgt_nr; - } else if (dss_tgt_offload_xs_nr > 2 * tgt_nr) { - dss_tgt_offload_xs_nr--; - goto retry; + if (oversubscribe) { + if (ncores_needed(tgt_nr, dss_tgt_offload_xs_nr) > ncores) { + if (ncores > DAOS_TGT0_OFFSET + tgt_nr) + dss_tgt_offload_xs_nr = ncores - DAOS_TGT0_OFFSET - tgt_nr; + else + dss_tgt_offload_xs_nr = 0; + + D_PRINT("Force to start engine with %d targets on %d cores, #nr_xs_helpers " + "set as %d.\n", + tgt_nr, ncores, dss_tgt_offload_xs_nr); + } + goto out; } - if (tgt_nr != nr) - D_PRINT("%d target XS(xstream) requested (#cores %d); " - "use (%d) target XS\n", nr, ncores, tgt_nr); + if (ncores_needed(tgt_nr, dss_tgt_offload_xs_nr) > ncores) { + if (ncores < DAOS_TGT0_OFFSET + tgt_nr) { + D_ERROR("cannot start engine with %d targets on %d cores, may try with " + "DAOS_TARGET_OVERSUBSCRIBE=1\n", + tgt_nr, ncores); + return -DER_INVAL; + } + dss_tgt_offload_xs_nr = ncores - DAOS_TGT0_OFFSET - tgt_nr; + D_PRINT("Start engine with %d targets on %d cores, #nr_xs_helpers set as %d.\n", + tgt_nr, ncores, dss_tgt_offload_xs_nr); + } +out: if (dss_tgt_offload_xs_nr % tgt_nr != 0) dss_helper_pool = true; - return tgt_nr; + return 0; } static int @@ -317,14 +321,12 @@ dss_topo_init() depth = hwloc_get_type_depth(dss_topo, HWLOC_OBJ_NUMANODE); numa_node_nr = hwloc_get_nbobjs_by_depth(dss_topo, depth); d_getenv_bool("DAOS_TARGET_OVERSUBSCRIBE", &tgt_oversub); + dss_tgt_nr = nr_threads; /* if no NUMA node was specified, or NUMA data unavailable */ /* fall back to the legacy core allocation algorithm */ if (dss_numa_node == -1 || numa_node_nr <= 0) { D_PRINT("Using legacy core allocation algorithm\n"); - dss_tgt_nr = dss_tgt_nr_get(dss_core_nr, nr_threads, - tgt_oversub); - if (dss_core_offset >= dss_core_nr) { D_ERROR("invalid dss_core_offset %u " "(set by \"-f\" option)," @@ -332,7 +334,8 @@ dss_topo_init() dss_core_offset, dss_core_nr - 1); return -DER_INVAL; } - return 0; + + return dss_tgt_nr_check(dss_core_nr, dss_tgt_nr, tgt_oversub); } if (dss_numa_node > numa_node_nr) { @@ -377,17 +380,15 @@ dss_topo_init() hwloc_bitmap_asprintf(&cpuset, core_allocation_bitmap); free(cpuset); - dss_tgt_nr = dss_tgt_nr_get(dss_num_cores_numa_node, nr_threads, - tgt_oversub); if (dss_core_offset >= dss_num_cores_numa_node) { D_ERROR("invalid dss_core_offset %d (set by \"-f\" option), " "should within range [0, %d]", dss_core_offset, dss_num_cores_numa_node - 1); return -DER_INVAL; } - D_PRINT("Using NUMA core allocation algorithm\n"); - return 0; + + return dss_tgt_nr_check(dss_num_cores_numa_node, dss_tgt_nr, tgt_oversub); } static ABT_mutex server_init_state_mutex; diff --git a/src/tests/ftest/control/dmg_server_set_logmasks.yaml b/src/tests/ftest/control/dmg_server_set_logmasks.yaml index b4e3a1ddfeb..68e58cc7488 100644 --- a/src/tests/ftest/control/dmg_server_set_logmasks.yaml +++ b/src/tests/ftest/control/dmg_server_set_logmasks.yaml @@ -6,6 +6,7 @@ server_config: engines_per_host: 1 engines: 0: + targets: 4 storage: 0: class: ram diff --git a/src/tests/ftest/harness/core_files.yaml b/src/tests/ftest/harness/core_files.yaml index 8133398a42f..04cb67f7a11 100644 --- a/src/tests/ftest/harness/core_files.yaml +++ b/src/tests/ftest/harness/core_files.yaml @@ -5,6 +5,7 @@ server_config: engines_per_host: 1 engines: 0: + targets: 4 storage: 0: class: ram diff --git a/src/tests/ftest/pool/create_all_vm.yaml b/src/tests/ftest/pool/create_all_vm.yaml index 0e030ee8cbc..2c053e5b038 100644 --- a/src/tests/ftest/pool/create_all_vm.yaml +++ b/src/tests/ftest/pool/create_all_vm.yaml @@ -32,7 +32,7 @@ server_config: engines_per_host: 1 engines: 0: - targets: 5 + targets: 4 nr_xs_helpers: 0 storage: 0: diff --git a/utils/nlt_server.yaml b/utils/nlt_server.yaml index 4b9a1a9ffd8..5d0d2d9b3ed 100644 --- a/utils/nlt_server.yaml +++ b/utils/nlt_server.yaml @@ -14,6 +14,7 @@ engines: - DAOS_MD_CAP=1024 - DAOS_STRICT_SHUTDOWN=1 - CRT_CTX_SHARE_ADDR=0 + - DAOS_TARGET_OVERSUBSCRIBE=1 - ABT_STACK_OVERFLOW_CHECK=mprotect storage: - From de91aa4c66dc1decdd4902a696a0d3928d22be05 Mon Sep 17 00:00:00 2001 From: Tom Nabarro Date: Fri, 16 Jun 2023 17:27:05 +0100 Subject: [PATCH 2/4] updating target count based on allocated verses requested is no longer necessary Required-githooks: true Signed-off-by: Tom Nabarro --- src/control/server/ctl_storage_rpc_test.go | 6 ++++-- src/control/server/instance.go | 8 -------- src/control/server/instance_exec.go | 6 ------ 3 files changed, 4 insertions(+), 16 deletions(-) diff --git a/src/control/server/ctl_storage_rpc_test.go b/src/control/server/ctl_storage_rpc_test.go index 272d5174b24..2c2c7d65dba 100644 --- a/src/control/server/ctl_storage_rpc_test.go +++ b/src/control/server/ctl_storage_rpc_test.go @@ -1516,7 +1516,10 @@ func TestServer_CtlSvc_StorageScan_PostEngineStart(t *testing.T) { var engineCfgs []*engine.Config for i, sc := range tc.storageCfgs { log.Debugf("storage cfg contains bdevs %v for engine %d", sc.Bdevs(), i) - engineCfgs = append(engineCfgs, engine.MockConfig().WithStorage(sc...)) + engineCfgs = append(engineCfgs, + engine.MockConfig(). + WithStorage(sc...). + WithTargetCount(tc.engineTargetCount[i])) } sCfg := config.DefaultServer().WithEngines(engineCfgs...) cs := mockControlService(t, log, sCfg, csbmbc, tc.smbc, tc.smsc) @@ -1568,7 +1571,6 @@ func TestServer_CtlSvc_StorageScan_PostEngineStart(t *testing.T) { } ne.setDrpcClient(newMockDrpcClient(dcc)) ne._superblock.Rank = ranklist.NewRankPtr(uint32(idx + 1)) - ne.setTargetCount(tc.engineTargetCount[idx]) for _, tc := range ne.storage.GetBdevConfigs() { tc.Bdev.DeviceRoles.OptionBits = storage.OptionBits(storage.BdevRoleAll) } diff --git a/src/control/server/instance.go b/src/control/server/instance.go index 4e7f2640525..a51576441c3 100644 --- a/src/control/server/instance.go +++ b/src/control/server/instance.go @@ -338,14 +338,6 @@ func (ei *EngineInstance) setHugepageSz(hpSizeMb int) { ei.runner.GetConfig().HugepageSz = hpSizeMb } -// setTargetCount updates target count in engine config. -func (ei *EngineInstance) setTargetCount(numTargets int) { - ei.Lock() - defer ei.Unlock() - - ei.runner.GetConfig().TargetCount = numTargets -} - // GetTargetCount returns the target count set for this instance. func (ei *EngineInstance) GetTargetCount() int { ei.RLock() diff --git a/src/control/server/instance_exec.go b/src/control/server/instance_exec.go index 3a31d036137..ab22cb4504f 100644 --- a/src/control/server/instance_exec.go +++ b/src/control/server/instance_exec.go @@ -90,12 +90,6 @@ func (ei *EngineInstance) finishStartup(ctx context.Context, ready *srvpb.Notify if err := ei.handleReady(ctx, ready); err != nil { return err } - // update engine target count to reflect allocated number of targets, not number requested - // when starting - // NOTE: Engine mem_size passed on engine invocation is based on the number of targets - // requested in config so if number of targets allocated doesn't match the number of - // targets requested the mem_size value may be inappropriate. - ei.setTargetCount(int(ready.GetNtgts())) ei.ready.SetTrue() From 3a2037e0c8068b04e31f29f36bb789740440f2b6 Mon Sep 17 00:00:00 2001 From: Xuezhao Liu Date: Wed, 5 Jul 2023 03:14:10 +0000 Subject: [PATCH 3/4] DAOS-13380 engine: some fixes and refine docs Required-githooks: true Signed-off-by: Xuezhao Liu --- docs/admin/deployment.md | 8 ++++++++ src/tests/ftest/control/daos_system_query.yaml | 1 + src/tests/ftest/control/dmg_pool_query_test.yaml | 6 +++--- src/tests/ftest/pool/query_attribute.yaml | 2 +- src/tests/ftest/server/daos_server_config.yaml | 5 ----- src/tests/ftest/telemetry/dkey_akey_enum_punch.yaml | 2 +- 6 files changed, 14 insertions(+), 10 deletions(-) diff --git a/docs/admin/deployment.md b/docs/admin/deployment.md index d8a3808495d..dd0820798fc 100644 --- a/docs/admin/deployment.md +++ b/docs/admin/deployment.md @@ -1356,6 +1356,14 @@ per four target threads, for example `targets: 16` and `nr_xs_helpers: 4`. The server should have sufficiently many physical cores to support the number of targets plus the additional service threads. +The `targets:` requirement is mandatory, if the number of physical cores are +not enough it will fail the starting of the daos engine, or configures with +"DAOS_TARGET_OVERSUBSCRIBE=1" to force starting daos engine with that number +of targets (possibly hurt performance as multiple XS compete on same core). + +The 'nr_xs_helpers:' requirement is not mandatory, if the number of physical +cores are not enough it will reduce the number of helper XS to start engine. + ## Storage Formatting diff --git a/src/tests/ftest/control/daos_system_query.yaml b/src/tests/ftest/control/daos_system_query.yaml index 9b1a343c11d..3cf313475b9 100644 --- a/src/tests/ftest/control/daos_system_query.yaml +++ b/src/tests/ftest/control/daos_system_query.yaml @@ -7,6 +7,7 @@ server_config: engines: 0: pinned_numa_node: 0 + targets: 4 nr_xs_helpers: 1 fabric_iface_port: 31416 log_file: daos_server0.log diff --git a/src/tests/ftest/control/dmg_pool_query_test.yaml b/src/tests/ftest/control/dmg_pool_query_test.yaml index 3cf5b7bd137..26616c46cbb 100644 --- a/src/tests/ftest/control/dmg_pool_query_test.yaml +++ b/src/tests/ftest/control/dmg_pool_query_test.yaml @@ -8,7 +8,7 @@ server_config: engines_per_host: 1 engines: 0: - targets: 8 + targets: 4 storage: auto system_ram_reserved: 64 pool: @@ -26,8 +26,8 @@ ior: exp_vals: pool_status: 0 - total_targets: 8 - active_targets: 8 + total_targets: 4 + active_targets: 4 total_engines: 1 disabled_targets: 0 version: 1 diff --git a/src/tests/ftest/pool/query_attribute.yaml b/src/tests/ftest/pool/query_attribute.yaml index d91f36ac52f..7222fedd84c 100644 --- a/src/tests/ftest/pool/query_attribute.yaml +++ b/src/tests/ftest/pool/query_attribute.yaml @@ -6,7 +6,7 @@ server_config: engines_per_host: 1 engines: 0: - targets: 8 + targets: 4 nr_xs_helpers: 0 storage: 0: diff --git a/src/tests/ftest/server/daos_server_config.yaml b/src/tests/ftest/server/daos_server_config.yaml index 0a659990306..b7f01f43d1d 100644 --- a/src/tests/ftest/server/daos_server_config.yaml +++ b/src/tests/ftest/server/daos_server_config.yaml @@ -164,11 +164,6 @@ server_config_val: !mux - "nr_xs_helpers" - -10000 - "PASS" - targets_negative: - config_val: - - "targets" - - -1 - - "PASS" targets_str: config_val: - "targets" diff --git a/src/tests/ftest/telemetry/dkey_akey_enum_punch.yaml b/src/tests/ftest/telemetry/dkey_akey_enum_punch.yaml index acd2fa472df..f28a32a1733 100644 --- a/src/tests/ftest/telemetry/dkey_akey_enum_punch.yaml +++ b/src/tests/ftest/telemetry/dkey_akey_enum_punch.yaml @@ -7,7 +7,7 @@ server_config: engines_per_host: 1 engines: 0: - targets: 8 + targets: 4 nr_xs_helpers: 0 storage: 0: From 3d3ac8c82c21e6e8c899dda7daa101d1f7929986 Mon Sep 17 00:00:00 2001 From: Xuezhao Liu Date: Mon, 17 Jul 2023 07:37:05 +0000 Subject: [PATCH 4/4] DAOS-13380 engine: address comment make the #xs_helpers requirement be mandatory. Required-githooks: true Signed-off-by: Xuezhao Liu --- docs/admin/deployment.md | 12 ++++---- src/engine/init.c | 28 ++++++------------- .../ftest/control/daos_system_query.yaml | 2 +- .../control/dmg_server_set_logmasks.yaml | 2 ++ src/tests/ftest/harness/core_files.yaml | 2 ++ src/tests/ftest/server/daos_server_dump.yaml | 1 + 6 files changed, 20 insertions(+), 27 deletions(-) diff --git a/docs/admin/deployment.md b/docs/admin/deployment.md index dd0820798fc..2484829675b 100644 --- a/docs/admin/deployment.md +++ b/docs/admin/deployment.md @@ -1356,13 +1356,11 @@ per four target threads, for example `targets: 16` and `nr_xs_helpers: 4`. The server should have sufficiently many physical cores to support the number of targets plus the additional service threads. -The `targets:` requirement is mandatory, if the number of physical cores are -not enough it will fail the starting of the daos engine, or configures with -"DAOS_TARGET_OVERSUBSCRIBE=1" to force starting daos engine with that number -of targets (possibly hurt performance as multiple XS compete on same core). - -The 'nr_xs_helpers:' requirement is not mandatory, if the number of physical -cores are not enough it will reduce the number of helper XS to start engine. +The 'targets:' and 'nr_xs_helpers:' requirement are mandatory, if the number +of physical cores are not enough it will fail the starting of the daos engine +(notes that 2 cores reserved for system service), or configures with ENV +"DAOS_TARGET_OVERSUBSCRIBE=1" to force starting daos engine (possibly hurts +performance as multiple XS compete on same core). ## Storage Formatting diff --git a/src/engine/init.c b/src/engine/init.c index 47fc4a3d972..874fbf62ebf 100644 --- a/src/engine/init.c +++ b/src/engine/init.c @@ -274,29 +274,19 @@ dss_tgt_nr_check(unsigned int ncores, unsigned int tgt_nr, bool oversubscribe) } if (oversubscribe) { - if (ncores_needed(tgt_nr, dss_tgt_offload_xs_nr) > ncores) { - if (ncores > DAOS_TGT0_OFFSET + tgt_nr) - dss_tgt_offload_xs_nr = ncores - DAOS_TGT0_OFFSET - tgt_nr; - else - dss_tgt_offload_xs_nr = 0; - - D_PRINT("Force to start engine with %d targets on %d cores, #nr_xs_helpers " - "set as %d.\n", - tgt_nr, ncores, dss_tgt_offload_xs_nr); - } + if (ncores_needed(tgt_nr, dss_tgt_offload_xs_nr) > ncores) + D_PRINT("Force to start engine with %d targets %d xs_helpers on %d cores(" + "%d cores reserved for system service).\n", + tgt_nr, dss_tgt_offload_xs_nr, ncores, DAOS_TGT0_OFFSET); goto out; } if (ncores_needed(tgt_nr, dss_tgt_offload_xs_nr) > ncores) { - if (ncores < DAOS_TGT0_OFFSET + tgt_nr) { - D_ERROR("cannot start engine with %d targets on %d cores, may try with " - "DAOS_TARGET_OVERSUBSCRIBE=1\n", - tgt_nr, ncores); - return -DER_INVAL; - } - dss_tgt_offload_xs_nr = ncores - DAOS_TGT0_OFFSET - tgt_nr; - D_PRINT("Start engine with %d targets on %d cores, #nr_xs_helpers set as %d.\n", - tgt_nr, ncores, dss_tgt_offload_xs_nr); + D_ERROR("cannot start engine with %d targets %d xs_helpers on %d cores, may try " + "with DAOS_TARGET_OVERSUBSCRIBE=1 or reduce #targets/#nr_xs_helpers(" + "%d cores reserved for system service).\n", + tgt_nr, dss_tgt_offload_xs_nr, ncores, DAOS_TGT0_OFFSET); + return -DER_INVAL; } out: diff --git a/src/tests/ftest/control/daos_system_query.yaml b/src/tests/ftest/control/daos_system_query.yaml index 3cf313475b9..0f51450e877 100644 --- a/src/tests/ftest/control/daos_system_query.yaml +++ b/src/tests/ftest/control/daos_system_query.yaml @@ -8,7 +8,7 @@ server_config: 0: pinned_numa_node: 0 targets: 4 - nr_xs_helpers: 1 + nr_xs_helpers: 0 fabric_iface_port: 31416 log_file: daos_server0.log log_mask: DEBUG,MEM=ERR diff --git a/src/tests/ftest/control/dmg_server_set_logmasks.yaml b/src/tests/ftest/control/dmg_server_set_logmasks.yaml index 68e58cc7488..70d4d93995e 100644 --- a/src/tests/ftest/control/dmg_server_set_logmasks.yaml +++ b/src/tests/ftest/control/dmg_server_set_logmasks.yaml @@ -7,6 +7,8 @@ server_config: engines: 0: targets: 4 + env_vars: + - DAOS_TARGET_OVERSUBSCRIBE=1 storage: 0: class: ram diff --git a/src/tests/ftest/harness/core_files.yaml b/src/tests/ftest/harness/core_files.yaml index 04cb67f7a11..7655e2b30f6 100644 --- a/src/tests/ftest/harness/core_files.yaml +++ b/src/tests/ftest/harness/core_files.yaml @@ -6,6 +6,8 @@ server_config: engines: 0: targets: 4 + env_vars: + - DAOS_TARGET_OVERSUBSCRIBE=1 storage: 0: class: ram diff --git a/src/tests/ftest/server/daos_server_dump.yaml b/src/tests/ftest/server/daos_server_dump.yaml index 827edcb3934..a9ffd810f69 100644 --- a/src/tests/ftest/server/daos_server_dump.yaml +++ b/src/tests/ftest/server/daos_server_dump.yaml @@ -11,6 +11,7 @@ server_config: engines: 0: targets: 2 + nr_xs_helpers: 1 storage: 0: class: ram