Skip to content

Commit 9eef393

Browse files
authored
[pick]support cgroup v2 (#42465)
## Proposed changes pick #39991 #39374 #36663
1 parent 4a62d9e commit 9eef393

File tree

11 files changed

+405
-166
lines changed

11 files changed

+405
-166
lines changed

be/src/agent/cgroup_cpu_ctl.cpp

+277-98
Large diffs are not rendered by default.

be/src/agent/cgroup_cpu_ctl.h

+83-12
Original file line numberDiff line numberDiff line change
@@ -30,14 +30,14 @@ namespace doris {
3030

3131
// cgroup cpu.cfs_quota_us default value, it means disable cpu hard limit
3232
const static int CGROUP_CPU_HARD_LIMIT_DEFAULT_VALUE = -1;
33+
const static std::string CGROUP_V2_CPU_HARD_LIMIT_DEFAULT_VALUE = "max 100000";
3334

3435
class CgroupCpuCtl {
3536
public:
3637
virtual ~CgroupCpuCtl() = default;
37-
CgroupCpuCtl() = default;
3838
CgroupCpuCtl(uint64_t wg_id) { _wg_id = wg_id; }
3939

40-
virtual Status init();
40+
virtual Status init() = 0;
4141

4242
virtual Status add_thread_to_cgroup() = 0;
4343

@@ -48,18 +48,44 @@ class CgroupCpuCtl {
4848
// for log
4949
void get_cgroup_cpu_info(uint64_t* cpu_shares, int* cpu_hard_limit);
5050

51-
virtual Status delete_unused_cgroup_path(std::set<uint64_t>& used_wg_ids) = 0;
51+
static void init_doris_cgroup_path();
5252

53-
protected:
54-
Status write_cg_sys_file(std::string file_path, int value, std::string msg, bool is_append);
53+
static Status delete_unused_cgroup_path(std::set<uint64_t>& used_wg_ids);
54+
55+
static std::unique_ptr<CgroupCpuCtl> create_cgroup_cpu_ctl(uint64_t wg_id);
56+
57+
static bool is_a_valid_cgroup_path(std::string cg_path);
5558

59+
static uint64_t cpu_soft_limit_default_value();
60+
61+
protected:
5662
virtual Status modify_cg_cpu_hard_limit_no_lock(int cpu_hard_limit) = 0;
5763

5864
virtual Status modify_cg_cpu_soft_limit_no_lock(int cpu_shares) = 0;
5965

60-
std::string _doris_cgroup_cpu_path;
61-
uint64_t _cpu_core_num = CpuInfo::num_cores();
62-
uint64_t _cpu_cfs_period_us = 100000;
66+
Status add_thread_to_cgroup(std::string task_file);
67+
68+
static Status write_cg_sys_file(std::string file_path, std::string value, std::string msg,
69+
bool is_append);
70+
71+
static Status init_cgroup_v2_query_path_public_file(std::string home_path,
72+
std::string query_path);
73+
74+
protected:
75+
inline static uint64_t _cpu_core_num;
76+
const static uint64_t _cpu_cfs_period_us = 100000;
77+
inline static std::string _doris_cgroup_cpu_path = "";
78+
inline static std::string _doris_cgroup_cpu_query_path = "";
79+
inline static bool _is_enable_cgroup_v1_in_env = false;
80+
inline static bool _is_enable_cgroup_v2_in_env = false;
81+
inline static bool _is_cgroup_query_path_valid = false;
82+
83+
// cgroup v2 public file
84+
inline static std::string _doris_cgroup_cpu_path_subtree_ctl_file = "";
85+
inline static std::string _cgroup_v2_query_path_subtree_ctl_file = "";
86+
inline static std::string _doris_cg_v2_procs_file = "";
87+
88+
protected:
6389
int _cpu_hard_limit = 0;
6490
std::shared_mutex _lock_mutex;
6591
bool _init_succ = false;
@@ -96,20 +122,65 @@ class CgroupCpuCtl {
96122
class CgroupV1CpuCtl : public CgroupCpuCtl {
97123
public:
98124
CgroupV1CpuCtl(uint64_t tg_id) : CgroupCpuCtl(tg_id) {}
99-
CgroupV1CpuCtl() = default;
100125
Status init() override;
101126
Status modify_cg_cpu_hard_limit_no_lock(int cpu_hard_limit) override;
102127
Status modify_cg_cpu_soft_limit_no_lock(int cpu_shares) override;
103128
Status add_thread_to_cgroup() override;
104129

105-
Status delete_unused_cgroup_path(std::set<uint64_t>& used_wg_ids) override;
106-
107130
private:
108-
std::string _cgroup_v1_cpu_query_path;
109131
std::string _cgroup_v1_cpu_tg_path; // workload group path
110132
std::string _cgroup_v1_cpu_tg_quota_file;
111133
std::string _cgroup_v1_cpu_tg_shares_file;
112134
std::string _cgroup_v1_cpu_tg_task_file;
113135
};
114136

137+
/*
138+
NOTE: cgroup v2 directory structure
139+
1 root path:
140+
/sys/fs/cgroup
141+
142+
2 doris home path:
143+
/sys/fs/cgroup/{doris_home}/
144+
145+
3 doris home subtree_control file:
146+
/sys/fs/cgroup/{doris_home}/cgroup.subtree_control
147+
148+
4 query path:
149+
/sys/fs/cgroup/{doris_home}/query/
150+
151+
5 query path subtree_control file:
152+
/sys/fs/cgroup/{doris_home}/query/cgroup.subtree_control
153+
154+
6 query path procs file:
155+
/sys/fs/cgroup/{doris_home}/query/cgroup.procs
156+
157+
7 workload group path:
158+
/sys/fs/cgroup/{doris_home}/query/{workload_group_id}
159+
160+
8 workload group cpu.max file:
161+
/sys/fs/cgroup/{doris_home}/query/{workload_group_id}/cpu.max
162+
163+
9 workload group cpu.weight file:
164+
/sys/fs/cgroup/{doris_home}/query/{workload_group_id}/cpu.weight
165+
166+
10 workload group cgroup type file:
167+
/sys/fs/cgroup/{doris_home}/query/{workload_group_id}/cgroup.type
168+
169+
*/
170+
class CgroupV2CpuCtl : public CgroupCpuCtl {
171+
public:
172+
CgroupV2CpuCtl(uint64_t tg_id) : CgroupCpuCtl(tg_id) {}
173+
Status init() override;
174+
Status modify_cg_cpu_hard_limit_no_lock(int cpu_hard_limit) override;
175+
Status modify_cg_cpu_soft_limit_no_lock(int cpu_shares) override;
176+
Status add_thread_to_cgroup() override;
177+
178+
private:
179+
std::string _cgroup_v2_query_wg_path;
180+
std::string _cgroup_v2_query_wg_cpu_max_file;
181+
std::string _cgroup_v2_query_wg_cpu_weight_file;
182+
std::string _cgroup_v2_query_wg_thread_file;
183+
std::string _cgroup_v2_query_wg_type_file;
184+
};
185+
115186
} // namespace doris

be/src/runtime/exec_env.h

+3-3
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ class MemTracker;
8080
class StorageEngine;
8181
class ResultBufferMgr;
8282
class ResultQueueMgr;
83-
class RuntimeQueryStatiticsMgr;
83+
class RuntimeQueryStatisticsMgr;
8484
class TMasterInfo;
8585
class LoadChannelMgr;
8686
class LoadStreamMgr;
@@ -162,7 +162,7 @@ class ExecEnv {
162162
pipeline::TaskScheduler* pipeline_task_scheduler() { return _without_group_task_scheduler; }
163163
WorkloadGroupMgr* workload_group_mgr() { return _workload_group_manager; }
164164
WorkloadSchedPolicyMgr* workload_sched_policy_mgr() { return _workload_sched_mgr; }
165-
RuntimeQueryStatiticsMgr* runtime_query_statistics_mgr() {
165+
RuntimeQueryStatisticsMgr* runtime_query_statistics_mgr() {
166166
return _runtime_query_statistics_mgr;
167167
}
168168

@@ -458,7 +458,7 @@ class ExecEnv {
458458

459459
WorkloadSchedPolicyMgr* _workload_sched_mgr = nullptr;
460460

461-
RuntimeQueryStatiticsMgr* _runtime_query_statistics_mgr = nullptr;
461+
RuntimeQueryStatisticsMgr* _runtime_query_statistics_mgr = nullptr;
462462

463463
std::unique_ptr<pipeline::PipelineTracerContext> _pipeline_tracer_ctx;
464464
std::unique_ptr<segment_v2::TmpFileDirs> _tmp_file_dirs;

be/src/runtime/exec_env_init.cpp

+2-1
Original file line numberDiff line numberDiff line change
@@ -211,7 +211,8 @@ Status ExecEnv::_init(const std::vector<StorePath>& store_paths,
211211

212212
// NOTE: runtime query statistics mgr could be visited by query and daemon thread
213213
// so it should be created before all queries begin and deleted after all query and daemon threads have stopped
214-
_runtime_query_statistics_mgr = new RuntimeQueryStatiticsMgr();
214+
_runtime_query_statistics_mgr = new RuntimeQueryStatisticsMgr();
215+
CgroupCpuCtl::init_doris_cgroup_path();
215216

216217
std::vector<doris::CachePath> cache_paths;
217218
init_file_cache_factory(cache_paths);

be/src/runtime/runtime_query_statistics_mgr.cpp

+10-10
Original file line numberDiff line numberDiff line change
@@ -35,10 +35,10 @@ void QueryStatisticsCtx::collect_query_statistics(TQueryStatistics* tq_s) {
3535
tq_s->__set_workload_group_id(_wg_id);
3636
}
3737

38-
void RuntimeQueryStatiticsMgr::register_query_statistics(std::string query_id,
39-
std::shared_ptr<QueryStatistics> qs_ptr,
40-
TNetworkAddress fe_addr,
41-
TQueryType::type query_type) {
38+
void RuntimeQueryStatisticsMgr::register_query_statistics(std::string query_id,
39+
std::shared_ptr<QueryStatistics> qs_ptr,
40+
TNetworkAddress fe_addr,
41+
TQueryType::type query_type) {
4242
std::lock_guard<std::shared_mutex> write_lock(_qs_ctx_map_lock);
4343
if (_query_statistics_ctx_map.find(query_id) == _query_statistics_ctx_map.end()) {
4444
_query_statistics_ctx_map[query_id] =
@@ -47,7 +47,7 @@ void RuntimeQueryStatiticsMgr::register_query_statistics(std::string query_id,
4747
_query_statistics_ctx_map.at(query_id)->_qs_list.push_back(qs_ptr);
4848
}
4949

50-
void RuntimeQueryStatiticsMgr::report_runtime_query_statistics() {
50+
void RuntimeQueryStatisticsMgr::report_runtime_query_statistics() {
5151
int64_t be_id = ExecEnv::GetInstance()->master_info()->backend_id;
5252
// 1 get query statistics map
5353
std::map<TNetworkAddress, std::map<std::string, TQueryStatistics>> fe_qs_map;
@@ -166,7 +166,7 @@ void RuntimeQueryStatiticsMgr::report_runtime_query_statistics() {
166166
}
167167
}
168168

169-
void RuntimeQueryStatiticsMgr::set_query_finished(std::string query_id) {
169+
void RuntimeQueryStatisticsMgr::set_query_finished(std::string query_id) {
170170
// NOTE: here must be a write lock
171171
std::lock_guard<std::shared_mutex> write_lock(_qs_ctx_map_lock);
172172
// when a query get query_ctx succ, but failed before create node/operator,
@@ -178,7 +178,7 @@ void RuntimeQueryStatiticsMgr::set_query_finished(std::string query_id) {
178178
}
179179
}
180180

181-
std::shared_ptr<QueryStatistics> RuntimeQueryStatiticsMgr::get_runtime_query_statistics(
181+
std::shared_ptr<QueryStatistics> RuntimeQueryStatisticsMgr::get_runtime_query_statistics(
182182
std::string query_id) {
183183
std::shared_lock<std::shared_mutex> read_lock(_qs_ctx_map_lock);
184184
if (_query_statistics_ctx_map.find(query_id) == _query_statistics_ctx_map.end()) {
@@ -191,7 +191,7 @@ std::shared_ptr<QueryStatistics> RuntimeQueryStatiticsMgr::get_runtime_query_sta
191191
return qs_ptr;
192192
}
193193

194-
void RuntimeQueryStatiticsMgr::get_metric_map(
194+
void RuntimeQueryStatisticsMgr::get_metric_map(
195195
std::string query_id, std::map<WorkloadMetricType, std::string>& metric_map) {
196196
QueryStatistics ret_qs;
197197
int64_t query_time_ms = 0;
@@ -212,15 +212,15 @@ void RuntimeQueryStatiticsMgr::get_metric_map(
212212
std::to_string(ret_qs.get_current_used_memory_bytes()));
213213
}
214214

215-
void RuntimeQueryStatiticsMgr::set_workload_group_id(std::string query_id, int64_t wg_id) {
215+
void RuntimeQueryStatisticsMgr::set_workload_group_id(std::string query_id, int64_t wg_id) {
216216
// wg id just need eventual consistency, read lock is ok
217217
std::shared_lock<std::shared_mutex> read_lock(_qs_ctx_map_lock);
218218
if (_query_statistics_ctx_map.find(query_id) != _query_statistics_ctx_map.end()) {
219219
_query_statistics_ctx_map.at(query_id)->_wg_id = wg_id;
220220
}
221221
}
222222

223-
void RuntimeQueryStatiticsMgr::get_active_be_tasks_block(vectorized::Block* block) {
223+
void RuntimeQueryStatisticsMgr::get_active_be_tasks_block(vectorized::Block* block) {
224224
std::shared_lock<std::shared_mutex> read_lock(_qs_ctx_map_lock);
225225
int64_t be_id = ExecEnv::GetInstance()->master_info()->backend_id;
226226

be/src/runtime/runtime_query_statistics_mgr.h

+3-3
Original file line numberDiff line numberDiff line change
@@ -54,10 +54,10 @@ class QueryStatisticsCtx {
5454
int64_t _query_start_time;
5555
};
5656

57-
class RuntimeQueryStatiticsMgr {
57+
class RuntimeQueryStatisticsMgr {
5858
public:
59-
RuntimeQueryStatiticsMgr() = default;
60-
~RuntimeQueryStatiticsMgr() = default;
59+
RuntimeQueryStatisticsMgr() = default;
60+
~RuntimeQueryStatisticsMgr() = default;
6161

6262
void register_query_statistics(std::string query_id, std::shared_ptr<QueryStatistics> qs_ptr,
6363
TNetworkAddress fe_addr, TQueryType::type query_type);

be/src/runtime/workload_group/workload_group.cpp

+14-11
Original file line numberDiff line numberDiff line change
@@ -44,11 +44,9 @@
4444

4545
namespace doris {
4646

47-
const static uint64_t CPU_SHARE_DEFAULT_VALUE = 1024;
4847
const static std::string MEMORY_LIMIT_DEFAULT_VALUE = "0%";
4948
const static bool ENABLE_MEMORY_OVERCOMMIT_DEFAULT_VALUE = true;
5049
const static int CPU_HARD_LIMIT_DEFAULT_VALUE = -1;
51-
const static uint64_t CPU_SOFT_LIMIT_DEFAULT_VALUE = 1024;
5250
const static int SPILL_LOW_WATERMARK_DEFAULT_VALUE = 50;
5351
const static int SPILL_HIGH_WATERMARK_DEFAULT_VALUE = 80;
5452

@@ -329,7 +327,7 @@ Status WorkloadGroupInfo::parse_topic_info(const TWorkloadGroupInfo& tworkload_g
329327
workload_group_info->version = version;
330328

331329
// 4 cpu_share
332-
uint64_t cpu_share = CPU_SHARE_DEFAULT_VALUE;
330+
uint64_t cpu_share = CgroupCpuCtl::cpu_soft_limit_default_value();
333331
if (tworkload_group_info.__isset.cpu_share) {
334332
cpu_share = tworkload_group_info.cpu_share;
335333
}
@@ -433,14 +431,18 @@ void WorkloadGroup::upsert_task_scheduler(WorkloadGroupInfo* tg_info, ExecEnv* e
433431

434432
std::lock_guard<std::shared_mutex> wlock(_task_sched_lock);
435433
if (config::doris_cgroup_cpu_path != "" && _cgroup_cpu_ctl == nullptr) {
436-
std::unique_ptr<CgroupCpuCtl> cgroup_cpu_ctl = std::make_unique<CgroupV1CpuCtl>(tg_id);
437-
Status ret = cgroup_cpu_ctl->init();
438-
if (ret.ok()) {
439-
_cgroup_cpu_ctl = std::move(cgroup_cpu_ctl);
440-
LOG(INFO) << "[upsert wg thread pool] cgroup init success, wg_id=" << tg_id;
434+
std::unique_ptr<CgroupCpuCtl> cgroup_cpu_ctl = CgroupCpuCtl::create_cgroup_cpu_ctl(tg_id);
435+
if (cgroup_cpu_ctl) {
436+
Status ret = cgroup_cpu_ctl->init();
437+
if (ret.ok()) {
438+
_cgroup_cpu_ctl = std::move(cgroup_cpu_ctl);
439+
LOG(INFO) << "[upsert wg thread pool] cgroup init success, wg_id=" << tg_id;
440+
} else {
441+
LOG(INFO) << "[upsert wg thread pool] cgroup init failed, wg_id=" << tg_id
442+
<< ", reason=" << ret.to_string();
443+
}
441444
} else {
442-
LOG(INFO) << "[upsert wg thread pool] cgroup init failed, wg_id= " << tg_id
443-
<< ", reason=" << ret.to_string();
445+
LOG(INFO) << "[upsert wg thread pool] create cgroup cpu ctl for " << tg_id << " failed";
444446
}
445447
}
446448

@@ -533,7 +535,8 @@ void WorkloadGroup::upsert_task_scheduler(WorkloadGroupInfo* tg_info, ExecEnv* e
533535
if (enable_cpu_hard_limit) {
534536
if (cpu_hard_limit > 0) {
535537
_cgroup_cpu_ctl->update_cpu_hard_limit(cpu_hard_limit);
536-
_cgroup_cpu_ctl->update_cpu_soft_limit(CPU_SOFT_LIMIT_DEFAULT_VALUE);
538+
_cgroup_cpu_ctl->update_cpu_soft_limit(
539+
CgroupCpuCtl::cpu_soft_limit_default_value());
537540
} else {
538541
LOG(INFO) << "[upsert wg thread pool] enable cpu hard limit but value is illegal: "
539542
<< cpu_hard_limit << ", gid=" << tg_id;

be/src/runtime/workload_group/workload_group_manager.cpp

+7-20
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,8 @@ void WorkloadGroupMgr::delete_workload_group_by_ids(std::set<uint64_t> used_wg_i
9292
}
9393
// wg is shut down and running num = 0, its resource can be released in BE
9494
if (workload_group_ptr->can_be_dropped()) {
95-
LOG(INFO) << "[topic_publish_wg]There is no query in wg" << wg_id << ", delete it.";
95+
LOG(INFO) << "[topic_publish_wg]There is no query in wg " << wg_id
96+
<< ", delete it.";
9697
deleted_task_groups.push_back(workload_group_ptr);
9798
}
9899
}
@@ -121,30 +122,16 @@ void WorkloadGroupMgr::delete_workload_group_by_ids(std::set<uint64_t> used_wg_i
121122
// Using cgdelete has no such issue.
122123
{
123124
if (config::doris_cgroup_cpu_path != "") {
124-
std::lock_guard<std::shared_mutex> write_lock(_init_cg_ctl_lock);
125-
if (!_cg_cpu_ctl) {
126-
_cg_cpu_ctl = std::make_unique<CgroupV1CpuCtl>();
127-
}
128-
if (!_is_init_succ) {
129-
Status ret = _cg_cpu_ctl->init();
130-
if (ret.ok()) {
131-
_is_init_succ = true;
132-
} else {
133-
LOG(INFO) << "[topic_publish_wg]init workload group mgr cpu ctl failed, "
134-
<< ret.to_string();
135-
}
136-
}
137-
if (_is_init_succ) {
138-
Status ret = _cg_cpu_ctl->delete_unused_cgroup_path(used_wg_id);
139-
if (!ret.ok()) {
140-
LOG(WARNING) << "[topic_publish_wg]" << ret.to_string();
141-
}
125+
std::lock_guard<std::shared_mutex> write_lock(_clear_cgroup_lock);
126+
Status ret = CgroupCpuCtl::delete_unused_cgroup_path(used_wg_id);
127+
if (!ret.ok()) {
128+
LOG(WARNING) << "[topic_publish_wg]" << ret.to_string();
142129
}
143130
}
144131
}
145132
int64_t time_cost_ms = MonotonicMillis() - begin_time;
146133
LOG(INFO) << "[topic_publish_wg]finish clear unused workload group, time cost: " << time_cost_ms
147-
<< "ms, deleted group size:" << deleted_task_groups.size()
134+
<< " ms, deleted group size:" << deleted_task_groups.size()
148135
<< ", before wg size=" << old_wg_size << ", after wg size=" << new_wg_size;
149136
}
150137

be/src/runtime/workload_group/workload_group_manager.h

+1-3
Original file line numberDiff line numberDiff line change
@@ -66,9 +66,7 @@ class WorkloadGroupMgr {
6666
std::shared_mutex _group_mutex;
6767
std::unordered_map<uint64_t, WorkloadGroupPtr> _workload_groups;
6868

69-
std::shared_mutex _init_cg_ctl_lock;
70-
std::unique_ptr<CgroupCpuCtl> _cg_cpu_ctl;
71-
bool _is_init_succ = false;
69+
std::shared_mutex _clear_cgroup_lock;
7270
};
7371

7472
} // namespace doris

fe/fe-core/src/main/java/org/apache/doris/resource/workloadgroup/WorkloadGroup.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -479,7 +479,7 @@ public void getProcNodeData(BaseProcResult result, QueryQueue qq) {
479479
row.add(val + "%");
480480
}
481481
} else if (CPU_SHARE.equals(key) && !properties.containsKey(key)) {
482-
row.add("1024");
482+
row.add("-1");
483483
} else if (MEMORY_LIMIT.equals(key) && !properties.containsKey(key)) {
484484
row.add("0%");
485485
} else if (ENABLE_MEMORY_OVERCOMMIT.equals(key) && !properties.containsKey(key)) {

0 commit comments

Comments
 (0)