Skip to content

Commit eda303f

Browse files
[enhancement](memtable) make memtable memusage more accurate (apache#40912)
## Proposed changes 1. Add memtype to memtable, and save a weak ptr vector in memtable writer, so that we could get different memory usage by traverse the vector. 2. Using scoped memory usage to compute the mem usage of a memtable. 3. CHECK if the tracker is 0 when the memtable flush success. --------- Co-authored-by: yiguolei <[email protected]>
1 parent adf7fb1 commit eda303f

10 files changed

+70
-96
lines changed

be/src/common/config.cpp

-2
Original file line numberDiff line numberDiff line change
@@ -604,8 +604,6 @@ DEFINE_mInt32(memtable_hard_limit_active_percent, "50");
604604
// percent of (active memtables size / all memtables size) when reach soft limit
605605
DEFINE_mInt32(memtable_soft_limit_active_percent, "50");
606606

607-
// memtable insert memory tracker will multiply input block size with this ratio
608-
DEFINE_mDouble(memtable_insert_memory_ratio, "1.4");
609607
// max write buffer size before flush, default 200MB
610608
DEFINE_mInt64(write_buffer_size, "209715200");
611609
// max buffer size used in memtable for the aggregated table, default 400MB

be/src/common/config.h

-2
Original file line numberDiff line numberDiff line change
@@ -654,8 +654,6 @@ DECLARE_mInt32(memtable_hard_limit_active_percent);
654654
// percent of (active memtables size / all memtables size) when reach soft limit
655655
DECLARE_mInt32(memtable_soft_limit_active_percent);
656656

657-
// memtable insert memory tracker will multiply input block size with this ratio
658-
DECLARE_mDouble(memtable_insert_memory_ratio);
659657
// max write buffer size before flush, default 200MB
660658
DECLARE_mInt64(write_buffer_size);
661659
// max buffer size used in memtable for the aggregated table, default 400MB

be/src/olap/memtable.cpp

+16-24
Original file line numberDiff line numberDiff line change
@@ -50,20 +50,16 @@ using namespace ErrorCode;
5050

5151
MemTable::MemTable(int64_t tablet_id, std::shared_ptr<TabletSchema> tablet_schema,
5252
const std::vector<SlotDescriptor*>* slot_descs, TupleDescriptor* tuple_desc,
53-
bool enable_unique_key_mow, PartialUpdateInfo* partial_update_info,
54-
const std::shared_ptr<MemTracker>& insert_mem_tracker,
55-
const std::shared_ptr<MemTracker>& flush_mem_tracker)
56-
: _tablet_id(tablet_id),
53+
bool enable_unique_key_mow, PartialUpdateInfo* partial_update_info)
54+
: _mem_type(MemType::ACTIVE),
55+
_tablet_id(tablet_id),
5756
_enable_unique_key_mow(enable_unique_key_mow),
5857
_keys_type(tablet_schema->keys_type()),
5958
_tablet_schema(tablet_schema),
60-
_insert_mem_tracker(insert_mem_tracker),
61-
_flush_mem_tracker(flush_mem_tracker),
6259
_is_first_insertion(true),
6360
_agg_functions(tablet_schema->num_columns()),
6461
_offsets_of_aggregate_states(tablet_schema->num_columns()),
65-
_total_size_of_aggregate_states(0),
66-
_mem_usage(0) {
62+
_total_size_of_aggregate_states(0) {
6763
g_memtable_cnt << 1;
6864
_query_thread_context.init_unlocked();
6965
_arena = std::make_unique<vectorized::Arena>();
@@ -82,6 +78,7 @@ MemTable::MemTable(int64_t tablet_id, std::shared_ptr<TabletSchema> tablet_schem
8278
}
8379
// TODO: Support ZOrderComparator in the future
8480
_init_columns_offset_by_slot_descs(slot_descs, tuple_desc);
81+
_mem_tracker = std::make_shared<MemTracker>();
8582
}
8683

8784
void MemTable::_init_columns_offset_by_slot_descs(const std::vector<SlotDescriptor*>* slot_descs,
@@ -142,6 +139,13 @@ void MemTable::_init_agg_functions(const vectorized::Block* block) {
142139

143140
MemTable::~MemTable() {
144141
SCOPED_SWITCH_THREAD_MEM_TRACKER_LIMITER(_query_thread_context.query_mem_tracker);
142+
if (_is_flush_success) {
143+
// If the memtable is flush success, then its memtracker's consumption should be 0
144+
if (_mem_tracker->consumption() != 0 && config::crash_in_memory_tracker_inaccurate) {
145+
LOG(FATAL) << "memtable flush success but cosumption is not 0, it is "
146+
<< _mem_tracker->consumption();
147+
}
148+
}
145149
g_memtable_input_block_allocated_size << -_input_mutable_block.allocated_bytes();
146150
g_memtable_cnt << -1;
147151
if (_keys_type != KeysType::DUP_KEYS) {
@@ -159,13 +163,7 @@ MemTable::~MemTable() {
159163
}
160164
}
161165
std::for_each(_row_in_blocks.begin(), _row_in_blocks.end(), std::default_delete<RowInBlock>());
162-
_insert_mem_tracker->release(_mem_usage);
163-
_flush_mem_tracker->set_consumption(0);
164-
DCHECK_EQ(_insert_mem_tracker->consumption(), 0) << std::endl
165-
<< _insert_mem_tracker->log_usage();
166-
DCHECK_EQ(_flush_mem_tracker->consumption(), 0);
167166
_arena.reset();
168-
_agg_buffer_pool.clear();
169167
_vec_row_comparator.reset();
170168
_row_in_blocks.clear();
171169
_agg_functions.clear();
@@ -180,6 +178,7 @@ int RowInBlockComparator::operator()(const RowInBlock* left, const RowInBlock* r
180178

181179
Status MemTable::insert(const vectorized::Block* input_block,
182180
const std::vector<uint32_t>& row_idxs) {
181+
SCOPED_CONSUME_MEM_TRACKER(_mem_tracker);
183182
if (_is_first_insertion) {
184183
_is_first_insertion = false;
185184
auto clone_block = input_block->clone_without_columns(&_column_offset);
@@ -214,10 +213,6 @@ Status MemTable::insert(const vectorized::Block* input_block,
214213
row_idxs.data() + num_rows, &_column_offset));
215214
auto block_size1 = _input_mutable_block.allocated_bytes();
216215
g_memtable_input_block_allocated_size << block_size1 - block_size0;
217-
auto input_size = size_t(input_block->bytes() * num_rows / input_block->rows() *
218-
config::memtable_insert_memory_ratio);
219-
_mem_usage += input_size;
220-
_insert_mem_tracker->consume(input_size);
221216
for (int i = 0; i < num_rows; i++) {
222217
_row_in_blocks.emplace_back(new RowInBlock {cursor_in_mutableblock + i});
223218
}
@@ -467,10 +462,6 @@ void MemTable::_aggregate() {
467462
}
468463
if constexpr (!is_final) {
469464
// if is not final, we collect the agg results to input_block and then continue to insert
470-
size_t shrunked_after_agg = _output_mutable_block.allocated_bytes();
471-
// flush will not run here, so will not duplicate `_flush_mem_tracker`
472-
_insert_mem_tracker->consume(shrunked_after_agg - _mem_usage);
473-
_mem_usage = shrunked_after_agg;
474465
_input_mutable_block.swap(_output_mutable_block);
475466
//TODO(weixang):opt here.
476467
std::unique_ptr<vectorized::Block> empty_input_block = in_block.create_same_struct_block(0);
@@ -483,6 +474,7 @@ void MemTable::_aggregate() {
483474
}
484475

485476
void MemTable::shrink_memtable_by_agg() {
477+
SCOPED_CONSUME_MEM_TRACKER(_mem_tracker);
486478
if (_keys_type == KeysType::DUP_KEYS) {
487479
return;
488480
}
@@ -528,8 +520,8 @@ Status MemTable::_to_block(std::unique_ptr<vectorized::Block>* res) {
528520
}
529521
g_memtable_input_block_allocated_size << -_input_mutable_block.allocated_bytes();
530522
_input_mutable_block.clear();
531-
_insert_mem_tracker->release(_mem_usage);
532-
_mem_usage = 0;
523+
// After to block, all data in arena is saved in the block
524+
_arena.reset();
533525
*res = vectorized::Block::create_unique(_output_mutable_block.to_block());
534526
return Status::OK();
535527
}

be/src/olap/memtable.h

+18-19
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,11 @@ class TabletSchema;
4747
class TupleDescriptor;
4848
enum KeysType : int;
4949

50+
// Active: the memtable is currently used by writer to insert into blocks
51+
// Write_finished: the memtable finished write blocks and in the queue waiting for flush
52+
// FLUSH: the memtable is under flushing, write segment to disk.
53+
enum MemType { ACTIVE = 0, WRITE_FINISHED = 1, FLUSH = 2 };
54+
5055
// row pos in _input_mutable_block
5156
struct RowInBlock {
5257
size_t _row_pos;
@@ -171,16 +176,11 @@ class MemTable {
171176
public:
172177
MemTable(int64_t tablet_id, std::shared_ptr<TabletSchema> tablet_schema,
173178
const std::vector<SlotDescriptor*>* slot_descs, TupleDescriptor* tuple_desc,
174-
bool enable_unique_key_mow, PartialUpdateInfo* partial_update_info,
175-
const std::shared_ptr<MemTracker>& insert_mem_tracker,
176-
const std::shared_ptr<MemTracker>& flush_mem_tracker);
179+
bool enable_unique_key_mow, PartialUpdateInfo* partial_update_info);
177180
~MemTable();
178181

179182
int64_t tablet_id() const { return _tablet_id; }
180-
size_t memory_usage() const {
181-
return _insert_mem_tracker->consumption() + _arena->used_size() +
182-
_flush_mem_tracker->consumption();
183-
}
183+
size_t memory_usage() const { return _mem_tracker->consumption(); }
184184
// insert tuple from (row_pos) to (row_pos+num_rows)
185185
Status insert(const vectorized::Block* block, const std::vector<uint32_t>& row_idxs);
186186

@@ -196,10 +196,16 @@ class MemTable {
196196

197197
const MemTableStat& stat() { return _stat; }
198198

199-
std::shared_ptr<MemTracker> flush_mem_tracker() { return _flush_mem_tracker; }
200-
201199
QueryThreadContext query_thread_context() { return _query_thread_context; }
202200

201+
std::shared_ptr<MemTracker> mem_tracker() { return _mem_tracker; }
202+
203+
void set_flush_success() { _is_flush_success = true; }
204+
205+
MemType get_mem_type() { return _mem_type; }
206+
207+
void update_mem_type(MemType memtype) { _mem_type = memtype; }
208+
203209
private:
204210
// for vectorized
205211
void _aggregate_two_row_in_block(vectorized::MutableBlock& mutable_block, RowInBlock* new_row,
@@ -209,28 +215,23 @@ class MemTable {
209215
Status _to_block(std::unique_ptr<vectorized::Block>* res);
210216

211217
private:
218+
std::atomic<MemType> _mem_type;
212219
int64_t _tablet_id;
213220
bool _enable_unique_key_mow = false;
214221
bool _is_partial_update = false;
222+
bool _is_flush_success = false;
215223
const KeysType _keys_type;
216224
std::shared_ptr<TabletSchema> _tablet_schema;
217225

218226
std::shared_ptr<RowInBlockComparator> _vec_row_comparator;
219227

220228
QueryThreadContext _query_thread_context;
221229

222-
// `_insert_manual_mem_tracker` manually records the memory value of memtable insert()
223-
// `_flush_hook_mem_tracker` automatically records the memory value of memtable flush() through mem hook.
224-
// Is used to flush when _insert_manual_mem_tracker larger than write_buffer_size and run flush memtable
225-
// when the sum of all memtable (_insert_manual_mem_tracker + _flush_hook_mem_tracker) exceeds the limit.
226-
std::shared_ptr<MemTracker> _insert_mem_tracker;
227-
std::shared_ptr<MemTracker> _flush_mem_tracker;
230+
std::shared_ptr<MemTracker> _mem_tracker;
228231
// Only the rows will be inserted into block can allocate memory from _arena.
229232
// In this way, we can make MemTable::memory_usage() to be more accurate, and eventually
230233
// reduce the number of segment files that are generated by current load
231234
std::unique_ptr<vectorized::Arena> _arena;
232-
// The object buffer pool for convert tuple to row
233-
ObjectPool _agg_buffer_pool;
234235

235236
void _init_columns_offset_by_slot_descs(const std::vector<SlotDescriptor*>* slot_descs,
236237
const TupleDescriptor* tuple_desc);
@@ -264,8 +265,6 @@ class MemTable {
264265
std::vector<size_t> _offsets_of_aggregate_states;
265266
size_t _total_size_of_aggregate_states;
266267
std::vector<RowInBlock*> _row_in_blocks;
267-
// Memory usage without _arena.
268-
size_t _mem_usage;
269268

270269
size_t _num_columns;
271270
int32_t _seq_col_idx_in_block = -1;

be/src/olap/memtable_flush_executor.cpp

+11-13
Original file line numberDiff line numberDiff line change
@@ -46,10 +46,10 @@ class MemtableFlushTask final : public Runnable {
4646
ENABLE_FACTORY_CREATOR(MemtableFlushTask);
4747

4848
public:
49-
MemtableFlushTask(std::shared_ptr<FlushToken> flush_token, std::unique_ptr<MemTable> memtable,
49+
MemtableFlushTask(std::shared_ptr<FlushToken> flush_token, std::shared_ptr<MemTable> memtable,
5050
int32_t segment_id, int64_t submit_task_time)
5151
: _flush_token(flush_token),
52-
_memtable(std::move(memtable)),
52+
_memtable(memtable),
5353
_segment_id(segment_id),
5454
_submit_task_time(submit_task_time) {
5555
g_flush_task_num << 1;
@@ -60,15 +60,15 @@ class MemtableFlushTask final : public Runnable {
6060
void run() override {
6161
auto token = _flush_token.lock();
6262
if (token) {
63-
token->_flush_memtable(std::move(_memtable), _segment_id, _submit_task_time);
63+
token->_flush_memtable(_memtable, _segment_id, _submit_task_time);
6464
} else {
6565
LOG(WARNING) << "flush token is deconstructed, ignore the flush task";
6666
}
6767
}
6868

6969
private:
7070
std::weak_ptr<FlushToken> _flush_token;
71-
std::unique_ptr<MemTable> _memtable;
71+
std::shared_ptr<MemTable> _memtable;
7272
int32_t _segment_id;
7373
int64_t _submit_task_time;
7474
};
@@ -83,7 +83,7 @@ std::ostream& operator<<(std::ostream& os, const FlushStatistic& stat) {
8383
return os;
8484
}
8585

86-
Status FlushToken::submit(std::unique_ptr<MemTable> mem_table) {
86+
Status FlushToken::submit(std::shared_ptr<MemTable> mem_table) {
8787
{
8888
std::shared_lock rdlk(_flush_status_lock);
8989
DBUG_EXECUTE_IF("FlushToken.submit_flush_error", {
@@ -98,9 +98,8 @@ Status FlushToken::submit(std::unique_ptr<MemTable> mem_table) {
9898
return Status::OK();
9999
}
100100
int64_t submit_task_time = MonotonicNanos();
101-
auto task = MemtableFlushTask::create_shared(shared_from_this(), std::move(mem_table),
102-
_rowset_writer->allocate_segment_id(),
103-
submit_task_time);
101+
auto task = MemtableFlushTask::create_shared(
102+
shared_from_this(), mem_table, _rowset_writer->allocate_segment_id(), submit_task_time);
104103
Status ret = _thread_pool->submit(std::move(task));
105104
if (ret.ok()) {
106105
// _wait_running_task_finish was executed after this function, so no need to notify _cond here
@@ -136,20 +135,19 @@ Status FlushToken::_do_flush_memtable(MemTable* memtable, int32_t segment_id, in
136135
VLOG_CRITICAL << "begin to flush memtable for tablet: " << memtable->tablet_id()
137136
<< ", memsize: " << memtable->memory_usage()
138137
<< ", rows: " << memtable->stat().raw_rows;
138+
memtable->update_mem_type(MemType::FLUSH);
139139
int64_t duration_ns;
140140
SCOPED_RAW_TIMER(&duration_ns);
141141
SCOPED_ATTACH_TASK(memtable->query_thread_context());
142142
signal::set_signal_task_id(_rowset_writer->load_id());
143143
signal::tablet_id = memtable->tablet_id();
144144
{
145+
SCOPED_CONSUME_MEM_TRACKER(memtable->mem_tracker());
145146
std::unique_ptr<vectorized::Block> block;
146-
// During to block method, it will release old memory and create new block, so that
147-
// we could not scoped it.
148147
RETURN_IF_ERROR(memtable->to_block(&block));
149-
memtable->flush_mem_tracker()->consume(block->allocated_bytes());
150-
SCOPED_CONSUME_MEM_TRACKER(memtable->flush_mem_tracker());
151148
RETURN_IF_ERROR(_rowset_writer->flush_memtable(block.get(), segment_id, flush_size));
152149
}
150+
memtable->set_flush_success();
153151
_memtable_stat += memtable->stat();
154152
DorisMetrics::instance()->memtable_flush_total->increment(1);
155153
DorisMetrics::instance()->memtable_flush_duration_us->increment(duration_ns / 1000);
@@ -158,7 +156,7 @@ Status FlushToken::_do_flush_memtable(MemTable* memtable, int32_t segment_id, in
158156
return Status::OK();
159157
}
160158

161-
void FlushToken::_flush_memtable(std::unique_ptr<MemTable> memtable_ptr, int32_t segment_id,
159+
void FlushToken::_flush_memtable(std::shared_ptr<MemTable> memtable_ptr, int32_t segment_id,
162160
int64_t submit_task_time) {
163161
Defer defer {[&]() {
164162
std::lock_guard<std::mutex> lock(_mutex);

be/src/olap/memtable_flush_executor.h

+2-2
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ class FlushToken : public std::enable_shared_from_this<FlushToken> {
6161
public:
6262
FlushToken(ThreadPool* thread_pool) : _flush_status(Status::OK()), _thread_pool(thread_pool) {}
6363

64-
Status submit(std::unique_ptr<MemTable> mem_table);
64+
Status submit(std::shared_ptr<MemTable> mem_table);
6565

6666
// error has happens, so we cancel this token
6767
// And remove all tasks in the queue.
@@ -87,7 +87,7 @@ class FlushToken : public std::enable_shared_from_this<FlushToken> {
8787
private:
8888
friend class MemtableFlushTask;
8989

90-
void _flush_memtable(std::unique_ptr<MemTable> memtable_ptr, int32_t segment_id,
90+
void _flush_memtable(std::shared_ptr<MemTable> memtable_ptr, int32_t segment_id,
9191
int64_t submit_task_time);
9292

9393
Status _do_flush_memtable(MemTable* memtable, int32_t segment_id, int64_t* flush_size);

be/src/olap/memtable_memory_limiter.cpp

+3-1
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
#include <bvar/bvar.h>
2121

2222
#include "common/config.h"
23+
#include "olap/memtable.h"
2324
#include "olap/memtable_writer.h"
2425
#include "util/doris_metrics.h"
2526
#include "util/mem_info.h"
@@ -237,13 +238,14 @@ void MemTableMemoryLimiter::_refresh_mem_tracker() {
237238
_active_writers.clear();
238239
for (auto it = _writers.begin(); it != _writers.end();) {
239240
if (auto writer = it->lock()) {
241+
// The memtable is currently used by writer to insert blocks.
240242
auto active_usage = writer->active_memtable_mem_consumption();
241243
_active_mem_usage += active_usage;
242244
if (active_usage > 0) {
243245
_active_writers.push_back(writer);
244246
}
245247
_flush_mem_usage += writer->mem_consumption(MemType::FLUSH);
246-
_write_mem_usage += writer->mem_consumption(MemType::WRITE);
248+
_write_mem_usage += writer->mem_consumption(MemType::WRITE_FINISHED);
247249
++it;
248250
} else {
249251
*it = std::move(_writers.back());

0 commit comments

Comments
 (0)