Skip to content

Commit 975beea

Browse files
authored
[fix](group commit) make group commit cancel in time (apache#36249)
## Proposed changes

If the group commit time interval is larger than the load timeout, and no new client load arrives to reuse the internal group commit load, the group commit cannot be cancelled in time because it is stuck waiting on a condition variable inside `LoadBlockQueue::get_block`. This change caps each wait at 10 seconds so the waiting thread wakes up regularly and can observe the cancellation. Stack trace of the stuck thread:

```
#0 0x00007f33937a47aa in pthread_cond_timedwait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
#1 0x00005651105dbd05 in __gthread_cond_timedwait(pthread_cond_t*, pthread_mutex_t*, timespec const*) ()
#2 0x000056511063f385 in std::__condvar::wait_until(std::mutex&, timespec&) ()
#3 0x000056511063dc2e in std::cv_status std::condition_variable::__wait_until_impl<std::chrono::duration<long, std::ratio<1l, 1000000000l> > >(std::unique_lock<std::mutex>&, std::chrono::time_point<std::chrono::_V2::system_clock, std::chrono::duration<long, std::ratio<1l, 1000000000l> > > const&) ()
#4 0x000056511063cedf in std::cv_status std::condition_variable::wait_until<std::chrono::_V2::steady_clock, std::chrono::duration<long, std::ratio<1l, 1000000000l> > >(std::unique_lock<std::mutex>&, std::chrono::time_point<std::chrono::_V2::steady_clock, std::chrono::duration<long, std::ratio<1l, 1000000000l> > > const&) ()
#5 0x0000565110824f48 in std::cv_status std::condition_variable::wait_for<long, std::ratio<1l, 1000l> >(std::unique_lock<std::mutex>&, std::chrono::duration<long, std::ratio<1l, 1000l> > const&) ()
#6 0x0000565113b5612a in doris::LoadBlockQueue::get_block(doris::RuntimeState*, doris::vectorized::Block*, bool*, bool*) ()
#7 0x000056513f900941 in doris::pipeline::GroupCommitOperatorX::get_block(doris::RuntimeState*, doris::vectorized::Block*, bool*) ()
#8 0x000056513c69c0b6 in doris::pipeline::ScanOperatorX<doris::pipeline::GroupCommitLocalState>::get_block_after_projects(doris::RuntimeState*, doris::vectorized::Block*, bool*) ()
#9 0x000056514009d5f1 in doris::pipeline::PipelineTask::execute(bool*) ()
#10 0x00005651400fb24a in doris::pipeline::TaskScheduler::_do_work(unsigned long) ()
```
1 parent 9a125d3 commit 975beea

File tree

2 files changed

+56
-1
lines changed

2 files changed

+56
-1
lines changed

be/src/runtime/group_commit_mgr.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,7 @@ Status LoadBlockQueue::get_block(RuntimeState* runtime_state, vectorized::Block*
149149
<< ", runtime_state=" << runtime_state;
150150
}
151151
}
152-
_get_cond.wait_for(l, std::chrono::milliseconds(left_milliseconds));
152+
_get_cond.wait_for(l, std::chrono::milliseconds(std::min(left_milliseconds, 10000L)));
153153
}
154154
if (runtime_state->is_cancelled()) {
155155
auto st = runtime_state->cancel_reason();
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
suite("test_group_commit_timeout", "nonConcurrent") {
19+
def tableName = "test_group_commit_timeout"
20+
sql """
21+
CREATE TABLE if not exists ${tableName} (
22+
`id` int(11) NOT NULL,
23+
`name` varchar(100) NULL,
24+
`score` int(11) NULL default "-1"
25+
) ENGINE=OLAP
26+
DUPLICATE KEY(`id`)
27+
DISTRIBUTED BY HASH(`id`) BUCKETS 1
28+
PROPERTIES (
29+
"replication_num" = "1",
30+
"group_commit_interval_ms" = "300000"
31+
);
32+
"""
33+
34+
def query_timeout = sql """show variables where variable_name = 'query_timeout';"""
35+
def insert_timeout = sql """show variables where variable_name = 'insert_timeout';"""
36+
logger.info("query_timeout: ${query_timeout}, insert_timeout: ${insert_timeout}")
37+
38+
long start = System.currentTimeMillis()
39+
try {
40+
sql "SET global query_timeout = 5"
41+
sql "SET global insert_timeout = 5"
42+
43+
sql "set group_commit = sync_mode"
44+
sql "insert into ${tableName} values(1, 'a', 10)"
45+
assertTrue(false)
46+
} catch (Exception e) {
47+
long end = System.currentTimeMillis()
48+
logger.info("failed " + e.getMessage())
49+
assertTrue(e.getMessage().contains("FragmentMgr cancel worker going to cancel timeout instance"))
50+
assertTrue(end - start <= 60000)
51+
} finally {
52+
sql "SET global query_timeout = ${query_timeout[0][1]}"
53+
sql "SET global insert_timeout = ${insert_timeout[0][1]}"
54+
}
55+
}

0 commit comments

Comments
 (0)