Skip to content

Commit d21d936

Browse files
committed
[Fix](parquet-reader) Fix parquet reader crash in set_dict(). (apache#40643)
## Proposed changes ### Issue ``` *** is nereids: 1 *** tablet id: 4 Abort at 1725864966 (unix time) try "date -d @1725864966" if you are using GNU date *** *** Set a breakpoint in static void __GI_abort() to debug *** PC: @ 0x7f007fb4090a04 *** SIGSEGV (address not mapped to object 0xa0fa868a41d6) received by PID 404737 (TID 274135 OR 0x7ece29df700) from PID 1755584205; stack trace: *** #0 __GI_raise #1 __GI_abort #2 sig_handler #3 _sigaction #4 JVM_handle_linux_signal #5 _sigaction #6 doris::vectorized::ByteArrayDictDecoder::set_dict(std::unique_ptr<unsigned char[], std::default_delete<unsigned char[]>> &&, int, unsigned long) at /mnt/disk1/yy/git/enterprise-core/be/src/vec/exec/format/parquet/byte_array_dict_decoder.cpp:41 #7 doris::vectorized::ColumnChunkReader::_decode_dict_page() at /mnt/disk1/yy/git/enterprise-core/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp:258 #8 doris::vectorized::ColumnChunkReader::next_page() at /mnt/disk1/yy/git/enterprise-core/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp:105 #9 doris::vectorized::ParquetColumnReader::_read_column_data(doris::vectorized::Block*, bool*) at /mnt/disk1/yy/git/enterprise-core/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp:508 #10 doris::vectorized::ScalarColumnReader::_next_value(doris::vectorized::IColumn*, unsigned long, unsigned long*, bool*) at /mnt/disk1/yy/git/enterprise-core/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp:699 #11 doris::vectorized::RowGroupReader::_read_column_data(doris::vectorized::Block*, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char>> &, std::vector<doris::vectorized::ColumnSelectVector>*, unsigned long, unsigned long*, bool*) at /mnt/disk1/yy/git/enterprise-core/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp:425 #12 doris::vectorized::RowGroupReader::get_next_block(doris::vectorized::Block*, bool*) at 
/mnt/disk1/yy/git/enterprise-core/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp:311 #13 doris::vectorized::ParquetReader::get_next(doris::vectorized::Block*, bool*) at /mnt/disk1/yy/git/enterprise-core/be/src/vec/exec/format/parquet/vparquet_reader.cpp:533 #14 doris::vectorized::VFileScanner::_get_next_reader_block(doris::RuntimeState*, doris::vectorized::Block*, bool*) at /mnt/disk1/yy/git/enterprise-core/be/src/vec/exec/scan/vfile_scanner.cpp:368 #15 doris::vectorized::VFileScanner::_get_block_impl(doris::RuntimeState*, doris::vectorized::Block*, bool*) at /mnt/disk1/yy/git/enterprise-core/be/src/vec/exec/scan/vfile_scanner.cpp:411 #16 doris::vectorized::VScanner::get_block(doris::RuntimeState*, doris::vectorized::Block*, bool*) at /mnt/disk1/yy/git/enterprise-core/be/src/vec/exec/scan/vscanner.cpp:431 #17 doris::vectorized::VScanner::get_block(doris::RuntimeState*, doris::vectorized::Block*, bool*) at /mnt/disk1/yy/git/enterprise-core/be/src/vec/exec/scan/vscanner.cpp:96 #18 doris::vectorized::ScannerScheduler::submit(doris::vectorized::ScannerContext*, std::shared_ptr<doris::vectorized::ScanTask>) at /mnt/disk1/yy/git/enterprise-core/be/src/vec/exec/scan/scanner_context.cpp:96 #19 doris::Thread::supervise_thread(void*) at /mnt/disk1/yy/git/enterprise-core/be/src/util/thread.cpp:499 #20 start_thread #21 clone in /lib64/libc.so.6 ``` ### Solution It is not yet known why the parquet dictionary page is null in this case, but the null dictionary causes a crash in set_dict(). This PR adds defensive null checks so that the reader returns a Corruption status instead of crashing.
1 parent 1259fe2 commit d21d936

File tree

2 files changed

+6
-0
lines changed

2 files changed

+6
-0
lines changed

be/src/vec/exec/format/parquet/byte_array_dict_decoder.cpp

+3
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,9 @@ namespace doris::vectorized {
3232
Status ByteArrayDictDecoder::set_dict(std::unique_ptr<uint8_t[]>& dict, int32_t length,
3333
size_t num_values) {
3434
_dict = std::move(dict);
35+
if (_dict == nullptr) {
36+
return Status::Corruption("Wrong dictionary data for byte array type, dict is null.");
37+
}
3538
_dict_items.reserve(num_values);
3639
uint32_t offset_cursor = 0;
3740
char* dict_item_address = reinterpret_cast<char*>(_dict.get());

be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp

+3
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,9 @@ class FixLengthDictDecoder final : public BaseDictDecoder {
107107
return Status::Corruption("Wrong dictionary data for fixed length type");
108108
}
109109
_dict = std::move(dict);
110+
if (_dict == nullptr) {
111+
// Null dictionary buffer: report it as corruption, naming the fixed-length decoder
// (the previous text was copied from the byte-array decoder and named the wrong type).
return Status::Corruption("Wrong dictionary data for fixed length type, dict is null.");
112+
}
110113
char* dict_item_address = reinterpret_cast<char*>(_dict.get());
111114
_dict_items.resize(num_values);
112115
for (size_t i = 0; i < num_values; ++i) {

0 commit comments

Comments
 (0)