Skip to content

Commit 3da2d1c

Browse files
authored
[bug](parquet)Fix the problem that the parquet reader reads the missing sub-columns of the struct and fails. (#38718) (#39192)
bp #38718
1 parent 0ee0dd6 commit 3da2d1c

File tree

16 files changed

+616
-8
lines changed

16 files changed

+616
-8
lines changed

be/src/vec/exec/format/parquet/vparquet_column_reader.cpp

+14-8
Original file line numberDiff line numberDiff line change
@@ -352,9 +352,6 @@ Status ScalarColumnReader::_read_nested_column(ColumnPtr& doris_column, DataType
352352
SCOPED_RAW_TIMER(&_decode_null_map_time);
353353
auto* nullable_column = const_cast<vectorized::ColumnNullable*>(
354354
static_cast<const vectorized::ColumnNullable*>(doris_column.get()));
355-
356-
// auto* nullable_column = reinterpret_cast<vectorized::ColumnNullable*>(
357-
// (*std::move(src_column)).mutate().get());
358355
data_column = nullable_column->get_nested_column_ptr();
359356
map_data_column = &(nullable_column->get_null_map_data());
360357
} else {
@@ -723,7 +720,7 @@ Status StructColumnReader::read_column_data(ColumnPtr& doris_column, DataTypePtr
723720
const DataTypeStruct* doris_struct_type =
724721
reinterpret_cast<const DataTypeStruct*>(remove_nullable(type).get());
725722

726-
bool least_one_reader = false;
723+
int not_missing_column_id = -1;
727724
std::vector<size_t> missing_column_idxs {};
728725

729726
_read_column_names.clear();
@@ -744,8 +741,8 @@ Status StructColumnReader::read_column_data(ColumnPtr& doris_column, DataTypePtr
744741
select_vector.reset();
745742
size_t field_rows = 0;
746743
bool field_eof = false;
747-
if (!least_one_reader) {
748-
least_one_reader = true;
744+
if (not_missing_column_id == -1) {
745+
not_missing_column_id = i;
749746
RETURN_IF_ERROR(_child_readers[doris_name]->read_column_data(
750747
doris_field, doris_type, select_vector, batch_size, &field_rows, &field_eof,
751748
is_dict_filter));
@@ -765,20 +762,29 @@ Status StructColumnReader::read_column_data(ColumnPtr& doris_column, DataTypePtr
765762
}
766763
}
767764

768-
if (!least_one_reader) {
765+
if (not_missing_column_id == -1) {
769766
// TODO: support reading a struct whose sub-columns are all missing
770767
return Status::Corruption("Not support read struct '{}' which columns are all missing",
771768
_field_schema->name);
772769
}
773770

771+
// missing_column_sz is intentionally not *read_rows, because *read_rows counts top-level rows rather than nested elements.
772+
// For example: suppose we have a column array<struct<a:int,b:string>>,
773+
// where b is a newly added column, that is, a missing column.
774+
// There are two rows of data in this column,
775+
// [{1,null},{2,null},{3,null}]
776+
// [{4,null},{5,null}]
777+
// Reading sub-column a first yields 5 values, while the value of *read_rows is 2.
778+
// Sub-column b must therefore be filled with 5 null records, not 2.
779+
auto missing_column_sz = doris_struct.get_column(not_missing_column_id).size();
774780
// fill missing column with null or default value
775781
for (auto idx : missing_column_idxs) {
776782
auto& doris_field = doris_struct.get_column_ptr(idx);
777783
auto& doris_type = const_cast<DataTypePtr&>(doris_struct_type->get_element(idx));
778784
DCHECK(doris_type->is_nullable());
779785
auto* nullable_column = reinterpret_cast<vectorized::ColumnNullable*>(
780786
(*std::move(doris_field)).mutate().get());
781-
nullable_column->insert_null_elements(*read_rows);
787+
nullable_column->insert_null_elements(missing_column_sz);
782788
}
783789

784790
if (null_map_ptr != nullptr) {
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
use `default`;
2+
3+
create table test_hive_struct_add_column_orc (
4+
`id` int,
5+
`name` string,
6+
`details` struct<age:int,city:string,email:string,phone:int>,
7+
`sex` int,
8+
`complex` array<struct<a:int,b:struct<aa:string,bb:int>>>
9+
)
10+
STORED AS ORC
11+
LOCATION '/user/doris/preinstalled_data/orc_table/test_hive_struct_add_column_orc';
12+
13+
create table test_hive_struct_add_column_parquet (
14+
`id` int,
15+
`name` string,
16+
`details` struct<age:int,city:string,email:string,phone:int>,
17+
`sex` int,
18+
`complex` array<struct<a:int,b:struct<aa:string,bb:int>>>
19+
)
20+
STORED AS parquet
21+
LOCATION '/user/doris/preinstalled_data/parquet_table/test_hive_struct_add_column_parquet';
22+

regression-test/data/external_table_p0/hive/test_hive_struct_add_column.out

+411
Large diffs are not rendered by default.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
19+
20+
suite("test_hive_struct_add_column", "all_types,p0,external,hive,external_docker,external_docker_hive") {
21+
22+
23+
String enabled = context.config.otherConfigs.get("enableHiveTest")
24+
if (enabled != null && enabled.equalsIgnoreCase("true")) {
25+
String hivePrefix ="hive3";
26+
setHivePrefix(hivePrefix)
27+
String externalEnvIp = context.config.otherConfigs.get("externalEnvIp")
28+
String hmsPort = context.config.otherConfigs.get(hivePrefix + "HmsPort")
29+
String hdfs_port = context.config.otherConfigs.get(hivePrefix + "HdfsPort")
30+
31+
String catalog_name = "test_hive_struct_add_column"
32+
sql """drop catalog if exists ${catalog_name};"""
33+
sql """
34+
create catalog if not exists ${catalog_name} properties (
35+
'type'='hms',
36+
'hadoop.username' = 'hadoop',
37+
'fs.defaultFS' = 'hdfs://${externalEnvIp}:${hdfs_port}',
38+
'hive.metastore.uris' = 'thrift://${externalEnvIp}:${hmsPort}'
39+
);
40+
"""
41+
42+
sql """use `${catalog_name}`.`default`"""
43+
44+
qt_desc """ desc test_hive_struct_add_column_orc;"""
45+
qt_test_1 """ select * from test_hive_struct_add_column_orc order by id;"""
46+
qt_test_2 """ select * from test_hive_struct_add_column_orc where id = 1 order by id;"""
47+
qt_test_3 """ select * from test_hive_struct_add_column_orc where complex is null order by id;"""
48+
qt_test_4 """ select * from test_hive_struct_add_column_orc where complex is not null order by id"""
49+
qt_test_5 """ select complex from test_hive_struct_add_column_orc where complex is null order by id """
50+
qt_test_6 """ select complex from test_hive_struct_add_column_orc where complex is not null order by id """
51+
qt_test_7 """select complex from test_hive_struct_add_column_orc where complex is null order by id; """
52+
qt_test_8 """select complex from test_hive_struct_add_column_orc where complex is not null order by id;"""
53+
qt_test_9 """select sex from test_hive_struct_add_column_orc where sex = 0 order by id;"""
54+
qt_test_10 """select sex from test_hive_struct_add_column_orc where sex = 1 order by id;"""
55+
qt_test_11 """select sex from test_hive_struct_add_column_orc where sex = 2 order by id;"""
56+
qt_test_12 """select * from test_hive_struct_add_column_orc where sex = 2 order by id; """
57+
qt_test_13 """select * from test_hive_struct_add_column_orc where id =sex order by id;"""
58+
qt_test_14 """select * from test_hive_struct_add_column_orc where id -52=sex order by id;"""
59+
qt_test_15 """select *,complex[1] from test_hive_struct_add_column_orc where struct_element(complex[1],1) = 1 order by id;"""
60+
qt_test_16 """ select complex from test_hive_struct_add_column_orc where struct_element(complex[1],1) = 2 and struct_element(complex[1],2) is null order by id ; """
61+
qt_test_17 """select details from test_hive_struct_add_column_orc where struct_element(details,1) = 25 and struct_element(details,4) is not null order by id;"""
62+
qt_test_18 """select details from test_hive_struct_add_column_orc where struct_element(details,1) = 25 and struct_element(details,4) is null order by id;"""
63+
qt_test_19 """ select details,id from test_hive_struct_add_column_orc where struct_element(details,1) = 25 and struct_element(details,4) is not null order by id ;"""
64+
qt_test_20 """ select details,id from test_hive_struct_add_column_orc where struct_element(details,1) = 25 and struct_element(details,4) is null order by id;"""
65+
qt_test_21 """ select sex,count(*) from test_hive_struct_add_column_orc group by sex order by count(*);"""
66+
67+
68+
69+
qt_desc """ desc test_hive_struct_add_column_parquet;"""
70+
qt_test_1 """ select * from test_hive_struct_add_column_parquet order by id;"""
71+
qt_test_2 """ select * from test_hive_struct_add_column_parquet where id = 1 order by id;"""
72+
qt_test_3 """ select * from test_hive_struct_add_column_parquet where complex is null order by id;"""
73+
qt_test_4 """ select * from test_hive_struct_add_column_parquet where complex is not null order by id"""
74+
qt_test_5 """ select complex from test_hive_struct_add_column_parquet where complex is null order by id """
75+
qt_test_6 """ select complex from test_hive_struct_add_column_parquet where complex is not null order by id """
76+
qt_test_7 """select complex from test_hive_struct_add_column_parquet where complex is null order by id; """
77+
qt_test_8 """select complex from test_hive_struct_add_column_parquet where complex is not null order by id;"""
78+
qt_test_9 """select sex from test_hive_struct_add_column_parquet where sex = 0 order by id;"""
79+
qt_test_10 """select sex from test_hive_struct_add_column_parquet where sex = 1 order by id;"""
80+
qt_test_11 """select sex from test_hive_struct_add_column_parquet where sex = 2 order by id;"""
81+
qt_test_12 """select * from test_hive_struct_add_column_parquet where sex = 2 order by id; """
82+
qt_test_13 """select * from test_hive_struct_add_column_parquet where id =sex order by id;"""
83+
qt_test_14 """select * from test_hive_struct_add_column_parquet where id -52=sex order by id;"""
84+
qt_test_15 """select *,complex[1] from test_hive_struct_add_column_parquet where struct_element(complex[1],1) = 1 order by id;"""
85+
qt_test_16 """ select complex from test_hive_struct_add_column_parquet where struct_element(complex[1],1) = 2 and struct_element(complex[1],2) is null order by id ; """
86+
qt_test_17 """select details from test_hive_struct_add_column_parquet where struct_element(details,1) = 25 and struct_element(details,4) is not null order by id;"""
87+
qt_test_18 """select details from test_hive_struct_add_column_parquet where struct_element(details,1) = 25 and struct_element(details,4) is null order by id;"""
88+
qt_test_19 """ select details,id from test_hive_struct_add_column_parquet where struct_element(details,1) = 25 and struct_element(details,4) is not null order by id ;"""
89+
qt_test_20 """ select details,id from test_hive_struct_add_column_parquet where struct_element(details,1) = 25 and struct_element(details,4) is null order by id;"""
90+
qt_test_21 """ select sex,count(*) from test_hive_struct_add_column_parquet group by sex order by count(*);"""
91+
92+
93+
94+
sql """drop catalog if exists ${catalog_name}"""
95+
}
96+
}
97+
98+
/*
99+
drop table user_info_orc;
100+
CREATE TABLE user_info_orc (
101+
id INT,
102+
name STRING,
103+
details STRUCT<age:INT, city:STRING>
104+
)
105+
stored as orc;
106+
INSERT INTO TABLE user_info_orc
107+
VALUES
108+
(1, 'Alice', named_struct('age', 25, 'city', 'New York')),
109+
(2, 'Blice', named_struct('age', 26, 'city', 'New York New York')),
110+
(3, 'Clice', named_struct('age', 27, 'city', 'New York New York New York')),
111+
(4, 'Dlice', named_struct('age', 28, 'city', 'New York New York New York New York')),
112+
(5, 'Elice', named_struct('age', 29, 'city', 'New York New York New York New York New York'));
113+
ALTER TABLE user_info_orc CHANGE COLUMN details details STRUCT<age:INT, city:STRING, email:STRING>;
114+
INSERT INTO TABLE user_info_orc
115+
VALUES
116+
(11, 'AAlice', named_struct('age', 125, 'city', 'acity', 'email', '[email protected]')),
117+
(12, 'BBlice', named_struct('age', 126, 'city', 'bcity', 'email', '[email protected]')),
118+
(13, 'CClice', named_struct('age', 127, 'city', 'ccity', 'email', '[email protected]')),
119+
(14, 'DDlice', named_struct('age', 128, 'city', 'dcity', 'email', '[email protected]')),
120+
(15, 'EElice', named_struct('age', 129, 'city', 'ecity', 'email', NULL));
121+
ALTER TABLE user_info_orc CHANGE COLUMN details details STRUCT<age:INT, city:STRING, email:STRING, phone:int>;
122+
INSERT INTO user_info_orc
123+
VALUES
124+
(21, 'Charlie', named_struct('age', 218, 'city', 'San Francisco', 'email', '[email protected]','phone',123)),
125+
(22, 'Charlie', named_struct('age', 228, 'city', 'San-Francisco', 'email', '[email protected]','phone',1234)),
126+
(23, 'Charlie', named_struct('age', 238, 'city', 'SanxFrancisco', 'email', '[email protected]','phone',12345)),
127+
(24, 'Charlie', named_struct('age', 248, 'city', 'San888Francisco', 'email', '[email protected]','phone',123456)),
128+
(25, 'Charlie', named_struct('age', 258, 'city', 'San0000Francisco', 'email', '[email protected]','phone',NULL));
129+
desc user_info_orc;
130+
ALTER TABLE user_info_orc add columns (sex int);
131+
INSERT INTO TABLE user_info_orc
132+
VALUES
133+
(31, 'Alice', named_struct('age', 25, 'city', 'New York', 'email', '[email protected]', 'phone', 123456),0),
134+
(32, 'Bob', named_struct('age', 30, 'city', 'Los Angeles', 'email', '[email protected]', 'phone', 789012),0),
135+
(33, 'Charlie', named_struct('age', 28, 'city', 'San Francisco', 'email', '[email protected]', 'phone', 456789),1),
136+
(34, 'David', named_struct('age', 32, 'city', 'Chicago', 'email', '[email protected]', 'phone', 987654),0),
137+
(35, 'Eve', named_struct('age', 27, 'city', 'Seattle', 'email', '[email protected]', 'phone', NULL),NULL);
138+
ALTER TABLE user_info_orc add columns (complex array<struct<a:int>>);
139+
INSERT INTO TABLE user_info_orc
140+
VALUES
141+
(41,'Alice', named_struct('age', 25, 'city', 'New York', 'email', '[email protected]', 'phone', 123456), 1, array(named_struct('a', 1),named_struct('a', 1))),
142+
(42,'Bob', named_struct('age', 30, 'city', 'Los Angeles', 'email', '[email protected]', 'phone', 789012), 1, array(named_struct('a', 2),named_struct('a', 1))),
143+
(43,'Charlie', named_struct('age', 28, 'city', 'San Francisco', 'email', '[email protected]', 'phone', 456789), 2, array(named_struct('a', 3),named_struct('a', 1))),
144+
(44,'David', named_struct('age', 32, 'city', 'Chicago', 'email', '[email protected]', 'phone', 987654), 1, array(named_struct('a', 4),named_struct('a', 1))),
145+
(45,'Eve', named_struct('age', 27, 'city', 'Seattle', 'email', '[email protected]', 'phone', 654321), 2, array(named_struct('a', 5),named_struct('a', 1)));
146+
147+
ALTER TABLE user_info_orc CHANGE COLUMN complex complex array<struct<a:int,b:struct<aa:string,bb:int>>>;
148+
INSERT INTO TABLE user_info_orc
149+
VALUES
150+
(51, 'Alice', named_struct('age', 25, 'city', 'New York', 'email', '[email protected]', 'phone', 123456), 1, array(named_struct('a', 1, 'b', named_struct('aa', 'foo', 'bb', 100)),named_struct('a', 1, 'b', named_struct('aa', 'foo', 'bb', 100)))),
151+
(52, 'Bob', named_struct('age', 30, 'city', 'Los Angeles', 'email', '[email protected]', 'phone', 789012), 2, array(named_struct('a', 2, 'b', named_struct('aa', 'bar', 'bb', 200)))),
152+
(53, 'Charlie', named_struct('age', 28, 'city', 'San Francisco', 'email', '[email protected]', 'phone', 456789), 1, array(named_struct('a', 3, 'b', named_struct('aa', 'baz', 'bb', 300)))),
153+
(54, 'David', named_struct('age', 32, 'city', 'Chicago', 'email', '[email protected]', 'phone', 987654), 2, array(named_struct('a', 8, 'b', named_struct('aa', 'qux', 'bb', 400)))),
154+
(55, 'Eve', named_struct('age', 27, 'city', 'Seattle', 'email', '[email protected]', 'phone', 654321), 1, array(named_struct('a', 5, 'b', named_struct('aa', 'abcd', 'bb', 500)),named_struct('a', 5, 'b', named_struct('aa', 'abcdffff', 'bb', 5000)),named_struct('a', 5, 'b', named_struct('aa', 'abcdtttt', 'bb', 500000))));
155+
156+
157+
cp user_info_orc/ => test_hive_struct_add_column_orc/
158+
159+
create table test_hive_struct_add_column_orc (
160+
`id` int,
161+
`name` string,
162+
`details` struct<age:int,city:string,email:string,phone:int>,
163+
`sex` int,
164+
`complex` array<struct<a:int,b:struct<aa:string,bb:int>>>
165+
)
166+
STORED AS ORC
167+
LOCATION '/user/doris/preinstalled_data/orc_table/test_hive_struct_add_column_orc';
168+
169+
*/

0 commit comments

Comments
 (0)