From 9402e1134559f5e59e6d84fa1e0f9adaa4e8717e Mon Sep 17 00:00:00 2001 From: marising Date: Mon, 1 Jun 2020 15:10:06 +0800 Subject: [PATCH 1/4] Fix the crash of the checksum task 1. The table includes a key column of double/float type 2. When the checksum task runs, it uses all of the key columns to compare 3. schema.column(idx) of a double/float type column is NULL --- be/src/olap/row.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/be/src/olap/row.h b/be/src/olap/row.h index b8919706ecf32f..d35566a730f806 100644 --- a/be/src/olap/row.h +++ b/be/src/olap/row.h @@ -59,6 +59,10 @@ bool equal_row(const std::vector& ids, template int compare_row(const LhsRowType& lhs, const RhsRowType& rhs) { for (uint32_t cid = 0; cid < lhs.schema()->num_key_columns(); ++cid) { + //because the num_column_ids include the column of double/float type + if (lhs.schema()->column(cid) == NULL) { + continue; + } auto res = lhs.schema()->column(cid)->compare_cell(lhs.cell(cid), rhs.cell(cid)); if (res != 0) { return res; @@ -76,6 +80,10 @@ template int compare_row_key(const LhsRowType& lhs, const RhsRowType& rhs) { auto cmp_cids = std::min(lhs.schema()->num_column_ids(), rhs.schema()->num_column_ids()); for (uint32_t cid = 0; cid < cmp_cids; ++cid) { + //because the num_column_ids include the column of double/float type + if (lhs.schema()->column(cid) == NULL) { + continue; + } auto res = lhs.schema()->column(cid)->compare_cell(lhs.cell(cid), rhs.cell(cid)); if (res != 0) { return res; From 9bb5f4bc9846675073aebd1b73e271b8d1b77988 Mon Sep 17 00:00:00 2001 From: marising Date: Tue, 2 Jun 2020 20:55:44 +0800 Subject: [PATCH 2/4] Fix the bug with float/double type columns when computing the checksum --- be/src/olap/row.h | 13 +++++-------- be/src/olap/task/engine_checksum_task.cpp | 13 +++++-------- 2 files changed, 10 insertions(+), 16 deletions(-) diff --git a/be/src/olap/row.h b/be/src/olap/row.h index d35566a730f806..552f2d6957d3a6 100644 --- a/be/src/olap/row.h +++ b/be/src/olap/row.h @@ -59,10 +59,6 @@ bool
equal_row(const std::vector& ids, template int compare_row(const LhsRowType& lhs, const RhsRowType& rhs) { for (uint32_t cid = 0; cid < lhs.schema()->num_key_columns(); ++cid) { - //because the num_column_ids include the column of double/float type - if (lhs.schema()->column(cid) == NULL) { - continue; - } auto res = lhs.schema()->column(cid)->compare_cell(lhs.cell(cid), rhs.cell(cid)); if (res != 0) { return res; @@ -80,10 +76,6 @@ template int compare_row_key(const LhsRowType& lhs, const RhsRowType& rhs) { auto cmp_cids = std::min(lhs.schema()->num_column_ids(), rhs.schema()->num_column_ids()); for (uint32_t cid = 0; cid < cmp_cids; ++cid) { - //because the num_column_ids include the column of double/float type - if (lhs.schema()->column(cid) == NULL) { - continue; - } auto res = lhs.schema()->column(cid)->compare_cell(lhs.cell(cid), rhs.cell(cid)); if (res != 0) { return res; @@ -194,7 +186,12 @@ void agg_finalize_row(const std::vector& ids, RowType* row, MemPool* m template uint32_t hash_row(const RowType& row, uint32_t seed) { + FieldType type; for (uint32_t cid : row.schema()->column_ids()) { + type = row.schema()->column(cid)->type(); + if (type == OLAP_FIELD_TYPE_FLOAT || type == OLAP_FIELD_TYPE_DOUBLE) { + continue; + } seed = row.schema()->column(cid)->hash_code(row.cell(cid), seed); } return seed; diff --git a/be/src/olap/task/engine_checksum_task.cpp b/be/src/olap/task/engine_checksum_task.cpp index c8ca28fbcb138e..28c0ef11e31757 100644 --- a/be/src/olap/task/engine_checksum_task.cpp +++ b/be/src/olap/task/engine_checksum_task.cpp @@ -80,13 +80,7 @@ OLAPStatus EngineChecksumTask::_compute_checksum() { } } - // ignore float and double type considering to precision lose for (size_t i = 0; i < tablet->tablet_schema().num_columns(); ++i) { - FieldType type = tablet->tablet_schema().column(i).type(); - if (type == OLAP_FIELD_TYPE_FLOAT || type == OLAP_FIELD_TYPE_DOUBLE) { - continue; - } - reader_params.return_columns.push_back(i); } @@ -109,6 +103,7 @@ 
OLAPStatus EngineChecksumTask::_compute_checksum() { bool eof = false; uint32_t row_checksum = 0; + uint32_t one_checksum; while (true) { OLAPStatus res = reader.next_row_with_aggregation(&row, mem_pool.get(), agg_object_pool.get(), &eof); if (res == OLAP_SUCCESS && eof) { @@ -118,8 +113,10 @@ OLAPStatus EngineChecksumTask::_compute_checksum() { OLAP_LOG_WARNING("fail to read in reader. [res=%d]", res); return res; } - - row_checksum = hash_row(row, row_checksum); + one_checksum = 0; + one_checksum = hash_row(row, one_checksum); + // The value of checksum is independent of the sorting of data rows. + row_checksum = row_checksum ^ one_checksum; // the memory allocate by mem pool has been copied, // so we should release memory immediately mem_pool->clear(); From 75e85de72ae186d3229303bd2b636dbc1d2bec69 Mon Sep 17 00:00:00 2001 From: marising Date: Thu, 4 Jun 2020 15:27:36 +0800 Subject: [PATCH 3/4] Add a comment on ignoring float/double when computing the hash code --- be/src/olap/row.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/be/src/olap/row.h b/be/src/olap/row.h index 552f2d6957d3a6..81988622f13999 100644 --- a/be/src/olap/row.h +++ b/be/src/olap/row.h @@ -189,6 +189,8 @@ uint32_t hash_row(const RowType& row, uint32_t seed) { FieldType type; for (uint32_t cid : row.schema()->column_ids()) { type = row.schema()->column(cid)->type(); + //The approximation of float/double in a certain precision range, the binary of byte is not + //a fixed value, so these two types are ignored in calculating hash code.
if (type == OLAP_FIELD_TYPE_FLOAT || type == OLAP_FIELD_TYPE_DOUBLE) { continue; } From 3ebbe4b32198f4f749311b735e13ed2fd978fe4a Mon Sep 17 00:00:00 2001 From: marising Date: Fri, 5 Jun 2020 13:54:50 +0800 Subject: [PATCH 4/4] Simplified some code --- be/src/olap/row.h | 7 +++---- be/src/olap/task/engine_checksum_task.cpp | 5 +---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/be/src/olap/row.h b/be/src/olap/row.h index 81988622f13999..f24540e13ebff1 100644 --- a/be/src/olap/row.h +++ b/be/src/olap/row.h @@ -186,11 +186,10 @@ void agg_finalize_row(const std::vector& ids, RowType* row, MemPool* m template uint32_t hash_row(const RowType& row, uint32_t seed) { - FieldType type; for (uint32_t cid : row.schema()->column_ids()) { - type = row.schema()->column(cid)->type(); - //The approximation of float/double in a certain precision range, the binary of byte is not - //a fixed value, so these two types are ignored in calculating hash code. + FieldType type = row.schema()->column(cid)->type(); + // The approximation of float/double in a certain precision range, the binary of byte is not + // a fixed value, so these two types are ignored in calculating hash code. if (type == OLAP_FIELD_TYPE_FLOAT || type == OLAP_FIELD_TYPE_DOUBLE) { continue; } diff --git a/be/src/olap/task/engine_checksum_task.cpp b/be/src/olap/task/engine_checksum_task.cpp index 28c0ef11e31757..d4624ad2008361 100644 --- a/be/src/olap/task/engine_checksum_task.cpp +++ b/be/src/olap/task/engine_checksum_task.cpp @@ -103,7 +103,6 @@ OLAPStatus EngineChecksumTask::_compute_checksum() { bool eof = false; uint32_t row_checksum = 0; - uint32_t one_checksum; while (true) { OLAPStatus res = reader.next_row_with_aggregation(&row, mem_pool.get(), agg_object_pool.get(), &eof); if (res == OLAP_SUCCESS && eof) { @@ -113,10 +112,8 @@ OLAPStatus EngineChecksumTask::_compute_checksum() { OLAP_LOG_WARNING("fail to read in reader. 
[res=%d]", res); return res; } - one_checksum = 0; - one_checksum = hash_row(row, one_checksum); // The value of checksum is independent of the sorting of data rows. - row_checksum = row_checksum ^ one_checksum; + row_checksum ^= hash_row(row, 0); // the memory allocate by mem pool has been copied, // so we should release memory immediately mem_pool->clear();