From 9402e1134559f5e59e6d84fa1e0f9adaa4e8717e Mon Sep 17 00:00:00 2001
From: marising <lihaibo@vip.126.com>
Date: Mon, 1 Jun 2020 15:10:06 +0800
Subject: [PATCH 1/4] Fix the crash of the checksum task: 1. the table includes
 a key column of double/float type 2. the checksum task uses all of the key
 columns to compare 3. schema.column(idx) of double/float type is NULL

---
 be/src/olap/row.h | 8 ++++++++
 1 file changed, 8 insertions(+)
diff --git a/be/src/olap/row.h b/be/src/olap/row.h
index b8919706ecf32f..d35566a730f806 100644
--- a/be/src/olap/row.h
+++ b/be/src/olap/row.h
@@ -59,6 +59,10 @@ bool equal_row(const std::vector<uint32_t>& ids,
 template<typename LhsRowType, typename RhsRowType>
 int compare_row(const LhsRowType& lhs, const RhsRowType& rhs) {
     for (uint32_t cid = 0; cid < lhs.schema()->num_key_columns(); ++cid) {
+        //because the num_column_ids include the column of double/float type
+        if (lhs.schema()->column(cid) == NULL) {
+            continue;
+        }
         auto res = lhs.schema()->column(cid)->compare_cell(lhs.cell(cid), rhs.cell(cid));
         if (res != 0) {
             return res;
@@ -76,6 +80,10 @@ template<typename LhsRowType, typename RhsRowType>
 int compare_row_key(const LhsRowType& lhs, const RhsRowType& rhs) {
     auto cmp_cids = std::min(lhs.schema()->num_column_ids(), rhs.schema()->num_column_ids());
     for (uint32_t cid = 0; cid < cmp_cids; ++cid) {
+        //because the num_column_ids include the column of double/float type
+        if (lhs.schema()->column(cid) == NULL) {
+            continue;
+        }
         auto res = lhs.schema()->column(cid)->compare_cell(lhs.cell(cid), rhs.cell(cid));
         if (res != 0) {
             return res;

From 9bb5f4bc9846675073aebd1b73e271b8d1b77988 Mon Sep 17 00:00:00 2001
From: marising <lihaibo@vip.126.com>
Date: Tue, 2 Jun 2020 20:55:44 +0800
Subject: [PATCH 2/4] Fix the float/double column bug when computing checksum

---
 be/src/olap/row.h                         | 13 +++++--------
 be/src/olap/task/engine_checksum_task.cpp | 13 +++++--------
 2 files changed, 10 insertions(+), 16 deletions(-)

diff --git a/be/src/olap/row.h b/be/src/olap/row.h
index d35566a730f806..552f2d6957d3a6 100644
--- a/be/src/olap/row.h
+++ b/be/src/olap/row.h
@@ -59,10 +59,6 @@ bool equal_row(const std::vector<uint32_t>& ids,
 template<typename LhsRowType, typename RhsRowType>
 int compare_row(const LhsRowType& lhs, const RhsRowType& rhs) {
     for (uint32_t cid = 0; cid < lhs.schema()->num_key_columns(); ++cid) {
-        //because the num_column_ids include the column of double/float type
-        if (lhs.schema()->column(cid) == NULL) {
-            continue;
-        }
         auto res = lhs.schema()->column(cid)->compare_cell(lhs.cell(cid), rhs.cell(cid));
         if (res != 0) {
             return res;
@@ -80,10 +76,6 @@ template<typename LhsRowType, typename RhsRowType>
 int compare_row_key(const LhsRowType& lhs, const RhsRowType& rhs) {
     auto cmp_cids = std::min(lhs.schema()->num_column_ids(), rhs.schema()->num_column_ids());
     for (uint32_t cid = 0; cid < cmp_cids; ++cid) {
-        //because the num_column_ids include the column of double/float type
-        if (lhs.schema()->column(cid) == NULL) {
-            continue;
-        }
         auto res = lhs.schema()->column(cid)->compare_cell(lhs.cell(cid), rhs.cell(cid));
         if (res != 0) {
             return res;
@@ -194,7 +186,12 @@ void agg_finalize_row(const std::vector<uint32_t>& ids, RowType* row, MemPool* m
 
 template<typename RowType>
 uint32_t hash_row(const RowType& row, uint32_t seed) {
+    FieldType type;
     for (uint32_t cid : row.schema()->column_ids()) {
+        type = row.schema()->column(cid)->type();
+        if (type == OLAP_FIELD_TYPE_FLOAT || type == OLAP_FIELD_TYPE_DOUBLE) {
+            continue;
+        }
         seed = row.schema()->column(cid)->hash_code(row.cell(cid), seed);
     }
     return seed;
diff --git a/be/src/olap/task/engine_checksum_task.cpp b/be/src/olap/task/engine_checksum_task.cpp
index c8ca28fbcb138e..28c0ef11e31757 100644
--- a/be/src/olap/task/engine_checksum_task.cpp
+++ b/be/src/olap/task/engine_checksum_task.cpp
@@ -80,13 +80,7 @@ OLAPStatus EngineChecksumTask::_compute_checksum() {
         }
     }
 
-    // ignore float and double type considering to precision lose
     for (size_t i = 0; i < tablet->tablet_schema().num_columns(); ++i) {
-        FieldType type = tablet->tablet_schema().column(i).type();
-        if (type == OLAP_FIELD_TYPE_FLOAT || type == OLAP_FIELD_TYPE_DOUBLE) {
-            continue;
-        }
-
         reader_params.return_columns.push_back(i);
     }
 
@@ -109,6 +103,7 @@ OLAPStatus EngineChecksumTask::_compute_checksum() {
 
     bool eof = false;
     uint32_t row_checksum = 0;
+    uint32_t one_checksum;
     while (true) {
         OLAPStatus res = reader.next_row_with_aggregation(&row, mem_pool.get(), agg_object_pool.get(), &eof);
         if (res == OLAP_SUCCESS && eof) {
@@ -118,8 +113,10 @@ OLAPStatus EngineChecksumTask::_compute_checksum() {
             OLAP_LOG_WARNING("fail to read in reader. [res=%d]", res);
             return res;
         }
-
-        row_checksum = hash_row(row, row_checksum);
+        one_checksum = 0;
+        one_checksum = hash_row(row, one_checksum);
+        // The value of checksum is independent of the sorting of data rows.
+        row_checksum = row_checksum ^ one_checksum;
         // the memory allocate by mem pool has been copied,
         // so we should release memory immediately
         mem_pool->clear();

From 75e85de72ae186d3229303bd2b636dbc1d2bec69 Mon Sep 17 00:00:00 2001
From: marising <lihaibo@vip.126.com>
Date: Thu, 4 Jun 2020 15:27:36 +0800
Subject: [PATCH 3/4] Add a comment on ignoring float/double when computing
 the hash code

---
 be/src/olap/row.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/be/src/olap/row.h b/be/src/olap/row.h
index 552f2d6957d3a6..81988622f13999 100644
--- a/be/src/olap/row.h
+++ b/be/src/olap/row.h
@@ -189,6 +189,8 @@ uint32_t hash_row(const RowType& row, uint32_t seed) {
     FieldType type;
     for (uint32_t cid : row.schema()->column_ids()) {
         type = row.schema()->column(cid)->type();
+        //The approximation of float/double in a certain precision range, the binary of byte is not
+        //a fixed value, so these two types are ignored in calculating hash code.
         if (type == OLAP_FIELD_TYPE_FLOAT || type == OLAP_FIELD_TYPE_DOUBLE) {
             continue;
         }

From 3ebbe4b32198f4f749311b735e13ed2fd978fe4a Mon Sep 17 00:00:00 2001
From: marising <lihaibo@vip.126.com>
Date: Fri, 5 Jun 2020 13:54:50 +0800
Subject: [PATCH 4/4] Simplified some code

---
 be/src/olap/row.h                         | 7 +++----
 be/src/olap/task/engine_checksum_task.cpp | 5 +----
 2 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/be/src/olap/row.h b/be/src/olap/row.h
index 81988622f13999..f24540e13ebff1 100644
--- a/be/src/olap/row.h
+++ b/be/src/olap/row.h
@@ -186,11 +186,10 @@ void agg_finalize_row(const std::vector<uint32_t>& ids, RowType* row, MemPool* m
 
 template<typename RowType>
 uint32_t hash_row(const RowType& row, uint32_t seed) {
-    FieldType type;
     for (uint32_t cid : row.schema()->column_ids()) {
-        type = row.schema()->column(cid)->type();
-        //The approximation of float/double in a certain precision range, the binary of byte is not
-        //a fixed value, so these two types are ignored in calculating hash code.
+        FieldType type = row.schema()->column(cid)->type();
+        // Float/double values are approximations within a certain precision range, so their
+        // binary representation is not fixed; skip these two types when computing the hash code.
         if (type == OLAP_FIELD_TYPE_FLOAT || type == OLAP_FIELD_TYPE_DOUBLE) {
             continue;
         }
diff --git a/be/src/olap/task/engine_checksum_task.cpp b/be/src/olap/task/engine_checksum_task.cpp
index 28c0ef11e31757..d4624ad2008361 100644
--- a/be/src/olap/task/engine_checksum_task.cpp
+++ b/be/src/olap/task/engine_checksum_task.cpp
@@ -103,7 +103,6 @@ OLAPStatus EngineChecksumTask::_compute_checksum() {
 
     bool eof = false;
     uint32_t row_checksum = 0;
-    uint32_t one_checksum;
     while (true) {
         OLAPStatus res = reader.next_row_with_aggregation(&row, mem_pool.get(), agg_object_pool.get(), &eof);
         if (res == OLAP_SUCCESS && eof) {
@@ -113,10 +112,8 @@ OLAPStatus EngineChecksumTask::_compute_checksum() {
             OLAP_LOG_WARNING("fail to read in reader. [res=%d]", res);
             return res;
         }
-        one_checksum = 0;
-        one_checksum = hash_row(row, one_checksum);
         // The value of checksum is independent of the sorting of data rows.
-        row_checksum = row_checksum ^ one_checksum;
+        row_checksum ^= hash_row(row, 0);
         // the memory allocate by mem pool has been copied,
         // so we should release memory immediately
         mem_pool->clear();