Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[fix] (inverted index) Fix match function without inverted index #38989

Merged
merged 5 commits into from
Aug 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1191,6 +1191,9 @@ lucene::util::bkd::relation InvertedIndexVisitor<QT>::compare(std::vector<uint8_
Status InvertedIndexIterator::read_from_inverted_index(
const std::string& column_name, const void* query_value, InvertedIndexQueryType query_type,
uint32_t segment_num_rows, std::shared_ptr<roaring::Roaring>& bit_map, bool skip_try) {
DBUG_EXECUTE_IF("return_inverted_index_bypass", {
return Status::Error<ErrorCode::INVERTED_INDEX_BYPASS>("inverted index bypass");
});
if (UNLIKELY(_reader == nullptr)) {
throw CLuceneError(CL_ERR_NullPointer, "bkd index reader is null", false);
}
Expand Down
13 changes: 13 additions & 0 deletions be/src/vec/exprs/vmatch_predicate.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,12 @@

#include "vec/exprs/vmatch_predicate.h"

#ifdef __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wshadow-field"
#endif

#include <CLucene/analysis/LanguageBasedAnalyzer.h>
#include <fmt/format.h>
#include <fmt/ranges.h> // IWYU pragma: keep
#include <gen_cpp/Exprs_types.h>
Expand All @@ -29,6 +35,7 @@
#include <string_view>
#include <vector>

#include "CLucene/analysis/standard95/StandardAnalyzer.h"
#include "common/status.h"
#include "olap/rowset/segment_v2/inverted_index_reader.h"
#include "vec/core/block.h"
Expand All @@ -53,6 +60,12 @@ VMatchPredicate::VMatchPredicate(const TExprNode& node) : VExpr(node) {
_inverted_index_ctx->parser_mode = node.match_predicate.parser_mode;
_inverted_index_ctx->char_filter_map = node.match_predicate.char_filter_map;
_analyzer = InvertedIndexReader::create_analyzer(_inverted_index_ctx.get());
_analyzer->set_lowercase(node.match_predicate.parser_lowercase);
if (node.match_predicate.parser_stopwords == "none") {
_analyzer->set_stopwords(nullptr);
} else {
_analyzer->set_stopwords(&lucene::analysis::standard95::stop_words);
}
_inverted_index_ctx->analyzer = _analyzer.get();
}

Expand Down
110 changes: 48 additions & 62 deletions be/src/vec/functions/match.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -120,10 +120,29 @@ inline doris::segment_v2::InvertedIndexQueryType FunctionMatchBase::get_query_ty
return doris::segment_v2::InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY;
} else if (fn_name == MATCH_PHRASE_REGEXP_FUNCTION) {
return doris::segment_v2::InvertedIndexQueryType::MATCH_REGEXP_QUERY;
} else if (fn_name == MATCH_PHRASE_EDGE_FUNCTION) {
return doris::segment_v2::InvertedIndexQueryType::MATCH_PHRASE_EDGE_QUERY;
}
return doris::segment_v2::InvertedIndexQueryType::UNKNOWN_QUERY;
}

/// Tokenizes the MATCH query string into `query_tokens`.
///
/// When the context's parser type is PARSER_NONE the query string is stored
/// verbatim as a single token. Otherwise a reader is created over the string
/// and InvertedIndexReader::get_analyse_result is run with the context's
/// analyzer and the query type returned by get_query_type_from_fn_name().
///
/// @param query_tokens        output vector that receives the tokens.
/// @param inverted_index_ctx  parser settings and analyzer. When null, nothing
///                            is produced and the function returns immediately.
/// @param match_query_str     query string to tokenize.
/// @param column_name         column name forwarded to get_analyse_result.
void FunctionMatchBase::analyse_query_str_token(std::vector<std::string>* query_tokens,
                                                InvertedIndexCtx* inverted_index_ctx,
                                                const std::string& match_query_str,
                                                const std::string& column_name) const {
    // Guard first: the debug log, the parser-type check and the analyzer lookup
    // below all dereference the context.
    if (inverted_index_ctx == nullptr) {
        VLOG_DEBUG << "begin to run " << get_name() << ", inverted_index_ctx is null";
        return;
    }
    VLOG_DEBUG << "begin to run " << get_name() << ", parser_type: "
               << inverted_index_parser_type_to_string(inverted_index_ctx->parser_type);
    if (inverted_index_ctx->parser_type == InvertedIndexParserType::PARSER_NONE) {
        // No parser configured: the whole query string is the only token.
        query_tokens->emplace_back(match_query_str);
        return;
    }
    auto reader = doris::segment_v2::InvertedIndexReader::create_reader(inverted_index_ctx,
                                                                        match_query_str);
    doris::segment_v2::InvertedIndexReader::get_analyse_result(
            *query_tokens, reader.get(), inverted_index_ctx->analyzer, column_name,
            get_query_type_from_fn_name());
}

inline std::vector<std::string> FunctionMatchBase::analyse_data_token(
const std::string& column_name, InvertedIndexCtx* inverted_index_ctx,
const ColumnString* string_col, int32_t current_block_row_idx,
Expand All @@ -134,23 +153,31 @@ inline std::vector<std::string> FunctionMatchBase::analyse_data_token(
for (auto next_src_array_offset = (*array_offsets)[current_block_row_idx];
current_src_array_offset < next_src_array_offset; ++current_src_array_offset) {
const auto& str_ref = string_col->get_data_at(current_src_array_offset);
if (inverted_index_ctx->parser_type == InvertedIndexParserType::PARSER_NONE) {
data_tokens.emplace_back(str_ref.to_string());
continue;
}
auto reader = doris::segment_v2::InvertedIndexReader::create_reader(
inverted_index_ctx, str_ref.to_string());

std::vector<std::string> element_tokens;

doris::segment_v2::InvertedIndexReader::get_analyse_result(
element_tokens, reader.get(), inverted_index_ctx->analyzer, column_name,
query_type, false);
data_tokens.insert(data_tokens.end(), element_tokens.begin(), element_tokens.end());
}
} else {
const auto& str_ref = string_col->get_data_at(current_block_row_idx);
auto reader = doris::segment_v2::InvertedIndexReader::create_reader(inverted_index_ctx,
str_ref.to_string());

doris::segment_v2::InvertedIndexReader::get_analyse_result(data_tokens, reader.get(),
inverted_index_ctx->analyzer,
column_name, query_type, false);
if (inverted_index_ctx->parser_type == InvertedIndexParserType::PARSER_NONE) {
data_tokens.emplace_back(str_ref.to_string());
} else {
auto reader = doris::segment_v2::InvertedIndexReader::create_reader(
inverted_index_ctx, str_ref.to_string());
doris::segment_v2::InvertedIndexReader::get_analyse_result(
data_tokens, reader.get(), inverted_index_ctx->analyzer, column_name,
query_type, false);
}
}
return data_tokens;
}
Expand All @@ -177,23 +204,14 @@ Status FunctionMatchAny::execute_match(FunctionContext* context, const std::stri
ColumnUInt8::Container& result) const {
RETURN_IF_ERROR(check(context, name));

doris::InvertedIndexParserType parser_type = doris::InvertedIndexParserType::PARSER_UNKNOWN;
if (inverted_index_ctx) {
parser_type = inverted_index_ctx->parser_type;
}
VLOG_DEBUG << "begin to run FunctionMatchAny::execute_match, parser_type: "
<< inverted_index_parser_type_to_string(parser_type);
auto reader = doris::segment_v2::InvertedIndexReader::create_reader(inverted_index_ctx,
match_query_str);
std::vector<std::string> query_tokens;
doris::segment_v2::InvertedIndexReader::get_analyse_result(
query_tokens, reader.get(), inverted_index_ctx->analyzer, column_name,
doris::segment_v2::InvertedIndexQueryType::MATCH_ANY_QUERY);
analyse_query_str_token(&query_tokens, inverted_index_ctx, match_query_str, column_name);
if (query_tokens.empty()) {
VLOG_DEBUG << fmt::format(
"token parser result is empty for query, "
"please check your query: '{}' and index parser: '{}'",
match_query_str, inverted_index_parser_type_to_string(parser_type));
match_query_str,
inverted_index_parser_type_to_string(inverted_index_ctx->parser_type));
return Status::OK();
}

Expand Down Expand Up @@ -224,23 +242,14 @@ Status FunctionMatchAll::execute_match(FunctionContext* context, const std::stri
ColumnUInt8::Container& result) const {
RETURN_IF_ERROR(check(context, name));

doris::InvertedIndexParserType parser_type = doris::InvertedIndexParserType::PARSER_UNKNOWN;
if (inverted_index_ctx) {
parser_type = inverted_index_ctx->parser_type;
}
VLOG_DEBUG << "begin to run FunctionMatchAll::execute_match, parser_type: "
<< inverted_index_parser_type_to_string(parser_type);
auto reader = doris::segment_v2::InvertedIndexReader::create_reader(inverted_index_ctx,
match_query_str);
std::vector<std::string> query_tokens;
doris::segment_v2::InvertedIndexReader::get_analyse_result(
query_tokens, reader.get(), inverted_index_ctx->analyzer, column_name,
doris::segment_v2::InvertedIndexQueryType::MATCH_ALL_QUERY);
analyse_query_str_token(&query_tokens, inverted_index_ctx, match_query_str, column_name);
if (query_tokens.empty()) {
VLOG_DEBUG << fmt::format(
"token parser result is empty for query, "
"please check your query: '{}' and index parser: '{}'",
match_query_str, inverted_index_parser_type_to_string(parser_type));
match_query_str,
inverted_index_parser_type_to_string(inverted_index_ctx->parser_type));
return Status::OK();
}

Expand Down Expand Up @@ -277,23 +286,14 @@ Status FunctionMatchPhrase::execute_match(FunctionContext* context, const std::s
ColumnUInt8::Container& result) const {
RETURN_IF_ERROR(check(context, name));

doris::InvertedIndexParserType parser_type = doris::InvertedIndexParserType::PARSER_UNKNOWN;
if (inverted_index_ctx) {
parser_type = inverted_index_ctx->parser_type;
}
VLOG_DEBUG << "begin to run FunctionMatchPhrase::execute_match, parser_type: "
<< inverted_index_parser_type_to_string(parser_type);
auto reader = doris::segment_v2::InvertedIndexReader::create_reader(inverted_index_ctx,
match_query_str);
std::vector<std::string> query_tokens;
doris::segment_v2::InvertedIndexReader::get_analyse_result(
query_tokens, reader.get(), inverted_index_ctx->analyzer, column_name,
doris::segment_v2::InvertedIndexQueryType::MATCH_PHRASE_QUERY);
analyse_query_str_token(&query_tokens, inverted_index_ctx, match_query_str, column_name);
if (query_tokens.empty()) {
VLOG_DEBUG << fmt::format(
"token parser result is empty for query, "
"please check your query: '{}' and index parser: '{}'",
match_query_str, inverted_index_parser_type_to_string(parser_type));
match_query_str,
inverted_index_parser_type_to_string(inverted_index_ctx->parser_type));
return Status::OK();
}

Expand Down Expand Up @@ -345,25 +345,14 @@ Status FunctionMatchPhrasePrefix::execute_match(
ColumnUInt8::Container& result) const {
RETURN_IF_ERROR(check(context, name));

doris::InvertedIndexParserType parser_type = doris::InvertedIndexParserType::PARSER_UNKNOWN;
if (inverted_index_ctx) {
parser_type = inverted_index_ctx->parser_type;
}
VLOG_DEBUG << "begin to run FunctionMatchPhrasePrefix::execute_match, parser_type: "
<< inverted_index_parser_type_to_string(parser_type);

auto reader = doris::segment_v2::InvertedIndexReader::create_reader(inverted_index_ctx,
match_query_str);
std::vector<std::string> query_tokens;
doris::segment_v2::InvertedIndexReader::get_analyse_result(
query_tokens, reader.get(), inverted_index_ctx->analyzer, column_name,
doris::segment_v2::InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY);

analyse_query_str_token(&query_tokens, inverted_index_ctx, match_query_str, column_name);
if (query_tokens.empty()) {
VLOG_DEBUG << fmt::format(
"token parser result is empty for query, "
"please check your query: '{}' and index parser: '{}'",
match_query_str, inverted_index_parser_type_to_string(parser_type));
match_query_str,
inverted_index_parser_type_to_string(inverted_index_ctx->parser_type));
return Status::OK();
}

Expand Down Expand Up @@ -414,18 +403,15 @@ Status FunctionMatchRegexp::execute_match(FunctionContext* context, const std::s
ColumnUInt8::Container& result) const {
RETURN_IF_ERROR(check(context, name));

doris::InvertedIndexParserType parser_type = doris::InvertedIndexParserType::PARSER_UNKNOWN;
if (inverted_index_ctx) {
parser_type = inverted_index_ctx->parser_type;
}
VLOG_DEBUG << "begin to run FunctionMatchRegexp::execute_match, parser_type: "
<< inverted_index_parser_type_to_string(parser_type);
<< inverted_index_parser_type_to_string(inverted_index_ctx->parser_type);

if (match_query_str.empty()) {
VLOG_DEBUG << fmt::format(
"token parser result is empty for query, "
"please check your query: '{}' and index parser: '{}'",
match_query_str, inverted_index_parser_type_to_string(parser_type));
match_query_str,
inverted_index_parser_type_to_string(inverted_index_ctx->parser_type));
return Status::OK();
}

Expand Down
6 changes: 6 additions & 0 deletions be/src/vec/functions/match.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ const std::string MATCH_ALL_FUNCTION = "match_all";
// Function-name strings of the MATCH variants. get_query_type_from_fn_name()
// compares the function name against these (visible for the regexp and edge
// names) to pick the InvertedIndexQueryType.
const std::string MATCH_PHRASE_FUNCTION = "match_phrase";
const std::string MATCH_PHRASE_PREFIX_FUNCTION = "match_phrase_prefix";
// Note: the identifier contains PHRASE, but the string value is "match_regexp".
const std::string MATCH_PHRASE_REGEXP_FUNCTION = "match_regexp";
const std::string MATCH_PHRASE_EDGE_FUNCTION = "match_phrase_edge";

class FunctionMatchBase : public IFunction {
public:
Expand All @@ -81,6 +82,11 @@ class FunctionMatchBase : public IFunction {

doris::segment_v2::InvertedIndexQueryType get_query_type_from_fn_name() const;

void analyse_query_str_token(std::vector<std::string>* query_tokens,
InvertedIndexCtx* inverted_index_ctx,
const std::string& match_query_str,
const std::string& field_name) const;

std::vector<std::string> analyse_data_token(const std::string& column_name,
InvertedIndexCtx* inverted_index_ctx,
const ColumnString* string_col,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,18 @@ public static Map<String, String> getInvertedIndexCharFilter(Map<String, String>
return charFilterMap;
}

/**
 * Reads the parser lowercase flag from the index properties.
 *
 * @param properties index properties; may be null
 * @return the value stored under INVERTED_INDEX_PARSER_LOWERCASE_KEY parsed with
 *         Boolean.parseBoolean, or true when the map is null or holds no value for the key
 */
public static boolean getInvertedIndexParserLowercase(Map<String, String> properties) {
    if (properties == null) {
        return true;
    }
    String configured = properties.get(INVERTED_INDEX_PARSER_LOWERCASE_KEY);
    if (configured == null) {
        // not set: the flag defaults to true
        return true;
    }
    return Boolean.parseBoolean(configured);
}

/**
 * Reads the parser stopwords setting from the index properties.
 *
 * @param properties index properties; may be null
 * @return the value stored under INVERTED_INDEX_PARSER_STOPWORDS_KEY, or the empty
 *         string when the map is null or holds no value for the key
 */
public static String getInvertedIndexParserStopwords(Map<String, String> properties) {
    if (properties == null) {
        return "";
    }
    String stopwords = properties.get(INVERTED_INDEX_PARSER_STOPWORDS_KEY);
    // not set: fall back to the empty string
    return stopwords == null ? "" : stopwords;
}

public static void checkInvertedIndexParser(String indexColName, PrimitiveType colType,
Map<String, String> properties) throws AnalysisException {
String parser = null;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,8 @@ public static void initBuiltins(FunctionSet functionSet) {
// Parser settings of the inverted index. analyzeImpl() fills them from an Index and
// toThrift() writes them into the TMatchPredicate of the expression node.
private String invertedIndexParser;
private String invertedIndexParserMode;
private Map<String, String> invertedIndexCharFilter;
// Parser lowercase flag; stays true unless an Index supplies another value.
private boolean invertedIndexParserLowercase = true;
// Parser stopwords setting; stays "" unless an Index supplies another value.
private String invertedIndexParserStopwords = "";

private MatchPredicate() {
// use for serde only
Expand All @@ -178,23 +180,22 @@ protected MatchPredicate(MatchPredicate other) {
invertedIndexParser = other.invertedIndexParser;
invertedIndexParserMode = other.invertedIndexParserMode;
invertedIndexCharFilter = other.invertedIndexCharFilter;
invertedIndexParserLowercase = other.invertedIndexParserLowercase;
invertedIndexParserStopwords = other.invertedIndexParserStopwords;
}

/**
* use for Nereids ONLY
*/
public MatchPredicate(Operator op, Expr e1, Expr e2, Type retType,
NullableMode nullableMode, String invertedIndexParser, String invertedIndexParserMode,
Map<String, String> invertedIndexCharFilter) {
NullableMode nullableMode, Index invertedIndex) {
this(op, e1, e2);
if (invertedIndexParser != null) {
this.invertedIndexParser = invertedIndexParser;
}
if (invertedIndexParserMode != null) {
this.invertedIndexParserMode = invertedIndexParserMode;
}
if (invertedIndexParserMode != null) {
this.invertedIndexCharFilter = invertedIndexCharFilter;
if (invertedIndex != null) {
this.invertedIndexParser = invertedIndex.getInvertedIndexParser();
this.invertedIndexParserMode = invertedIndex.getInvertedIndexParserMode();
this.invertedIndexCharFilter = invertedIndex.getInvertedIndexCharFilter();
this.invertedIndexParserLowercase = invertedIndex.getInvertedIndexParserLowercase();
this.invertedIndexParserStopwords = invertedIndex.getInvertedIndexParserStopwords();
}
fn = new Function(new FunctionName(op.name), Lists.newArrayList(e1.getType(), e2.getType()), retType,
false, true, nullableMode);
Expand Down Expand Up @@ -228,6 +229,8 @@ protected void toThrift(TExprNode msg) {
msg.setOpcode(op.getOpcode());
msg.match_predicate = new TMatchPredicate(invertedIndexParser, invertedIndexParserMode);
msg.match_predicate.setCharFilterMap(invertedIndexCharFilter);
msg.match_predicate.setParserLowercase(invertedIndexParserLowercase);
msg.match_predicate.setParserStopwords(invertedIndexParserStopwords);
}

@Override
Expand Down Expand Up @@ -272,6 +275,8 @@ public void analyzeImpl(Analyzer analyzer) throws AnalysisException {
invertedIndexParser = index.getInvertedIndexParser();
invertedIndexParserMode = index.getInvertedIndexParserMode();
invertedIndexCharFilter = index.getInvertedIndexCharFilter();
invertedIndexParserLowercase = index.getInvertedIndexParserLowercase();
invertedIndexParserStopwords = index.getInvertedIndexParserStopwords();
break;
}
}
Expand Down
8 changes: 8 additions & 0 deletions fe/fe-core/src/main/java/org/apache/doris/catalog/Index.java
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,14 @@ public Map<String, String> getInvertedIndexCharFilter() {
return InvertedIndexUtil.getInvertedIndexCharFilter(properties);
}

/**
 * @return the parser lowercase flag that InvertedIndexUtil derives from this index's properties
 */
public boolean getInvertedIndexParserLowercase() {
    final boolean lowercase = InvertedIndexUtil.getInvertedIndexParserLowercase(properties);
    return lowercase;
}

/**
 * @return the parser stopwords setting that InvertedIndexUtil derives from this index's properties
 */
public String getInvertedIndexParserStopwords() {
    final String stopwords = InvertedIndexUtil.getInvertedIndexParserStopwords(properties);
    return stopwords;
}

/**
 * @return true only when this index's type is IndexDef.IndexType.INVERTED
 */
public boolean isLightIndexChangeSupported() {
    final boolean inverted = IndexDef.IndexType.INVERTED == indexType;
    return inverted;
}
Expand Down
Loading
Loading