Skip to content

Commit fb17f20

Browse files
authored
[fix](http) fix http url with incorrect character notation (apache#38420) (apache#39535)
## Proposed changes pick from master apache#38420
1 parent 830f250 commit fb17f20

File tree

6 files changed

+179
-34
lines changed

6 files changed

+179
-34
lines changed

be/src/http/http_client.cpp

+59-1
Original file line numberDiff line numberDiff line change
@@ -131,8 +131,11 @@ Status HttpClient::init(const std::string& url, bool set_fail_on_error) {
131131
LOG(WARNING) << "fail to set CURLOPT_WRITEDATA, msg=" << _to_errmsg(code);
132132
return Status::InternalError("fail to set CURLOPT_WRITEDATA");
133133
}
134+
135+
std::string escaped_url;
136+
RETURN_IF_ERROR(_escape_url(url, &escaped_url));
134137
// set url
135-
code = curl_easy_setopt(_curl, CURLOPT_URL, url.c_str());
138+
code = curl_easy_setopt(_curl, CURLOPT_URL, escaped_url.c_str());
136139
if (code != CURLE_OK) {
137140
LOG(WARNING) << "failed to set CURLOPT_URL, errmsg=" << _to_errmsg(code);
138141
return Status::InternalError("fail to set CURLOPT_URL");
@@ -290,4 +293,59 @@ Status HttpClient::execute_with_retry(int retry_times, int sleep_time,
290293
return status;
291294
}
292295

296+
// http://example.com/page?param1=value1&param2=value+with+spaces#section
297+
Status HttpClient::_escape_url(const std::string& url, std::string* escaped_url) {
298+
size_t query_pos = url.find('?');
299+
if (query_pos == std::string::npos) {
300+
*escaped_url = url;
301+
return Status::OK();
302+
}
303+
size_t fragment_pos = url.find('#');
304+
std::string query;
305+
std::string fragment;
306+
307+
if (fragment_pos == std::string::npos) {
308+
query = url.substr(query_pos + 1, url.length() - query_pos - 1);
309+
} else {
310+
query = url.substr(query_pos + 1, fragment_pos - query_pos - 1);
311+
fragment = url.substr(fragment_pos, url.length() - fragment_pos);
312+
}
313+
314+
std::string encoded_query;
315+
size_t ampersand_pos = query.find('&');
316+
size_t equal_pos;
317+
318+
if (ampersand_pos == std::string::npos) {
319+
ampersand_pos = query.length();
320+
}
321+
322+
while (true) {
323+
equal_pos = query.find('=');
324+
if (equal_pos != std::string::npos) {
325+
std::string key = query.substr(0, equal_pos);
326+
std::string value = query.substr(equal_pos + 1, ampersand_pos - equal_pos - 1);
327+
328+
auto encoded_value = std::unique_ptr<char, decltype(&curl_free)>(
329+
curl_easy_escape(_curl, value.c_str(), value.length()), &curl_free);
330+
if (encoded_value) {
331+
encoded_query += key + "=" + std::string(encoded_value.get());
332+
} else {
333+
return Status::InternalError("escape url failed, url={}", url);
334+
}
335+
} else {
336+
encoded_query += query.substr(0, ampersand_pos);
337+
}
338+
339+
if (ampersand_pos == query.length() || ampersand_pos == std::string::npos) {
340+
break;
341+
}
342+
343+
encoded_query += "&";
344+
query = query.substr(ampersand_pos + 1);
345+
ampersand_pos = query.find('&');
346+
}
347+
*escaped_url = url.substr(0, query_pos + 1) + encoded_query + fragment;
348+
return Status::OK();
349+
}
350+
293351
} // namespace doris

be/src/http/http_client.h

+9
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,15 @@ class HttpClient {
146146

147147
size_t on_response_data(const void* data, size_t length);
148148

149+
// The file name of the variant column with the inverted index contains %
150+
// such as: 020000000000003f624c4c322c568271060f9b5b274a4a95_0_10133@properties%2Emessage.idx
151+
// {rowset_id}_{seg_num}_{index_id}_{variant_column_name}{%2E}{extracted_column_name}.idx
152+
// We need to handle %, otherwise it will cause an HTTP 404 error.
153+
// Because the percent ("%") character serves as the indicator for percent-encoded octets,
154+
// it must be percent-encoded as "%25" for that octet to be used as data within a URI.
155+
// https://datatracker.ietf.org/doc/html/rfc3986
156+
Status _escape_url(const std::string& url, std::string* escaped_url);
157+
149158
private:
150159
const char* _to_errmsg(CURLcode code);
151160

be/src/olap/single_replica_compaction.cpp

+1-14
Original file line numberDiff line numberDiff line change
@@ -411,20 +411,7 @@ Status SingleReplicaCompaction::_download_files(DataDir* data_dir,
411411
return Status::InternalError("single compaction init curl failed");
412412
}
413413
for (auto& file_name : file_name_list) {
414-
// The file name of the variant column with the inverted index contains %
415-
// such as: 020000000000003f624c4c322c568271060f9b5b274a4a95_0_10133@properties%2Emessage.idx
416-
// {rowset_id}_{seg_num}_{index_id}_{variant_column_name}{%2E}{extracted_column_name}.idx
417-
// We need to handle %, otherwise it will cause an HTTP 404 error.
418-
// Because the percent ("%") character serves as the indicator for percent-encoded octets,
419-
// it must be percent-encoded as "%25" for that octet to be used as data within a URI.
420-
// https://datatracker.ietf.org/doc/html/rfc3986
421-
auto output = std::unique_ptr<char, decltype(&curl_free)>(
422-
curl_easy_escape(curl.get(), file_name.c_str(), file_name.length()), &curl_free);
423-
if (!output) {
424-
return Status::InternalError("escape file name failed, file name={}", file_name);
425-
}
426-
std::string encoded_filename(output.get());
427-
auto remote_file_url = remote_url_prefix + encoded_filename;
414+
auto remote_file_url = remote_url_prefix + file_name;
428415

429416
// get file length
430417
uint64_t file_size = 0;

be/src/olap/task/engine_clone_task.cpp

+1-19
Original file line numberDiff line numberDiff line change
@@ -525,26 +525,8 @@ Status EngineCloneTask::_download_files(DataDir* data_dir, const std::string& re
525525
uint64_t total_file_size = 0;
526526
MonotonicStopWatch watch;
527527
watch.start();
528-
auto curl = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>(curl_easy_init(),
529-
&curl_easy_cleanup);
530-
if (!curl) {
531-
return Status::InternalError("engine clone task init curl failed");
532-
}
533528
for (auto& file_name : file_name_list) {
534-
// The file name of the variant column with the inverted index contains %
535-
// such as: 020000000000003f624c4c322c568271060f9b5b274a4a95_0_10133@properties%2Emessage.idx
536-
// {rowset_id}_{seg_num}_{index_id}_{variant_column_name}{%2E}{extracted_column_name}.idx
537-
// We need to handle %, otherwise it will cause an HTTP 404 error.
538-
// Because the percent ("%") character serves as the indicator for percent-encoded octets,
539-
// it must be percent-encoded as "%25" for that octet to be used as data within a URI.
540-
// https://datatracker.ietf.org/doc/html/rfc3986
541-
auto output = std::unique_ptr<char, decltype(&curl_free)>(
542-
curl_easy_escape(curl.get(), file_name.c_str(), file_name.length()), &curl_free);
543-
if (!output) {
544-
return Status::InternalError("escape file name failed, file name={}", file_name);
545-
}
546-
std::string encoded_filename(output.get());
547-
auto remote_file_url = remote_url_prefix + encoded_filename;
529+
auto remote_file_url = remote_url_prefix + file_name;
548530

549531
// get file length
550532
uint64_t file_size = 0;

be/test/http/http_client_test.cpp

+42
Original file line numberDiff line numberDiff line change
@@ -299,4 +299,46 @@ TEST_F(HttpClientTest, download_file_md5) {
299299
close(fd);
300300
}
301301

302+
TEST_F(HttpClientTest, escape_url) {
303+
HttpClient client;
304+
client._curl = curl_easy_init();
305+
auto check_result = [&client](const auto& input_url, const auto& output_url) -> bool {
306+
std::string escaped_url;
307+
if (!client._escape_url(input_url, &escaped_url).ok()) {
308+
return false;
309+
}
310+
if (escaped_url != output_url) {
311+
return false;
312+
}
313+
return true;
314+
};
315+
std::string input_A = hostname + "/download_file?token=oxof&file_name=02x_0.dat";
316+
std::string output_A = hostname + "/download_file?token=oxof&file_name=02x_0.dat";
317+
ASSERT_TRUE(check_result(input_A, output_A));
318+
319+
std::string input_B = hostname + "/download_file?";
320+
std::string output_B = hostname + "/download_file?";
321+
ASSERT_TRUE(check_result(input_B, output_B));
322+
323+
std::string input_C = hostname + "/download_file";
324+
std::string output_C = hostname + "/download_file";
325+
ASSERT_TRUE(check_result(input_C, output_C));
326+
327+
std::string input_D = hostname + "/download_file?&";
328+
std::string output_D = hostname + "/download_file?&";
329+
ASSERT_TRUE(check_result(input_D, output_D));
330+
331+
std::string input_E = hostname + "/download_file?key=0x2E";
332+
std::string output_E = hostname + "/download_file?key=0x2E";
333+
ASSERT_TRUE(check_result(input_E, output_E));
334+
335+
std::string input_F = hostname + "/download_file?key=0x2E&key=%";
336+
std::string output_F = hostname + "/download_file?key=0x2E&key=%25";
337+
ASSERT_TRUE(check_result(input_F, output_F));
338+
339+
std::string input_G = hostname + "/download_file?key=0x2E&key=%2E#section";
340+
std::string output_G = hostname + "/download_file?key=0x2E&key=%252E#section";
341+
ASSERT_TRUE(check_result(input_G, output_G));
342+
}
343+
302344
} // namespace doris
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
// The cases is copied from https://github.com/trinodb/trino/tree/master
19+
// /testing/trino-product-tests/src/main/resources/sql-tests/testcases
20+
// and modified by Doris.
21+
22+
suite("test_single_replica_load", "p2") {
23+
24+
def load_json_data = {table_name, file_name ->
25+
// load the json data
26+
streamLoad {
27+
table "${table_name}"
28+
29+
// set http request header params
30+
set 'read_json_by_line', 'true'
31+
set 'format', 'json'
32+
set 'max_filter_ratio', '0.1'
33+
file file_name // import json file
34+
time 10000 // limit inflight 10s
35+
36+
// if declared a check callback, the default check condition will ignore.
37+
// So you must check all condition
38+
39+
check { result, exception, startTime, endTime ->
40+
if (exception != null) {
41+
throw exception
42+
}
43+
logger.info("Stream load ${file_name} result: ${result}".toString())
44+
def json = parseJson(result)
45+
assertEquals("success", json.Status.toLowerCase())
46+
assertTrue(json.NumberLoadedRows > 0 && json.LoadBytes > 0)
47+
}
48+
}
49+
}
50+
51+
def tableName = "test_single_replica_load"
52+
53+
sql "DROP TABLE IF EXISTS ${tableName}"
54+
sql """
55+
CREATE TABLE IF NOT EXISTS ${tableName} (
56+
k bigint,
57+
v variant,
58+
INDEX idx(v) USING INVERTED PROPERTIES("parser"="standard") COMMENT ''
59+
)
60+
DUPLICATE KEY(`k`)
61+
DISTRIBUTED BY HASH(k) BUCKETS 1
62+
properties("replication_num" = "2", "disable_auto_compaction" = "true", "inverted_index_storage_format" = "V1");
63+
"""
64+
load_json_data.call(tableName, """${getS3Url() + '/regression/gharchive.m/2015-01-01-0.json'}""")
65+
load_json_data.call(tableName, """${getS3Url() + '/regression/gharchive.m/2015-01-01-0.json'}""")
66+
load_json_data.call(tableName, """${getS3Url() + '/regression/gharchive.m/2015-01-01-0.json'}""")
67+
}

0 commit comments

Comments
 (0)