From 519d20cfe9cabe1611739e1a0ceff42ad62b261d Mon Sep 17 00:00:00 2001 From: Zijie Date: Wed, 10 Apr 2024 18:02:43 +0800 Subject: [PATCH] Add GUC 'gp_random_insert_segments' to control the segments used for randomly distributed table insertion Introduces the 'gp_random_insert_segments' GUC to avoid generating an excessive number of fragmented files when a small amount of data is inserted into a cluster with a large number of segments (e.g., 1000 records into 100 segments). Fragmented data insertion can significantly degrade performance, especially when using append-optimized or cloud-based storage. With the 'gp_random_insert_segments' GUC, users can limit the number of segments used for data insertion into randomly distributed tables, which can significantly reduce the number of fragmented files. The default value 0 leaves the existing behavior unchanged; the limit takes effect only when the value is positive and smaller than the number of segments of the target, and it is applied both in create_motion_path_for_insert() and in GetTargetSeg() for COPY FROM. --- src/backend/cdb/cdbpath.c | 9 +++++++++ src/backend/commands/copyfrom.c | 6 ++++++ src/backend/utils/misc/guc_gp.c | 11 +++++++++++ src/include/utils/guc.h | 1 + src/include/utils/sync_guc_name.h | 1 + 5 files changed, 28 insertions(+) diff --git a/src/backend/cdb/cdbpath.c b/src/backend/cdb/cdbpath.c index 4fb64912cc0..7c8547e357d 100644 --- a/src/backend/cdb/cdbpath.c +++ b/src/backend/cdb/cdbpath.c @@ -2611,6 +2611,15 @@ create_motion_path_for_insert(PlannerInfo *root, GpPolicy *policy, } else elog(ERROR, "unrecognized policy type %u", policyType); + + if (CdbPathLocus_IsStrewn(subpath->locus) && + gp_random_insert_segments > 0 && + gp_random_insert_segments < CdbPathLocus_NumSegments(subpath->locus)) + { + /* Select limited random segments for data insertion. 
*/ + subpath->locus.numsegments = gp_random_insert_segments; + } + return subpath; } diff --git a/src/backend/commands/copyfrom.c b/src/backend/commands/copyfrom.c index acb01d91ec6..9e02abf749f 100644 --- a/src/backend/commands/copyfrom.c +++ b/src/backend/commands/copyfrom.c @@ -3324,6 +3324,12 @@ GetTargetSeg(GpDistributionData *distData, TupleTableSlot *slot) target_seg = cdbhashreduce(cdbHash); /* hash result segment */ } + else if (gp_random_insert_segments > 0 && + gp_random_insert_segments < policy->numsegments) + { + /* Select limited random segments for data insertion. */ + target_seg = cdbhashrandomseg(gp_random_insert_segments); + } else { /* diff --git a/src/backend/utils/misc/guc_gp.c b/src/backend/utils/misc/guc_gp.c index f7a1dd2b5cb..9a4a85feb5e 100644 --- a/src/backend/utils/misc/guc_gp.c +++ b/src/backend/utils/misc/guc_gp.c @@ -146,6 +146,7 @@ int gp_appendonly_compaction_threshold = 0; bool enable_parallel = false; int gp_appendonly_insert_files = 0; int gp_appendonly_insert_files_tuples_range = 0; +int gp_random_insert_segments = 0; bool gp_heap_require_relhasoids_match = true; bool gp_local_distributed_cache_stats = false; bool debug_xlog_record_read = false; @@ -3215,6 +3216,16 @@ struct config_int ConfigureNamesInt_gp[] = NULL, NULL, NULL }, + { + {"gp_random_insert_segments", PGC_USERSET, CUSTOM_OPTIONS, + gettext_noop("Use limited number of segments for random distributed table insertion."), + NULL + }, + &gp_random_insert_segments, + 0, 0, INT_MAX, + NULL, NULL, NULL + }, + { {"gp_workfile_max_entries", PGC_POSTMASTER, RESOURCES, gettext_noop("Sets the maximum number of entries that can be stored in the workfile directory"), diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h index 90f054e7f33..d49f00a5bab 100644 --- a/src/include/utils/guc.h +++ b/src/include/utils/guc.h @@ -300,6 +300,7 @@ extern bool gp_appendonly_compaction; extern bool enable_parallel; extern int gp_appendonly_insert_files; extern int 
gp_appendonly_insert_files_tuples_range; +extern int gp_random_insert_segments; extern bool enable_answer_query_using_materialized_views; extern bool enable_offload_entry_to_qe; /* diff --git a/src/include/utils/sync_guc_name.h b/src/include/utils/sync_guc_name.h index aefc91e8716..14c5125a2d2 100644 --- a/src/include/utils/sync_guc_name.h +++ b/src/include/utils/sync_guc_name.h @@ -152,3 +152,4 @@ "gp_resgroup_debug_wait_queue", "gp_appendonly_insert_files", "gp_appendonly_insert_files_tuples_range", + "gp_random_insert_segments",