From 563a6a4b454c9e98ed1d48960550a60453f986c6 Mon Sep 17 00:00:00 2001 From: chenmingyu Date: Tue, 22 Aug 2017 09:47:25 +0800 Subject: [PATCH] add detail explanation of FE configurations. --- conf/fe.conf | 8 +- fe/src/com/baidu/palo/common/Config.java | 316 +++++++++++++++++++++-- 2 files changed, 299 insertions(+), 25 deletions(-) diff --git a/conf/fe.conf b/conf/fe.conf index 7c2b403ff3a8fd..819abb12e378cc 100644 --- a/conf/fe.conf +++ b/conf/fe.conf @@ -1,6 +1,8 @@ -## -## the uppercase properties are read and exported by bin/start_fe.sh. -## +##################################################################### +## The uppercase properties are read and exported by bin/start_fe.sh. +## To see all Frontend configurations, +## see fe/src/com/baidu/palo/common/Config.java +##################################################################### # set JAVA_HOME or set JAVA_HOME in env variables # JAVA_HOME = diff --git a/fe/src/com/baidu/palo/common/Config.java b/fe/src/com/baidu/palo/common/Config.java index 7873a84d42596d..9897e8f38417ef 100644 --- a/fe/src/com/baidu/palo/common/Config.java +++ b/fe/src/com/baidu/palo/common/Config.java @@ -17,121 +17,393 @@ public class Config extends ConfigBase { + /* + * This specifies FE log dir. FE will produces 2 log files: + * fe.log: all logs of FE process. + * fe.warn.log all WARNING and ERROR log of FE process. + */ @ConfField public static String sys_log_dir = System.getenv("PALO_HOME") + "/log"; @ConfField public static String sys_log_level = "INFO"; // INFO, WARNING, ERROR, FATAL + /* + * The roll mode of FE log files. + * TIME-DAY: roll every day. + * TIME-HOUR: roll every hour. + * SIZE-MB-nnn: roll by size. + */ @ConfField public static String sys_log_roll_mode = "SIZE-MB-1024"; // TIME-DAY, TIME-HOUR, SIZE-MB-nnn - @ConfField public static int sys_log_roll_num = 10; // the config doesn't work if rollmode is TIME-* + /* + * Maximal FE log files to be kept. 
+ * Doesn't work if roll mode is TIME-* + */ + @ConfField public static int sys_log_roll_num = 10; - // verbose modules. VERBOSE level is implemented by log4j DEBUG level. + /* + * Verbose modules. VERBOSE level is implemented by log4j DEBUG level. + * eg: + * sys_log_verbose_modules = com.baidu.palo.catalog + * This will only print verbose log of files in package com.baidu.palo.catalog and all its sub packages. + */ @ConfField public static String[] sys_log_verbose_modules = {}; + /* + * This specifies FE audit log dir. + * Audit log fe.audit.log contains all SQL queries with related infos such as user, host, cost, status, etc. + */ @ConfField public static String audit_log_dir = System.getenv("PALO_HOME") + "/log"; + /* + * Slow query contains all queries whose cost exceeds *qe_slow_log_ms* + */ @ConfField public static String[] audit_log_modules = {"slow_query", "query"}; @ConfField public static String audit_log_roll_mode = "TIME-DAY"; // TIME-DAY, TIME-HOUR, SIZE-MB-nnn - @ConfField public static int audit_log_roll_num = 10; // the config doesn't work if rollmode is TIME-* + @ConfField + public static int audit_log_roll_num = 10; // Doesn't work if roll mode is TIME-* + /* + * Labels of finished or cancelled load jobs will be removed after *label_keep_max_second* + * The removed labels can be reused. + * Setting a shorter time will lower the FE memory usage. + * (Because all load jobs' info is kept in memory before being removed) + */ @ConfField public static int label_keep_max_second = 7 * 24 * 3600; // 7 days + /* + * Load label cleaner will run every *label_clean_interval_second* to clean the outdated jobs. + */ @ConfField public static int label_clean_interval_second = 4 * 3600; // 4 hours + /* + * If a load job stays in QUORUM_FINISHED state longer than *quorum_load_job_max_second*, + * a clone job will be triggered to help finish this load job. 
+ */ @ConfField public static int quorum_load_job_max_second = 24 * 3600; // 1 days // Configurations for meta data durability + /* + * Palo meta data will be saved here. + * It is highly recommended that the storage of this dir be: + * 1. High write performance (SSD) + * 2. Safe (RAID) + */ @ConfField public static String meta_dir = System.getenv("PALO_HOME") + "/palo-meta"; - @ConfField public static String edit_log_type = "BDB"; // BDB, LOCAL - @ConfField public static int edit_log_port = 9010; // Only used when edit_log_type = "BDB + /* + * Edit log type. + * BDB: write log to bdbje + * LOCAL: deprecated. + */ + @ConfField + public static String edit_log_type = "BDB"; + /* + * bdbje port + */ + @ConfField + public static int edit_log_port = 9010; + /* + * Master FE will save image every *edit_log_roll_num* meta journals. + */ @ConfField public static int edit_log_roll_num = 100000; + /* + * Non-master FE will stop offering service + * if meta data delay gap exceeds *meta_delay_toleration_second* + */ @ConfField public static int meta_delay_toleration_second = 300; // 5 min + /* + * Master FE sync policy of bdbje. + * more info, see: http://docs.oracle.com/cd/E17277_02/html/java/com/sleepycat/je/Durability.SyncPolicy.html + */ @ConfField public static String master_sync_policy = "WRITE_NO_SYNC"; // SYNC, NO_SYNC, WRITE_NO_SYNC + /* + * Follower FE sync policy of bdbje. + */ @ConfField public static String replica_sync_policy = "WRITE_NO_SYNC"; // SYNC, NO_SYNC, WRITE_NO_SYNC + /* + * Replica ack policy of bdbje. + * more info, see: http://docs.oracle.com/cd/E17277_02/html/java/com/sleepycat/je/Durability.ReplicaAckPolicy.html + */ @ConfField public static String replica_ack_policy = "SIMPLE_MAJORITY"; // ALL, NONE, SIMPLE_MAJORITY - // kudu master address + /* + * Kudu is currently not supported. 
+ */ @ConfField public static String kudu_master_addresses = "127.0.0.1:8030"; @ConfField public static int kudu_client_timeout_ms = 500; /* - * true means reset replication group. If all the electable nodes can not start, we can - * copy the meta data to another node and set this item to true to recover the metadata. - * In this scenario, we can get the newest image file contains all the meta data, then - * use the image file to restart the failed cluster. + * If true, FE will reset bdbje replication group (that is, to remove all electable nodes info) + * and is supposed to start as Master. + * If all the electable nodes can not start, we can copy the meta data + * to another node and set this config to true to try to restart the FE. */ @ConfField public static String metadata_failure_recovery = "false"; /* - * false means non-master node need to check its own journal is out of date or not in every replay loop. - * true means to ignore this meta check. - * this should only be set by rest api and only be set when master is truly out of services. + * If true, non-master FE will ignore the meta data delay gap between Master FE and itself, + * even if the metadata delay gap exceeds *meta_delay_toleration_second*. + * Non-master FE will still offer read service. + * + * This is helpful when you try to stop the Master FE for a relatively long time for some reason, + * but still wish the non-master FE can offer read service. */ @ConfField public static boolean ignore_meta_check = false; + /* + * FE http port + * Currently, all FEs' http port must be the same. 
+ */ @ConfField public static int http_port = 8030; + /* + * FE thrift server port + */ @ConfField public static int rpc_port = 9020; + /* + * FE mysql server port + */ @ConfField public static int query_port = 9030; - // Config cluster name and id + /* + * Cluster name will be shown as the title of web page + */ @ConfField public static String cluster_name = "Baidu Palo"; + /* + * Nodes (FE or BE) will be considered as belonging to the same Palo cluster if they have the same cluster id. + * Cluster id is usually a random integer generated when master FE starts for the first time. + * You can also specify one. + */ @ConfField public static int cluster_id = -1; // Configurations for load, clone, create table, alter table etc. We will rarely change them + /* + * Maximal waiting time for creating a single replica. + * eg. + * if you create a table with #m tablets and #n replicas for each tablet, + * the create table request will run at most (m * n * tablet_create_timeout_second) before timeout. + */ @ConfField public static int tablet_create_timeout_second = 1; - @ConfField public static int table_create_default_keys_num = 5; - @ConfField public static int table_create_default_distribute_num = 5; + + /* + * Load checker's running interval. + * A load job will transfer its state from PENDING to ETL to LOADING to FINISHED. + * So a load job will cost at least 3 check intervals to finish. + */ @ConfField public static int load_checker_interval_second = 5; + + /* + * Concurrency of HIGH priority pending load jobs. + * Load job priority is defined as HIGH or NORMAL. + * All mini batch load jobs are HIGH priority, other types of load jobs are NORMAL priority. + * Priority is set to avoid that a slow load job occupies a thread for a long time. + * This is just an internal optimized scheduling policy. + * Currently, you cannot specify the job priority manually, + * and do not change this unless you know what you are doing. 
+ */ @ConfField public static int load_pending_thread_num_high_priority = 3; + /* + * Concurrency of NORMAL priority pending load jobs. + * Do not change this unless you know what you are doing. + */ @ConfField public static int load_pending_thread_num_normal_priority = 10; + /* + * Concurrency of HIGH priority etl load jobs. + * Do not change this unless you know what you are doing. + */ @ConfField public static int load_etl_thread_num_high_priority = 3; + /* + * Concurrency of NORMAL priority etl load jobs. + * Do not change this unless you know what you are doing. + */ @ConfField public static int load_etl_thread_num_normal_priority = 10; + /* + * Not available. + */ @ConfField public static int load_input_size_limit_gb = 0; // GB, 0 is no limit + /* + * Not available. + */ @ConfField public static int load_running_job_num_limit = 0; // 0 is no limit + /* + * Default pull load timeout + */ + @ConfField + public static int pull_load_task_default_timeout_second = 3600; // 1hour + + /* + * Same meaning as *tablet_create_timeout_second*, but used when deleting a tablet. + */ @ConfField public static int tablet_delete_timeout_second = 2; + /* + * Clone checker's running interval. + */ @ConfField public static int clone_checker_interval_second = 300; + /* + * Default timeout of a single clone job. Set long enough to fit your replica size. + * The larger the replica data size is, the more time it will cost to finish clone. + */ @ConfField public static int clone_job_timeout_second = 7200; // 2h + /* + * Concurrency of LOW priority clone jobs. + * Concurrency of High priority clone jobs is currently unlimited. + */ @ConfField public static int clone_max_job_num = 100; + /* + * LOW priority clone job's delay trigger time. + * A clone job contains a tablet which needs to be cloned (recovery or migration). + * If the priority is LOW, it will be delayed *clone_low_priority_delay_second* + * after the job creation and then be executed. 
+ * This is to avoid a large number of clone jobs running at the same time only because a host is down for a short time. + * + * NOTICE that this config (and *clone_normal_priority_delay_second* as well) + * will not work if it's smaller than *clone_checker_interval_second* + */ @ConfField public static int clone_low_priority_delay_second = 600; + /* + * NORMAL priority clone job's delay trigger time. + */ @ConfField public static int clone_normal_priority_delay_second = 300; + /* + * HIGH priority clone job's delay trigger time. + */ @ConfField public static int clone_high_priority_delay_second = 0; + /* + * Balance threshold of data size in BE. + * The balance algorithm is: + * 1. Calculate the average used capacity(AUC) of the entire cluster. (total data size / total backends num) + * 2. The high water level is (AUC * (1 + clone_capacity_balance_threshold)) + * 3. The low water level is (AUC * (1 - clone_capacity_balance_threshold)) + * The Clone checker will try to move replica from high water level BE to low water level BE. + */ @ConfField public static double clone_capacity_balance_threshold = 0.2; + /* + * Balance threshold of num of replicas in Backends. + */ @ConfField public static double clone_distribution_balance_threshold = 0.2; + /* + * Maximal timeout of ALTER TABLE request. Set long enough to fit your table data size. + */ @ConfField public static int alter_table_timeout_second = 86400; // 1day + /* + * After ALTER TABLE has finished, deletion of the old schema replica is delayed, + * in case there are still some queries using the old schema replica. + */ @ConfField public static int alter_delete_base_delay_second = 600; // 10min + /* + * If a backend is down for *max_backend_down_time_second*, a BACKEND_DOWN event will be triggered. + * Do not set this unless you know what you are doing. + */ @ConfField public static int max_backend_down_time_second = 3600; // 1h + /* + * When creating a table (or partition), you can specify its storage media (HDD or SSD). 
+ * If set to SSD, this specifies the default duration that tablets will stay on SSD. + * After that, tablets will be moved to HDD automatically. + * You can set storage cooldown time in LOAD stmt. + */ @ConfField public static long storage_cooldown_second = 30 * 24 * 3600L; // 30 days + /* + * After dropping a database (table/partition), you can recover it by using RECOVER stmt. + * And this specifies the maximal data retention time. After that time, the data will be deleted permanently. + */ @ConfField public static long catalog_trash_expire_second = 86400L; // 1day - @ConfField public static int pull_load_task_default_timeout_second = 3600; // 1hour + /* + * Minimal bytes that a single broker scanner will read. + * Do not set this unless you know what you are doing. + */ @ConfField public static long min_bytes_per_broker_scanner = 67108864L; // 64MB + /* + * Maximal concurrency of broker scanners. + * Do not set this unless you know what you are doing. + */ @ConfField public static int max_broker_concurrency = 10; + + /* + * Export checker's running interval. + */ @ConfField public static int export_checker_interval_second = 5; + /* + * Concurrency of pending export jobs. + */ @ConfField public static int export_pending_thread_num = 5; + /* + * Number of threads to handle export jobs. + */ @ConfField public static int export_exporting_thread_num = 10; + /* + * Limitation of the concurrency of running export jobs. + * Default is no limit. + */ @ConfField public static int export_running_job_num_limit = 0; // 0 is no limit - @ConfField public static int export_task_default_timeout_second = 24 * 3600; + /* + * Default timeout of export jobs. + */ + @ConfField public static int export_task_default_timeout_second = 24 * 3600; // 24h + /* + * Concurrency of exporting tablets. + */ @ConfField public static int export_parallel_tablet_num = 5; + /* + * Labels of finished or cancelled export jobs will be removed after *export_keep_max_second*. + * The removed labels can be reused. 
+ */ @ConfField public static int export_keep_max_second = 7 * 24 * 3600; // 7 days // Configurations for consistency check + /* + * Consistency checker will run from *consistency_check_start_time* to *consistency_check_end_time*. + * Default is from 23:00 to 04:00 + */ @ConfField public static String consistency_check_start_time = "23"; @ConfField public static String consistency_check_end_time = "4"; + /* + * Default timeout of a single consistency check task. Set long enough to fit your tablet size. + */ @ConfField public static long check_consistency_default_timeout_second = 600; // 10 min // Configurations for query engine + /* + * Maximal number of connections per FE. + */ @ConfField public static int qe_max_connection = 1024; + /* + * Maximal number of connections per user, per FE. + */ @ConfField public static int max_conn_per_user = 100; + /* + * Default query timeout. + */ @ConfField public static int qe_query_timeout_second = 300; + /* + * If the response time of a query exceeds this threshold, it will be recorded in audit log as slow_query. + */ @ConfField public static long qe_slow_log_ms = 5000; - @ConfField public static int blacklist_backends_max_times = 6; + /* + * The interval of user resource publishing. + * User resource contains cgroup configurations of a user. + */ @ConfField public static int meta_resource_publish_interval_ms = 60000; // 1m + /* + * The default user resource publishing timeout. + */ @ConfField public static int meta_publish_timeout_ms = 1000; @ConfField public static boolean proxy_auth_enable = false; @ConfField public static String proxy_auth_magic_prefix = "x@8"; - // Limits on the number of expr children and the depth of an expr tree. - // exceed this limit may cause long analysis time while holding db read lock. + /* + * Limit on the number of expr children of an expr tree. + * Exceeding this limit may cause long analysis time while holding database read lock. + * Do not set this unless you know what you are doing. 
+ */ @ConfField public static int expr_children_limit = 10000; - // The expr depth limit is mostly due to our recursive implementation of toSql(). + /* + * Limit on the depth of an expr tree. + * Exceeding this limit may cause long analysis time while holding db read lock. + * Do not set this unless you know what you are doing. + */ @ConfField public static int expr_depth_limit = 3000; // Configurations for backup and restore + /* + * Plugins' path for BACKUP and RESTORE operations. Currently deprecated. + */ @ConfField public static String backup_plugin_path = "/tools/trans_file_tool/trans_files.sh"; // Configurations for hadoop dpp + /* + * The following configurations are not available. + */ @ConfField public static String dpp_hadoop_client_path = "/lib/hadoop-client/hadoop/bin/hadoop"; @ConfField public static long dpp_bytes_per_reduce = 100 * 1024 * 1024L; // 100M @ConfField public static String dpp_default_cluster = "palo-dpp";