From 9ae36ec00e140a2cc898ed92be928d9dc731f75a Mon Sep 17 00:00:00 2001 From: Nadav Shatz Date: Sun, 2 Nov 2025 13:08:33 +0200 Subject: [PATCH] feat: external replication delay injection via external command Implementation: - Add replication_delay_source_cmd configuration string option - Command receives replica node identifiers in host:port format - Primary node omitted from command arguments and output - Handle -1 for down nodes (log without triggering failover) - Command outputs one delay value (ms) per replica - Falls back to builtin queries if command not configured - Timeout handling with replication_delay_source_timeout - Use psprintf() for cleaner code - Fix indentation and remove trailing whitespace Tests: - Verify command receives replicas only (primary omitted) - Verify host:port identifier format - Test -1 handling for down nodes - Test integer and float delay values - Test validation, timeouts, and error handling - Test wrong output counts and edge cases Documentation: - Document replication_delay_source_cmd with replica-only semantics - Document -1 for down nodes - Provide examples with correct output format - Update replication_delay_source_timeout docs --- doc/src/sgml/stream-check.sgml | 68 +++ src/config/pool_config_variables.c | 21 + src/include/pool_config.h | 3 +- src/sample/pgpool.conf.sample-stream | 14 + src/streaming_replication/pool_worker_child.c | 336 ++++++++++++++- .../041.external_replication_delay/README | 59 +++ .../041.external_replication_delay/test.sh | 401 ++++++++++++++++++ .../test_parsing.sh | 54 +++ .../test_validation.sh | 323 ++++++++++++++ 9 files changed, 1274 insertions(+), 5 deletions(-) create mode 100644 src/test/regression/tests/041.external_replication_delay/README create mode 100755 src/test/regression/tests/041.external_replication_delay/test.sh create mode 100755 src/test/regression/tests/041.external_replication_delay/test_parsing.sh create mode 100755 
src/test/regression/tests/041.external_replication_delay/test_validation.sh diff --git a/doc/src/sgml/stream-check.sgml b/doc/src/sgml/stream-check.sgml index d2ca3ca49..fc4799080 100644 --- a/doc/src/sgml/stream-check.sgml +++ b/doc/src/sgml/stream-check.sgml @@ -309,6 +309,74 @@ GRANT pg_monitor TO sr_check_user; + + replication_delay_source_cmd (string) + + replication_delay_source_cmd configuration parameter + + + + + Specifies an external command to retrieve replication delay information for replica nodes. + When this parameter is set and not empty, Pgpool-II uses the + external command instead of built-in database queries to obtain replication delays. + The command is executed as the Pgpool-II process user. + + + The command receives replica node identifiers as positional arguments, with the primary + node omitted. Each identifier is in the format <hostname>:<port>, + for example server1:5432 server2:5432. The order matches + Pgpool-II's backend order (excluding the primary), allowing the + script to correlate external metrics (such as from AWS CloudWatch for Aurora) to the correct nodes. + + + The command must write a single line to stdout containing one whitespace-separated delay value + per replica, in milliseconds, in the same order as the arguments. The primary node's delay is + implicitly zero and should not be included in the output. Delay values can be integers or + floating-point numbers. + + + Special value: -1 indicates a replica that is down but not yet detected + by Pgpool-II's health checks. Pgpool-II + will log this condition but rely on its own health-check logic to decide whether to trigger + failover; no failover is triggered solely by receiving -1. + + + Example for a 3-node cluster (1 primary + 2 replicas): if the command receives arguments + server1:5432 server2:5432, it should output "25.5 100" + to indicate the first replica has 25.5ms delay and the second has 100ms delay. + + + Default is empty (use built-in replication delay queries). 
+ + + This parameter can be changed by reloading the Pgpool-II configurations. + + + + + + replication_delay_source_timeout (integer) + + replication_delay_source_timeout configuration parameter + + + + + Specifies the timeout in seconds for the external command specified by + . + If the command does not finish within the timeout, Pgpool-II + logs an error and continues using the built-in method. + + + Default is 10 seconds. Valid range is 1-3600 seconds. + + + This parameter can be changed by reloading the Pgpool-II configurations. + + + + log_standby_delay (enum) diff --git a/src/config/pool_config_variables.c b/src/config/pool_config_variables.c index 62a05979a..a35d2200f 100644 --- a/src/config/pool_config_variables.c +++ b/src/config/pool_config_variables.c @@ -980,6 +980,16 @@ static struct config_string ConfigureNamesString[] = NULL, NULL, NULL, NULL }, + { + {"replication_delay_source_cmd", CFGCXT_RELOAD, STREAMING_REPLICATION_CONFIG, + "External command to retrieve replication delay information.", + CONFIG_VAR_TYPE_STRING, false, 0 + }, + &g_pool_config.replication_delay_source_cmd, + "", + NULL, NULL, NULL, NULL + }, + { {"failback_command", CFGCXT_RELOAD, FAILOVER_CONFIG, "Command to execute when backend node is attached.", @@ -2334,6 +2344,17 @@ static struct config_int ConfigureNamesInt[] = NULL, NULL, NULL }, + { + {"replication_delay_source_timeout", CFGCXT_RELOAD, STREAMING_REPLICATION_CONFIG, + "Timeout for external replication delay command execution in seconds.", + CONFIG_VAR_TYPE_INT, false, 0 + }, + &g_pool_config.replication_delay_source_timeout, + 10, + 1, 3600, + NULL, NULL, NULL + }, + /* End-of-list marker */ EMPTY_CONFIG_INT }; diff --git a/src/include/pool_config.h b/src/include/pool_config.h index 9160a31c8..5bc646805 100644 --- a/src/include/pool_config.h +++ b/src/include/pool_config.h @@ -86,7 +86,6 @@ typedef enum LogStandbyDelayModes LSD_NONE } LogStandbyDelayModes; - typedef enum MemCacheMethod { SHMEM_CACHE = 1, @@ -363,6 +362,8 
@@ typedef struct char *sr_check_password; /* password for sr_check_user */ char *sr_check_database; /* PostgreSQL database name for streaming * replication check */ + char *replication_delay_source_cmd; /* external command for replication delay */ + int replication_delay_source_timeout; /* timeout for external command in seconds */ char *failover_command; /* execute command when failover happens */ char *follow_primary_command; /* execute command when failover is * ended */ diff --git a/src/sample/pgpool.conf.sample-stream b/src/sample/pgpool.conf.sample-stream index ba6b923b0..34462fd59 100644 --- a/src/sample/pgpool.conf.sample-stream +++ b/src/sample/pgpool.conf.sample-stream @@ -519,6 +519,20 @@ backend_clustering_mode = streaming_replication #sr_check_database = 'postgres' # Database name for streaming replication check + +#replication_delay_source_cmd = '' + # External command to retrieve replication delay information + # If set, pgpool uses this command instead of built-in queries + # Command receives replica node identifiers (host:port) as arguments + # Primary node is omitted from arguments + # Command should output one delay value (in ms) per replica + # Use -1 to indicate a replica that is down but not yet detected + # Format: "25 100" for 2 replicas (e.g., 3-node cluster with 1 primary) + # Command runs as the pgpool process user +#replication_delay_source_timeout = 10 + # Timeout for external command execution in seconds + # Range: 1-3600 seconds (default: 10) + #delay_threshold = 0 # Threshold before not dispatching query to standby node # Unit is in bytes diff --git a/src/streaming_replication/pool_worker_child.c b/src/streaming_replication/pool_worker_child.c index 5bf19c37d..81dc82922 100644 --- a/src/streaming_replication/pool_worker_child.c +++ b/src/streaming_replication/pool_worker_child.c @@ -76,6 +76,8 @@ static volatile sig_atomic_t restart_request = 0; static void establish_persistent_connection(void); static void 
discard_persistent_connection(void); static void check_replication_time_lag(void); +static void check_replication_time_lag_with_cmd(void); +static char *build_instance_identifier_for_node(int node_id); static void CheckReplicationTimeLagErrorCb(void *arg); static unsigned long long int text_to_lsn(char *text); static RETSIGTYPE my_signal_handler(int sig); @@ -259,11 +261,16 @@ do_worker_child(void *params) POOL_NODE_STATUS *node_status; int i; - /* Do replication time lag checking */ - check_replication_time_lag(); + /* Do replication time lag checking */ + /* Use external command if replication_delay_source_cmd is configured */ + if (pool_config->replication_delay_source_cmd && + strlen(pool_config->replication_delay_source_cmd) > 0) + check_replication_time_lag_with_cmd(); + else + check_replication_time_lag(); - /* Check node status */ - node_status = verify_backend_node_status(slots); + /* Check node status */ + node_status = verify_backend_node_status(slots); for (i = 0; i < NUM_BACKENDS; i++) @@ -659,6 +666,362 @@ check_replication_time_lag(void) error_context_stack = callback.previous; }
+#define MAX_CMD_OUTPUT 4096
+#define MAX_REASONABLE_DELAY_MS 3600000.0	/* 1 hour in milliseconds */
+#define MAX_STORABLE_DELAY_MS 9.0e15	/* delay_ms * 1000 must still fit in uint64 */
+
+/* Set by command_timeout_handler() once the command timeout has expired */
+static volatile sig_atomic_t command_timeout_occurred = 0;
+
+/*
+ * Signal handler for command timeout.
+ * It only records the event; the interrupted read reports the error.
+ */
+static void
+command_timeout_handler(int sig)
+{
+	command_timeout_occurred = 1;
+}
+
+/*
+ * Arm the SIGALRM based timeout for the external command.
+ *
+ * sigaction() is used without SA_RESTART on purpose. signal() installs
+ * restarting handlers on many platforms (glibc among them); a blocked read()
+ * would then simply resume after SIGALRM and the timeout would go unnoticed.
+ * The previous disposition is saved in *old_action for
+ * disarm_command_timeout().
+ */
+static void
+arm_command_timeout(struct sigaction *old_action)
+{
+	struct sigaction action;
+
+	memset(&action, 0, sizeof(action));
+	action.sa_handler = command_timeout_handler;
+	sigemptyset(&action.sa_mask);
+
+	command_timeout_occurred = 0;
+	sigaction(SIGALRM, &action, old_action);
+	alarm(pool_config->replication_delay_source_timeout);
+}
+
+/*
+ * Cancel a pending timeout and restore the previous SIGALRM disposition.
+ * Calling it more than once is harmless.
+ */
+static void
+disarm_command_timeout(const struct sigaction *old_action)
+{
+	alarm(0);
+	sigaction(SIGALRM, old_action, NULL);
+}
+
+/*
+ * Build the shell command line: the configured command followed by one
+ * identifier per replica, in backend order, with the primary omitted.
+ * The caller releases the result with pfree().
+ */
+static char *
+build_replication_delay_command(void)
+{
+	char *command = pstrdup(pool_config->replication_delay_source_cmd);
+	int i;
+
+	for (i = 0; i < NUM_BACKENDS; i++)
+	{
+		char *ident;
+		char *extended;
+
+		if (i == REAL_PRIMARY_NODE_ID)
+			continue;	/* Skip primary node */
+
+		ident = build_instance_identifier_for_node(i);
+		extended = psprintf("%s %s", command, ident);
+		pfree(ident);
+		pfree(command);
+		command = extended;
+	}
+
+	return command;
+}
+
+/*
+ * Apply the command output to the backend status.
+ *
+ * "output" holds whitespace separated delay values in milliseconds, one per
+ * replica in backend order (primary omitted). The buffer is modified in
+ * place while it is tokenized.
+ */
+static void
+apply_replication_delay_output(char *output)
+{
+	BackendInfo *bkinfo;
+	char *counting_copy;
+	char *saveptr;
+	char *token;
+	int num_replicas = NUM_BACKENDS - 1;	/* Total nodes minus primary */
+	int token_count = 0;
+	int i;
+
+	/* threshold is in milliseconds, convert to microseconds */
+	uint64 delay_threshold_by_time = pool_config->delay_threshold_by_time * 1000;
+
+	/* Set primary node delay to 0 */
+	bkinfo = pool_get_node_info(REAL_PRIMARY_NODE_ID);
+	bkinfo->standby_delay = 0;
+	bkinfo->standby_delay_by_time = true;
+
+	/* Count tokens on a copy, because tokenizing overwrites the buffer */
+	counting_copy = pstrdup(output);
+	for (token = strtok_r(counting_copy, " \t\n", &saveptr);
+		 token != NULL;
+		 token = strtok_r(NULL, " \t\n", &saveptr))
+		token_count++;
+	pfree(counting_copy);
+
+	if (token_count != num_replicas)
+	{
+		ereport(WARNING,
+				(errmsg("replication delay command returned %d values, expected %d (one per replica, excluding primary)",
+						token_count, num_replicas),
+				 errhint("Command should output one delay value per replica node")));
+	}
+
+	/* Parse the output - one delay value per replica in order */
+	token = strtok_r(output, " \t\n", &saveptr);
+
+	for (i = 0; i < NUM_BACKENDS && token != NULL; i++)
+	{
+		const char *value;
+		char *endptr;
+		double delay_ms;
+
+		if (i == REAL_PRIMARY_NODE_ID)
+			continue;	/* Skip primary - it's not in the output */
+
+		/* This replica owns the current token; step to the next one now */
+		value = token;
+		token = strtok_r(NULL, " \t\n", &saveptr);
+
+		if (!VALID_BACKEND(i))
+			continue;	/* Skip invalid backend, its token is consumed */
+
+		delay_ms = strtod(value, &endptr);
+
+		/* Reject trailing garbage and NaN (NaN can be neither compared nor stored) */
+		if (*endptr != '\0' || delay_ms != delay_ms)
+		{
+			ereport(WARNING,
+					(errmsg("invalid delay value '%s' for node %d, treating as 0",
+							value, i)));
+			delay_ms = 0;
+		}
+
+		/* Handle -1 for down nodes */
+		if (delay_ms == -1.0)
+		{
+			ereport(LOG,
+					(errmsg("node %d reported as down by external command (delay -1), relying on health check for failover decision",
+							i)));
+			/* Keep previous delay value, don't trigger failover */
+			continue;
+		}
+
+		/* Validate delay value range */
+		if (delay_ms < 0)
+		{
+			ereport(WARNING,
+					(errmsg("negative delay value %.3f for node %d (other than -1), treating as 0",
+							delay_ms, i)));
+			delay_ms = 0;
+		}
+		else if (delay_ms > MAX_REASONABLE_DELAY_MS)
+		{
+			ereport(WARNING,
+					(errmsg("extremely large delay value %.3f for node %d",
+							delay_ms, i)));
+			/* Converting an out-of-range double to uint64 is undefined */
+			if (delay_ms > MAX_STORABLE_DELAY_MS)
+				delay_ms = MAX_STORABLE_DELAY_MS;
+		}
+
+		/* Convert delay from milliseconds to microseconds for internal storage */
+		bkinfo = pool_get_node_info(i);
+		bkinfo->standby_delay = (uint64) (delay_ms * 1000);
+		bkinfo->standby_delay_by_time = true;
+
+		/* Log delay if necessary */
+		if ((pool_config->log_standby_delay == LSD_ALWAYS && delay_ms > 0) ||
+			(pool_config->log_standby_delay == LSD_OVER_THRESHOLD &&
+			 bkinfo->standby_delay > delay_threshold_by_time))
+		{
+			ereport(LOG,
+					(errmsg("Replication of node: %d is behind %.3f second(s) from the primary server (node: %d) [external command]",
+							i, delay_ms / 1000, REAL_PRIMARY_NODE_ID)));
+		}
+	}
+}
+
+/*
+ * Check replication time lag using external command
+ *
+ * The external command receives only replica (standby) node identifiers as arguments,
+ * omitting the primary node. It returns delay values in milliseconds for each replica.
+ * A value of -1 indicates a node that is down but not yet detected by pgpool's health checks.
+ *
+ * NOTE: popen() does not expose the child's PID, so a command that exceeds the
+ * timeout cannot be killed here; pclose() still waits for it to exit.
+ */
+static void
+check_replication_time_lag_with_cmd(void)
+{
+	FILE *volatile fp = NULL;	/* changed inside PG_TRY, read in PG_CATCH */
+	char *command;
+	char *line;
+	struct sigaction old_action;
+	ErrorContextCallback callback;
+
+	if (NUM_BACKENDS <= 1)
+	{
+		/* If there's only one node, there's no point to do checking */
+		return;
+	}
+
+	if (REAL_PRIMARY_NODE_ID < 0)
+	{
+		/* No need to check if there's no primary */
+		return;
+	}
+
+	if (!VALID_BACKEND(REAL_PRIMARY_NODE_ID))
+	{
+		/* No need to check replication delay if primary is down */
+		return;
+	}
+
+	if (!pool_config->replication_delay_source_cmd ||
+		strlen(pool_config->replication_delay_source_cmd) == 0)
+	{
+		ereport(WARNING,
+				(errmsg("replication_delay_source_cmd is not configured"),
+				 errhint("Set replication_delay_source_cmd to use external command mode")));
+		/* Fall back to builtin method */
+		check_replication_time_lag();
+		return;
+	}
+
+	/*
+	 * Everything PG_CATCH needs is set up before PG_TRY, so that only fp
+	 * changes between entering PG_TRY and a possible error.
+	 */
+	command = build_replication_delay_command();
+	line = palloc(MAX_CMD_OUTPUT);
+
+	/*
+	 * Register an error context callback to throw proper context message
+	 */
+	callback.callback = CheckReplicationTimeLagErrorCb;
+	callback.arg = NULL;
+	callback.previous = error_context_stack;
+	error_context_stack = &callback;
+
+	ereport(DEBUG1,
+			(errmsg("executing replication delay command: %s", command)));
+
+	arm_command_timeout(&old_action);
+
+	/* Execute command as current process user */
+	PG_TRY();
+	{
+		fp = popen(command, "r");
+		if (fp == NULL)
+		{
+			ereport(ERROR,
+					(errmsg("failed to execute replication delay command: %s", command),
+					 errdetail("popen failed: %m")));
+		}
+
+		if (fgets(line, MAX_CMD_OUTPUT, fp) == NULL)
+		{
+			int pclose_result;
+
+			disarm_command_timeout(&old_action);
+			pclose_result = pclose(fp);
+			fp = NULL;
+
+			if (command_timeout_occurred)
+			{
+				ereport(ERROR,
+						(errmsg("replication delay command timed out after %d seconds: %s",
+								pool_config->replication_delay_source_timeout, command),
+						 errhint("Consider increasing replication_delay_source_timeout or optimizing the command")));
+			}
+			else
+			{
+				ereport(ERROR,
+						(errmsg("failed to read output from replication delay command: %s", command),
+						 errdetail("command exit status: %d", pclose_result)));
+			}
+		}
+
+		disarm_command_timeout(&old_action);
+
+		/* Check if output was truncated */
+		if (strlen(line) == MAX_CMD_OUTPUT - 1 && line[MAX_CMD_OUTPUT - 2] != '\n')
+		{
+			ereport(WARNING,
+					(errmsg("replication delay command output may have been truncated")));
+		}
+
+		pclose(fp);
+		fp = NULL;
+
+		apply_replication_delay_output(line);
+	}
+	PG_CATCH();
+	{
+		/* Cleanup in case of error */
+		disarm_command_timeout(&old_action);
+		if (fp)
+		{
+			pclose(fp);
+			fp = NULL;
+		}
+		pfree(line);
+		pfree(command);
+		error_context_stack = callback.previous;
+		PG_RE_THROW();
+	}
+	PG_END_TRY();
+
+	/* Normal cleanup */
+	pfree(line);
+	pfree(command);
+	error_context_stack = callback.previous;
+}
+
+/*
+ * build_instance_identifier_for_node
+ *		Build an identifier string for a backend node for passing to external commands.
+ *		Format: "<hostname>:<port>"; "unknown_node_<id>" when either part is missing.
+ *		The callers release the result with pfree().
+ */
+static char *
+build_instance_identifier_for_node(int node_id)
+{
+	BackendInfo *bi = pool_get_node_info(node_id);
+
+	if (!bi || bi->backend_hostname[0] == '\0' || bi->backend_port <= 0)
+	{
+		/* Fallback if hostname or port is not set */
+		return psprintf("unknown_node_%d", node_id);
+	}
+
+	/* Use hostname:port format */
+	return psprintf("%s:%d", bi->backend_hostname, bi->backend_port);
+}
+
 static void CheckReplicationTimeLagErrorCb(void *arg) { diff --git a/src/test/regression/tests/041.external_replication_delay/README b/src/test/regression/tests/041.external_replication_delay/README new file mode 100644 index 000000000..b4df5da40 --- /dev/null +++ b/src/test/regression/tests/041.external_replication_delay/README @@ -0,0 +1,59 @@ +External Replication Delay Command Test +======================================== + +This test verifies the external command replication delay source feature. + +Test Coverage: +- External command receives replica node identifiers only (primary omitted) +- Instance identifiers in host:port format +- Basic external command execution with integer and float millisecond values +- Delay threshold functionality with external commands +- Command execution as pgpool process user (no su wrapper) +- Error handling for missing/invalid commands +- Command execution failure scenarios +- Command timeout handling with configurable timeout values +- Input validation for invalid, negative (other than -1), and extremely large delay values +- Handling of -1 for down nodes (logged but no immediate failover) +- Wrong number of output values validation +- Multiple -1 values (multiple down replicas) +- Mixed scenarios (some replicas up, some down) +- Output truncation detection + +Files: +- test.sh: Main test script +- test_parsing.sh: Unit test for parsing logic +- test_validation.sh: Validation and edge case testing +- README: This documentation + +Key Changes from Original Version: +- Primary node is omitted from command
arguments +- Command receives only replica identifiers +- Instance identifiers are in host:port format (not application_name) +- Output format: one delay per replica (not per all nodes) +- -1 value indicates down replica without triggering failover +- Format example: "25 100" for 2 replicas (3-node cluster = 1 primary + 2 replicas) + +The test creates temporary command scripts that output delay values in the format: +"replica1_delay replica2_delay ..." + +Where delays are in milliseconds and can be integer or floating-point values. +Special value -1 indicates a replica that is down but not yet detected by pgpool. + +Test Environment: +- Uses streaming replication mode with 3 nodes +- Node 0 is primary (omitted from command arguments) +- Nodes 1 and 2 are replicas (included in command arguments) +- Configures sr_check_period = 1 second for faster testing +- Tests various delay scenarios and threshold behaviors + +Expected Behavior: +- External commands receive replica identifiers in host:port format +- Primary node identifier is never passed to command +- Command outputs one delay value per replica +- -1 values are logged but don't trigger immediate failover +- Delay values are parsed correctly (both int and float) +- Threshold comparisons work properly +- Error conditions are handled gracefully +- Commands timeout appropriately based on configuration +- Timeout errors provide helpful messages and hints +- Tests are reliable with proper wait mechanisms instead of fixed sleeps diff --git a/src/test/regression/tests/041.external_replication_delay/test.sh b/src/test/regression/tests/041.external_replication_delay/test.sh new file mode 100755 index 000000000..f02a086b1 --- /dev/null +++ b/src/test/regression/tests/041.external_replication_delay/test.sh @@ -0,0 +1,401 @@ +#!/usr/bin/env bash +#------------------------------------------------------------------- +# test script for external command replication delay source +# +source $TESTLIBS +TESTDIR=testdir 
+PG_CTL=$PGBIN/pg_ctl +PSQL="$PGBIN/psql -X " + +rm -fr $TESTDIR +mkdir $TESTDIR +cd $TESTDIR + +# create test environment +echo -n "creating test environment..." +$PGPOOL_SETUP -m s -n 3 || exit 1 +echo "done." +source ./bashrc.ports +export PGPORT=$PGPOOL_PORT + +# Create external command scripts for testing +# NOTE: Commands now only output delay values for REPLICAS (not primary) +cat > delay_cmd_static.sh << 'EOF' +#!/bin/bash +# Static delay values for replicas: node1=25ms, node2=50ms (node0 is primary, not included) +echo "25 50" +EOF +chmod +x delay_cmd_static.sh + +cat > delay_cmd_float.sh << 'EOF' +#!/bin/bash +# Float delay values for replicas: node1=25.5ms, node2=100.75ms +echo "25.5 100.75" +EOF +chmod +x delay_cmd_float.sh + +cat > delay_cmd_high.sh << 'EOF' +#!/bin/bash +# High delay values to test threshold: node1=2000ms, node2=3000ms +echo "2000 3000" +EOF +chmod +x delay_cmd_high.sh + +# ---------------------------------------------------------------------------------------- +echo "=== Test0: External command receives replica identifiers only (primary omitted) ===" +# ---------------------------------------------------------------------------------------- +# Command that captures its arguments and outputs valid delays for 2 replicas +cat > delay_cmd_args.sh << 'EOF' +#!/bin/bash +printf "%s " "$@" > args.txt +echo "25 50" +EOF +chmod +x delay_cmd_args.sh + +echo "replication_delay_source_cmd = './delay_cmd_args.sh'" >> etc/pgpool.conf +echo "sr_check_period = 1" >> etc/pgpool.conf +echo "log_min_messages = 'DEBUG1'" >> etc/pgpool.conf +# Reduce memory requirements for macOS shared memory limits +echo "num_init_children = 4" >> etc/pgpool.conf +echo "max_pool = 2" >> etc/pgpool.conf +# Disable query caching to avoid shared memory issues on macOS +echo "memory_cache_enabled = off" >> etc/pgpool.conf + +./startall +wait_for_pgpool_startup + +echo "Waiting for sr_check to pass args..." 
+for i in {1..10}; do + if [ -f args.txt ]; then + break + fi + sleep 1 +done + +if [ ! -f args.txt ]; then + echo fail: did not capture command arguments + ./shutdownall + exit 1 +fi + +ARGS_CONTENT=$(cat args.txt | sed 's/[[:space:]]*$//') +# Should receive 2 replica identifiers in host:port format (localhost:11003 localhost:11004 or server1:11003 server2:11004) +# Primary (localhost:11002 or server0:11002) should be omitted +if ! echo "$ARGS_CONTENT" | grep -qE "(server1|localhost):11003"; then + echo "fail: expected replica1:11003 in arguments, got: '$ARGS_CONTENT'" + ./shutdownall + exit 1 +fi +if ! echo "$ARGS_CONTENT" | grep -qE "(server2|localhost):11004"; then + echo "fail: expected replica2:11004 in arguments, got: '$ARGS_CONTENT'" + ./shutdownall + exit 1 +fi +if echo "$ARGS_CONTENT" | grep -qE "(server0|localhost):11002"; then + echo "fail: primary should not be in arguments, got: '$ARGS_CONTENT'" + ./shutdownall + exit 1 +fi + +echo ok: argument order correct - replicas only, primary omitted, host:port format +./shutdownall + +# ---------------------------------------------------------------------------------------- +echo "=== Test1: Basic external command with integer millisecond values ===" +# ---------------------------------------------------------------------------------------- +echo "replication_delay_source_cmd = './delay_cmd_static.sh'" >> etc/pgpool.conf +echo "sr_check_period = 1" >> etc/pgpool.conf +echo "log_standby_delay = 'always'" >> etc/pgpool.conf +echo "log_min_messages = 'DEBUG1'" >> etc/pgpool.conf + +./startall +wait_for_pgpool_startup + +$PSQL test </dev/null; then + echo "Command executed after ${i} seconds" + break + fi + sleep 1 +done + +$PSQL test </dev/null 2>&1 +if [ $? != 0 ];then + echo fail: external command was not executed + echo "Log contents:" + tail -20 log/pgpool.log + ./shutdownall + exit 1 +fi + +# Verify actual delay values were parsed +if ! 
$PSQL -t -c "SHOW POOL_NODES" test | grep -E "[0-9]+\.[0-9]+" >/dev/null; then + echo "Warning: No delay values found in POOL_NODES output" +fi + +# Check for delay log messages +grep "Replication of node.*external command" log/pgpool.log >/dev/null 2>&1 +if [ $? != 0 ];then + echo fail: external command delay logging not found + ./shutdownall + exit 1 +fi + +echo ok: basic external command test succeeded +./shutdownall + +# ---------------------------------------------------------------------------------------- +echo "=== Test2: External command with floating-point millisecond values ===" +# ---------------------------------------------------------------------------------------- +# Update configuration to use float command +sed -i.bak "s|delay_cmd_static.sh|delay_cmd_float.sh|" etc/pgpool.conf + +./startall +wait_for_pgpool_startup + +# Wait for sr_check to run with float values +echo "Waiting for sr_check with float values..." +for i in {1..10}; do + if grep -q "executing replication delay command.*delay_cmd_float.sh" log/pgpool.log 2>/dev/null; then + echo "Float command executed after ${i} seconds" + break + fi + sleep 1 +done + +$PSQL test </dev/null 2>&1 +if [ $? 
!= 0 ];then + echo fail: float command was not executed + ./shutdownall + exit 1 +fi + +echo ok: floating-point values test succeeded +./shutdownall + +# ---------------------------------------------------------------------------------------- +echo "=== Test3: External command with delay threshold ===" +# ---------------------------------------------------------------------------------------- +# Update configuration to use high delay command and set threshold +sed -i.bak "s|delay_cmd_float.sh|delay_cmd_high.sh|" etc/pgpool.conf +echo "delay_threshold_by_time = 1000" >> etc/pgpool.conf +echo "backend_weight0 = 0" >> etc/pgpool.conf # Force queries to standby normally +echo "backend_weight2 = 0" >> etc/pgpool.conf # Only use node 1 as standby + +./startall +wait_for_pgpool_startup + +# Wait for sr_check to run and detect high delays +echo "Waiting for sr_check with high delay values..." +for i in {1..10}; do + if grep -q "executing replication delay command.*delay_cmd_high.sh" log/pgpool.log 2>/dev/null; then + echo "High delay command executed after ${i} seconds" + break + fi + sleep 1 +done + +$PSQL test < 1000ms threshold), query should go to primary (node 0) +grep "SELECT \* FROM t1 LIMIT 1.*DB node id: 0" log/pgpool.log >/dev/null 2>&1 +if [ $? != 0 ];then + echo fail: query was not sent to primary node despite high delay + ./shutdownall + exit 1 +fi + +echo ok: delay threshold test succeeded +./shutdownall + +# ---------------------------------------------------------------------------------------- +echo "=== Test4: External command execution as process user ===" +# ---------------------------------------------------------------------------------------- +# Test that command runs as the current pgpool process user +sed -i.bak "s|delay_cmd_high.sh|delay_cmd_static.sh|" etc/pgpool.conf + +./startall +wait_for_pgpool_startup + +# Wait for sr_check to run +echo "Waiting for sr_check to run as process user..." 
+for i in {1..10}; do + if grep -q "executing replication delay command.*delay_cmd_static.sh" log/pgpool.log 2>/dev/null; then + echo "Command executed as process user after ${i} seconds" + break + fi + sleep 1 +done + +# Check that command was executed (without su wrapper) +grep "executing replication delay command.*delay_cmd_static.sh" log/pgpool.log >/dev/null 2>&1 +if [ $? != 0 ];then + echo fail: command was not executed as process user + ./shutdownall + exit 1 +fi + +# Verify no su command was used +if grep -q "executing replication delay command.*su.*" log/pgpool.log 2>/dev/null; then + echo fail: command should not use su wrapper + ./shutdownall + exit 1 +fi + +echo ok: process user execution test succeeded +./shutdownall + +# ---------------------------------------------------------------------------------------- +echo "=== Test5: Error handling - missing command ===" +# ---------------------------------------------------------------------------------------- +# Test error handling when command is not configured +sed -i.bak "s|replication_delay_source_cmd = './delay_cmd_static.sh'|replication_delay_source_cmd = ''|" etc/pgpool.conf + +./startall +wait_for_pgpool_startup + +# With empty command, should fall back to builtin method +# No specific error message expected - just verify it doesn't crash +sleep 3 + +echo ok: empty command test succeeded (fallback to builtin) +./shutdownall + +# ---------------------------------------------------------------------------------------- +echo "=== Test6: Error handling - command execution failure ===" +# ---------------------------------------------------------------------------------------- +# Test error handling when command fails +echo "replication_delay_source_cmd = './nonexistent_command.sh'" >> etc/pgpool.conf + +./startall +wait_for_pgpool_startup + +# Wait for sr_check to run with failing command +echo "Waiting for sr_check with failing command..." 
+for i in {1..5}; do + if grep -q "failed to execute replication delay command" log/pgpool.log 2>/dev/null; then + echo "Command failure detected after ${i} seconds" + break + fi + sleep 1 +done + +# Check for error message about command execution failure +grep "failed to execute replication delay command" log/pgpool.log >/dev/null 2>&1 +if [ $? != 0 ];then + echo fail: command execution failure not detected + ./shutdownall + exit 1 +fi + +echo ok: command failure test succeeded +./shutdownall + +# ---------------------------------------------------------------------------------------- +echo "=== Test7: Command timeout handling ===" +# ---------------------------------------------------------------------------------------- +# Create a command that takes longer than the timeout +cat > delay_cmd_slow.sh << 'EOF' +#!/bin/bash +# Slow command that takes 15 seconds (longer than default 10s timeout) +sleep 15 +echo "25 50" +EOF +chmod +x delay_cmd_slow.sh + +# Set a short timeout and use the slow command +sed -i.bak "s|replication_delay_source_cmd = './nonexistent_command.sh'|replication_delay_source_cmd = './delay_cmd_slow.sh'|" etc/pgpool.conf +echo "replication_delay_source_timeout = 3" >> etc/pgpool.conf + +./startall +wait_for_pgpool_startup + +# Wait for sr_check to run and timeout +echo "Waiting for command timeout..." +for i in {1..15}; do + if grep -q "replication delay command timed out" log/pgpool.log 2>/dev/null; then + echo "Command timeout detected after ${i} seconds" + break + fi + sleep 1 +done + +# Check for timeout error message +grep "replication delay command timed out after 3 seconds" log/pgpool.log >/dev/null 2>&1 +if [ $? 
!= 0 ];then + echo fail: command timeout not detected + ./shutdownall + exit 1 +fi + +echo ok: command timeout test succeeded +./shutdownall + +# ---------------------------------------------------------------------------------------- +echo "=== Test8: Handling of -1 for down nodes ===" +# ---------------------------------------------------------------------------------------- +# Create a command that returns -1 for one replica +cat > delay_cmd_with_down_node.sh << 'EOF' +#!/bin/bash +# Return -1 for first replica (indicating it's down), normal value for second +echo "-1 50" +EOF +chmod +x delay_cmd_with_down_node.sh + +# Reset config +rm -f etc/pgpool.conf.bak +sed -i.bak "s|delay_cmd_slow.sh|delay_cmd_with_down_node.sh|" etc/pgpool.conf +sed -i.bak "s|replication_delay_source_timeout = 3|replication_delay_source_timeout = 10|" etc/pgpool.conf + +./startall +wait_for_pgpool_startup + +# Wait for sr_check to process -1 value +echo "Waiting for sr_check to process -1 value..." +for i in {1..10}; do + if grep -q "node.*reported as down by external command.*delay -1" log/pgpool.log 2>/dev/null; then + echo "-1 handling detected after ${i} seconds" + break + fi + sleep 1 +done + +# Check for -1 logging message +grep "node.*reported as down by external command.*delay -1.*relying on health check" log/pgpool.log >/dev/null 2>&1 +if [ $? != 0 ];then + echo fail: -1 handling message not found + ./shutdownall + exit 1 +fi + +# Verify that pgpool didn't crash or trigger failover just from -1 +if grep -q "failover" log/pgpool.log 2>/dev/null; then + echo "fail: -1 should not trigger immediate failover" + ./shutdownall + exit 1 +fi + +echo ok: -1 handling test succeeded +./shutdownall + +echo "All external replication delay tests passed!" 
+exit 0
diff --git a/src/test/regression/tests/041.external_replication_delay/test_parsing.sh b/src/test/regression/tests/041.external_replication_delay/test_parsing.sh
new file mode 100755
index 000000000..82fdad144
--- /dev/null
+++ b/src/test/regression/tests/041.external_replication_delay/test_parsing.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+#-------------------------------------------------------------------
+# Unit test for the output format of replication_delay_source_cmd
+# This checks sample output lines without needing a full pgpool setup.
+# The checks here are a shell approximation of the format (one plain
+# decimal number per replica, primary omitted); they do not run pgpool's
+# own parser.
+#
+
+echo "=== Testing external command output parsing ==="
+
+failed=0
+
+# is_valid_output EXPECTED_COUNT OUTPUT_LINE
+# Succeeds when OUTPUT_LINE splits on whitespace into exactly EXPECTED_COUNT
+# values and every value is an optionally negative decimal number.
+is_valid_output() {
+	local expected=$1 line=$2
+	local number='^-?[0-9]+(\.[0-9]+)?$'
+	local -a values
+	local v
+
+	read -r -a values <<< "$line"
+	[ "${#values[@]}" -eq "$expected" ] || return 1
+	for v in "${values[@]}"; do
+		[[ $v =~ $number ]] || return 1
+	done
+	return 0
+}
+
+# run_case DESCRIPTION WANT EXPECTED_COUNT OUTPUT_LINE
+# WANT is "accept" or "reject"; a mismatch marks the whole run as failed.
+run_case() {
+	local desc=$1 want=$2 expected=$3 line=$4
+	local got=reject
+
+	is_valid_output "$expected" "$line" && got=accept
+	echo "$desc"
+	echo "Output: $line"
+	if [ "$got" = "$want" ]; then
+		echo "ok: ${got}ed"
+	else
+		echo "fail: ${got}ed, expected ${want}"
+		failed=1
+	fi
+	echo ""
+}
+
+# Samples are for a 3-node cluster (1 primary + 2 replicas): two values each.
+run_case "Test 1: Integer millisecond values" accept 2 "25 50"
+run_case "Test 2: Floating-point millisecond values" accept 2 "25.5 100.75"
+run_case "Test 3: High precision values" accept 2 "0.001 999.999"
+run_case "Test 4: All zero values" accept 2 "0 0"
+run_case "Test 5: Large delay values" accept 2 "5000 10000"
+run_case "Test 6: Mixed integer and float values" accept 2 "25 50.5"
+run_case "Test 7: -1 for a replica reported as down" accept 2 "-1 50"
+
+# Malformed lines must be rejected, otherwise the cases above prove nothing.
+run_case "Test 8: Non-numeric value" reject 2 "invalid_value 50.5"
+run_case "Test 9: Wrong number of values" reject 2 "25"
+
+if [ "$failed" -ne 0 ]; then
+	echo "Some parsing tests failed."
+	exit 1
+fi
+
+echo "All parsing tests completed. These outputs should be parseable by the external command feature."
+exit 0
diff --git a/src/test/regression/tests/041.external_replication_delay/test_validation.sh b/src/test/regression/tests/041.external_replication_delay/test_validation.sh
new file mode 100755
index 000000000..2cd4a7f0b
--- /dev/null
+++ b/src/test/regression/tests/041.external_replication_delay/test_validation.sh
@@ -0,0 +1,270 @@
+#!/usr/bin/env bash
+#-------------------------------------------------------------------
+# test script for external command validation and edge cases
+#
+# Each test points replication_delay_source_cmd at a script with a fixed
+# output, restarts the cluster and looks in log/pgpool.log for the message
+# this test expects for that output.
+#
+source "$TESTLIBS"
+TESTDIR=testdir_validation
+PG_CTL=$PGBIN/pg_ctl
+PSQL="$PGBIN/psql -X "
+
+rm -fr $TESTDIR
+mkdir $TESTDIR
+cd $TESTDIR || exit 1
+
+# wait_for_log PATTERN TIMEOUT [MIN_MATCHES]
+# Poll log/pgpool.log once per second until at least MIN_MATCHES (default 1)
+# lines match PATTERN. Returns non-zero when TIMEOUT polls pass without that.
+# A log file that cannot be read counts as zero matches.
+wait_for_log() {
+	local pattern=$1 timeout=$2 want=${3:-1}
+	local found i
+
+	for ((i = 1; i <= timeout; i++)); do
+		found=$(grep -c -- "$pattern" log/pgpool.log 2>/dev/null)
+		if [ "${found:-0}" -ge "$want" ]; then
+			echo "Detected after ${i} seconds"
+			return 0
+		fi
+		sleep 1
+	done
+	return 1
+}
+
+# fail MESSAGE
+# Report a failed check, stop the cluster and abort the script.
+fail() {
+	echo "fail: $*"
+	./shutdownall
+	exit 1
+}
+
+# create test environment
+echo -n "creating test environment..."
+$PGPOOL_SETUP -m s -n 3 || exit 1
+echo "done."
+source ./bashrc.ports
+export PGPORT=$PGPOOL_PORT
+
+# Create test command scripts
+# NOTE: All commands output values for REPLICAS only (primary omitted)
+cat > delay_cmd_validation.sh << 'EOF'
+#!/bin/bash
+# Test validation: output with invalid values for 2 replicas
+echo "invalid_value 50.5"
+EOF
+chmod +x delay_cmd_validation.sh
+
+cat > delay_cmd_negative.sh << 'EOF'
+#!/bin/bash
+# Test negative values (other than -1)
+echo "-25 50"
+EOF
+chmod +x delay_cmd_negative.sh
+
+cat > delay_cmd_large.sh << 'EOF'
+#!/bin/bash
+# Test extremely large values
+echo "9999999 50"
+EOF
+chmod +x delay_cmd_large.sh
+
+cat > delay_cmd_wrong_count.sh << 'EOF'
+#!/bin/bash
+# Test wrong number of values (only 1 instead of 2 for 2 replicas)
+echo "25"
+EOF
+chmod +x delay_cmd_wrong_count.sh
+
+# ----------------------------------------------------------------------------------------
+echo "=== Test1: Validation of invalid delay values ==="
+# ----------------------------------------------------------------------------------------
+echo "replication_delay_source_cmd = './delay_cmd_validation.sh'" >> etc/pgpool.conf
+echo "sr_check_period = 1" >> etc/pgpool.conf
+echo "log_standby_delay = 'always'" >> etc/pgpool.conf
+echo "log_min_messages = 'DEBUG1'" >> etc/pgpool.conf
+# Reduce memory requirements for macOS shared memory limits
+echo "num_init_children = 4" >> etc/pgpool.conf
+echo "max_pool = 2" >> etc/pgpool.conf
+# Disable query caching to avoid shared memory issues on macOS
+echo "memory_cache_enabled = off" >> etc/pgpool.conf
+
+./startall
+wait_for_pgpool_startup
+
+# NOTE(review): the text between "$PSQL test <" and the log check below was
+# lost in this copy of the patch. The query is a placeholder and the wait is
+# modelled on the other tests; confirm both against the upstream patch.
+$PSQL test <<EOF
+SELECT 1;
+EOF
+
+# Wait for sr_check to run and check for the validation warning
+echo "Waiting for invalid value test..."
+wait_for_log "invalid delay value 'invalid_value' for node" 10 ||
+	fail "validation warning not found"
+
+echo ok: invalid value validation test succeeded
+./shutdownall
+
+# ----------------------------------------------------------------------------------------
+echo "=== Test2: Negative delay values (other than -1) ==="
+# ----------------------------------------------------------------------------------------
+sed -i.bak "s|delay_cmd_validation.sh|delay_cmd_negative.sh|" etc/pgpool.conf
+
+./startall
+wait_for_pgpool_startup
+
+# Wait for sr_check to run and check for the negative value warning
+echo "Waiting for negative value test..."
+wait_for_log "negative delay value.*other than -1.*treating as 0" 10 ||
+	fail "negative value warning not found"
+
+echo ok: negative value validation test succeeded
+./shutdownall
+
+# ----------------------------------------------------------------------------------------
+echo "=== Test3: Extremely large delay values ==="
+# ----------------------------------------------------------------------------------------
+sed -i.bak "s|delay_cmd_negative.sh|delay_cmd_large.sh|" etc/pgpool.conf
+
+./startall
+wait_for_pgpool_startup
+
+# Wait for sr_check to run and check for the large value warning
+echo "Waiting for large value test..."
+wait_for_log "extremely large delay value.*for node" 10 ||
+	fail "large value warning not found"
+
+echo ok: large value validation test succeeded
+./shutdownall
+
+# ----------------------------------------------------------------------------------------
+echo "=== Test4: Wrong number of output values ==="
+# ----------------------------------------------------------------------------------------
+sed -i.bak "s|delay_cmd_large.sh|delay_cmd_wrong_count.sh|" etc/pgpool.conf
+
+./startall
+wait_for_pgpool_startup
+
+# Wait for sr_check to run and check for the wrong count warning
+echo "Waiting for wrong count test..."
+wait_for_log "returned.*values, expected.*replica.*Command should output one delay value per replica" 10 ||
+	fail "wrong count validation test not found"
+
+echo ok: wrong count validation test succeeded
+./shutdownall
+
+# ----------------------------------------------------------------------------------------
+echo "=== Test5: Multiple -1 values ==="
+# ----------------------------------------------------------------------------------------
+cat > delay_cmd_multi_down.sh << 'EOF'
+#!/bin/bash
+# Test multiple replicas down
+echo "-1 -1"
+EOF
+chmod +x delay_cmd_multi_down.sh
+
+sed -i.bak "s|delay_cmd_wrong_count.sh|delay_cmd_multi_down.sh|" etc/pgpool.conf
+
+./startall
+wait_for_pgpool_startup
+
+# Wait until both replicas have been reported; stopping at the first message
+# could count the log before the second one is written.
+echo "Waiting for multi-down test..."
+DOWN_PATTERN="node.*reported as down by external command.*delay -1"
+wait_for_log "$DOWN_PATTERN" 10 2
+
+# Check for multiple -1 handling
+DOWN_COUNT=$(grep -c -- "$DOWN_PATTERN" log/pgpool.log 2>/dev/null)
+if [ "${DOWN_COUNT:-0}" -lt 2 ]; then
+	fail "expected 2 down node messages, found ${DOWN_COUNT:-0}"
+fi
+
+echo ok: multiple -1 handling test succeeded
+./shutdownall
+
+# ----------------------------------------------------------------------------------------
+echo "=== Test6: Command timeout with different timeout values ==="
+# ----------------------------------------------------------------------------------------
+cat > delay_cmd_timeout.sh << 'EOF'
+#!/bin/bash
+# Command that takes 5 seconds
+sleep 5
+echo "25 50"
+EOF
+chmod +x delay_cmd_timeout.sh
+
+# Test with timeout shorter than command duration
+sed -i.bak "s|delay_cmd_multi_down.sh|delay_cmd_timeout.sh|" etc/pgpool.conf
+echo "replication_delay_source_timeout = 2" >> etc/pgpool.conf
+
+./startall
+wait_for_pgpool_startup
+
+# Wait for timeout
+echo "Waiting for timeout test (2s timeout, 5s command)..."
+wait_for_log "replication delay command timed out after 2 seconds" 10 ||
+	fail "timeout not detected"
+
+echo ok: timeout test succeeded
+./shutdownall
+
+# Test with timeout longer than command duration
+sed -i.bak "s|replication_delay_source_timeout = 2|replication_delay_source_timeout = 10|" etc/pgpool.conf
+
+./startall
+wait_for_pgpool_startup
+
+# The message below is presumably logged when the command is launched
+# (confirm against pool_worker_child.c). Leave the 5 second command time to
+# finish: a timeout check made right after launch could never fail.
+echo "Waiting for successful execution (10s timeout, 5s command)..."
+if wait_for_log "executing replication delay command.*delay_cmd_timeout.sh" 15; then
+	sleep 6
+else
+	echo "Note: command launch message not seen within 15 seconds"
+fi
+
+# Should not timeout this time
+if grep -q "replication delay command timed out" log/pgpool.log 2>/dev/null; then
+	fail "command should not have timed out with 10s timeout"
+fi
+
+echo ok: extended timeout test succeeded
+./shutdownall
+
+# ----------------------------------------------------------------------------------------
+echo "=== Test7: Mix of valid delays and -1 ==="
+# ----------------------------------------------------------------------------------------
+cat > delay_cmd_mixed.sh << 'EOF'
+#!/bin/bash
+# One replica up (25ms), one down (-1)
+echo "25 -1"
+EOF
+chmod +x delay_cmd_mixed.sh
+
+sed -i.bak "s|delay_cmd_timeout.sh|delay_cmd_mixed.sh|" etc/pgpool.conf
+
+./startall
+wait_for_pgpool_startup
+
+# Wait for sr_check; should log one -1 and process one normal delay
+echo "Waiting for mixed delay test..."
+wait_for_log "node.*reported as down by external command.*delay -1" 10 ||
+	fail "-1 not logged"
+
+# Should also log the normal replica delay
+if ! grep -q "Replication of node.*external command" log/pgpool.log 2>/dev/null; then
+	echo "Note: Normal replica delay logging may not be visible with log_standby_delay settings"
+fi
+
+echo ok: mixed delay handling test succeeded
+./shutdownall
+
+echo "All validation tests passed!"
+exit 0
-- 
2.51.1