From b43c38293e62163ac6371ca671874d70aa432e3d Mon Sep 17 00:00:00 2001 From: Nadav Shatz Date: Tue, 18 Nov 2025 13:19:02 +0200 Subject: [PATCH] Fix multiple issues in external replication delay feature This commit addresses several critical bugs and code quality issues in the external replication delay command implementation: Memory Safety: - Initialize command pointer to NULL to prevent undefined behavior when freeing uninitialized memory Security: - Add hostname validation to detect shell metacharacters - Warn about potentially dangerous characters in hostnames - Changed warning level from WARNING to LOG to avoid excessive alerts Signal Handler Race Conditions: - Reorder signal(SIGALRM, SIG_DFL) before alarm(0) in all error paths - Prevents race condition where alarm could trigger after handler removal - Fixes: popen failure, fgets failure, and exception handler paths Primary Node Race Condition: - Capture REAL_PRIMARY_NODE_ID into local variable at function start - Use captured value consistently throughout function execution - Prevents inconsistencies if primary changes during execution Output Parsing Robustness: - Enhanced validation for external command output - Provide specific warnings for: no output, too few values, too many values - Improved error messages with helpful hints for operators Code Quality: - Remove unused replica_idx variable to eliminate compiler warnings - Improve code readability and maintainability All changes have been tested in a Docker-based integration environment with PostgreSQL primary/replica setup and confirmed working correctly. --- src/streaming_replication/pool_worker_child.c | 68 +++++++++++++------ 1 file changed, 48 insertions(+), 20 deletions(-) diff --git a/src/streaming_replication/pool_worker_child.c b/src/streaming_replication/pool_worker_child.c index 81dc82922..1ae290e28 100644 --- a/src/streaming_replication/pool_worker_child.c +++ b/src/streaming_replication/pool_worker_child.c @@ -694,11 +694,10 @@ static void check_replication_time_lag_with_cmd(void) { FILE *fp; - char *command; + char *command = NULL; char *line; char *token; char *saveptr; - int replica_idx; int num_replicas; double delay_ms; uint64 delay; @@ -724,6 +723,9 @@ check_replication_time_lag_with_cmd(void) return; } + /* Capture primary node ID to avoid race conditions during execution */ + int primary_node_id = REAL_PRIMARY_NODE_ID; + if (!pool_config->replication_delay_source_cmd || strlen(pool_config->replication_delay_source_cmd) == 0) { @@ -757,7 +759,7 @@ check_replication_time_lag_with_cmd(void) /* Calculate total command length including space-separated replica identifiers */ for (int i = 0; i < NUM_BACKENDS; i++) { - if (i == REAL_PRIMARY_NODE_ID) + if (i == primary_node_id) continue; /* Skip primary node */ char *ident = build_instance_identifier_for_node(i); @@ -771,7 +773,7 @@ check_replication_time_lag_with_cmd(void) /* Append replica identifiers */ for (int i = 0; i < NUM_BACKENDS; i++) { - if (i == REAL_PRIMARY_NODE_ID) + if (i == primary_node_id) continue; /* Skip primary node */ char *ident = build_instance_identifier_for_node(i); @@ -791,8 +793,9 @@ check_replication_time_lag_with_cmd(void) fp = popen(command, "r"); if (fp == NULL) { - alarm(0); /* Cancel alarm */ + /* Cancel timeout: restore signal handler first to avoid race condition */ signal(SIGALRM, SIG_DFL); + alarm(0); ereport(ERROR, (errmsg("failed to execute replication delay command: %s", command), errdetail("popen failed: %m"))); @@ -802,8 +805,9 @@ check_replication_time_lag_with_cmd(void) { int pclose_result = pclose(fp); fp = NULL; - alarm(0); /* Cancel alarm */ + /* Cancel timeout: restore signal handler first to avoid race condition */ signal(SIGALRM, SIG_DFL); + alarm(0); if (command_timeout_occurred) { @@ -820,8 +824,9 @@ check_replication_time_lag_with_cmd(void) } } - alarm(0); /* Cancel alarm */ + /* Cancel timeout: restore signal handler first to avoid race condition */ signal(SIGALRM, SIG_DFL); + alarm(0); /* Check if output was truncated */ if (strlen(line) == MAX_CMD_OUTPUT - 1 && line[MAX_CMD_OUTPUT - 2] != '\n') @@ -836,7 +841,7 @@ check_replication_time_lag_with_cmd(void) command = NULL; /* Set primary node delay to 0 */ - bkinfo = pool_get_node_info(REAL_PRIMARY_NODE_ID); + bkinfo = pool_get_node_info(primary_node_id); bkinfo->standby_delay = 0; bkinfo->standby_delay_by_time = true; @@ -853,28 +858,40 @@ check_replication_time_lag_with_cmd(void) } pfree(line_copy); - if (token_count != num_replicas) + /* Validate output format */ + if (token_count == 0) + { + ereport(WARNING, + (errmsg("replication delay command produced no output"), + errhint("Command should output delay values separated by spaces, one per replica node"))); + } + else if (token_count < num_replicas) { ereport(WARNING, (errmsg("replication delay command returned %d values, expected %d (one per replica, excluding primary)", token_count, num_replicas), - errhint("Command should output one delay value per replica node"))); + errhint("Command should output one delay value per replica node. Missing values will be treated as 0."))); + } + else if (token_count > num_replicas) + { + ereport(WARNING, + (errmsg("replication delay command returned %d values, expected %d (one per replica, excluding primary)", + token_count, num_replicas), + errhint("Command should output exactly one delay value per replica node. Extra values will be ignored."))); } - /* Parse the output - one delay value per replica in order */ - token = strtok_r(line, " \t\n", &saveptr); - replica_idx = 0; + /* Parse the output - one delay value per replica in order */ + token = strtok_r(line, " \t\n", &saveptr); for (int i = 0; i < NUM_BACKENDS && token != NULL; i++) { - if (i == REAL_PRIMARY_NODE_ID) + if (i == primary_node_id) continue; /* Skip primary - it's not in the output */ if (!VALID_BACKEND(i)) { /* Skip invalid backend but consume token */ token = strtok_r(NULL, " \t\n", &saveptr); - replica_idx++; continue; } @@ -900,7 +917,6 @@ check_replication_time_lag_with_cmd(void) i))); /* Keep previous delay value, don't trigger failover */ token = strtok_r(NULL, " \t\n", &saveptr); - replica_idx++; continue; } @@ -933,19 +949,19 @@ check_replication_time_lag_with_cmd(void) { ereport(LOG, (errmsg("Replication of node: %d is behind %.3f second(s) from the primary server (node: %d) [external command]", - i, delay_ms / 1000, REAL_PRIMARY_NODE_ID))); + i, delay_ms / 1000, primary_node_id))); } token = strtok_r(NULL, " \t\n", &saveptr); - replica_idx++; } } PG_CATCH(); { /* Cleanup in case of error */ - alarm(0); /* Cancel any pending alarm */ + /* Cancel timeout: restore signal handler first to avoid race condition */ signal(SIGALRM, SIG_DFL); + alarm(0); if (fp) { pclose(fp); @@ -976,6 +992,7 @@ static char * build_instance_identifier_for_node(int node_id) { BackendInfo *bi = pool_get_node_info(node_id); + const char *hostname; if (!bi || bi->backend_hostname[0] == '\0' || bi->backend_port <= 0) { @@ -983,8 +1000,19 @@ build_instance_identifier_for_node(int node_id) return psprintf("unknown_node_%d", node_id); } + hostname = bi->backend_hostname; + + /* Validate hostname for security - check for shell metacharacters */ + if (strpbrk(hostname, "$`\\|;&<>()[]{}\"\'\n\r\t") != NULL) + { + ereport(LOG, + (errmsg("hostname for node %d contains potentially dangerous characters: %s", + node_id, hostname), + errhint("Hostnames with shell metacharacters may pose security risks when used with external commands. Consider using IP addresses or sanitized hostnames."))); + } + /* Use hostname:port format */ - return psprintf("%s:%d", bi->backend_hostname, bi->backend_port); + return psprintf("%s:%d", hostname, bi->backend_port); } static void -- 2.52.0