diff --git a/src/streaming_replication/pool_worker_child.c b/src/streaming_replication/pool_worker_child.c index 457d0fab0..c509ba5bc 100644 --- a/src/streaming_replication/pool_worker_child.c +++ b/src/streaming_replication/pool_worker_child.c @@ -132,7 +132,7 @@ do_worker_child(void *params) signal(SIGINT, my_signal_handler); signal(SIGHUP, reload_config_handler); signal(SIGQUIT, my_signal_handler); - signal(SIGCHLD, SIG_IGN); + signal(SIGCHLD, my_signal_handler); signal(SIGUSR1, my_signal_handler); signal(SIGUSR2, SIG_IGN); signal(SIGPIPE, SIG_IGN); @@ -262,16 +262,20 @@ do_worker_child(void *params) POOL_NODE_STATUS *node_status; int i; - /* Do replication time lag checking */ - /* Use external command if replication_delay_source_cmd is configured */ - if (pool_config->replication_delay_source_cmd && - strlen(pool_config->replication_delay_source_cmd) > 0) - check_replication_time_lag_with_cmd(); - else - check_replication_time_lag(); + /* Do replication time lag checking */ - /* Check node status */ - node_status = verify_backend_node_status(slots); + /* + * Use external command if replication_delay_source_cmd is + * configured + */ + if (pool_config->replication_delay_source_cmd && + strlen(pool_config->replication_delay_source_cmd) > 0) + check_replication_time_lag_with_cmd(); + else + check_replication_time_lag(); + + /* Check node status */ + node_status = verify_backend_node_status(slots); for (i = 0; i < NUM_BACKENDS; i++) @@ -668,7 +672,7 @@ check_replication_time_lag(void) } #define MAX_CMD_OUTPUT 4096 -#define MAX_REASONABLE_DELAY_MS 3600000.0 /* 1 hour in milliseconds */ +#define MAX_REASONABLE_DELAY_MS 3600000.0 /* 1 hour in milliseconds */ /* * Check replication time lag using external command @@ -680,23 +684,23 @@ check_replication_time_lag(void) static void check_replication_time_lag_with_cmd(void) { - char *command = NULL; - char *line; - char *token; - char *saveptr; - double delay_ms; - uint64 delay; - int token_count = 0; - BackendInfo *bkinfo; + char *command = NULL; + char *line; + char *token; + char *saveptr; + double delay_ms; + uint64 delay; + int token_count = 0; + BackendInfo *bkinfo; ErrorContextCallback callback; - int pipefd[2] = {-1, -1}; - pid_t pid = -1; - int ret; - struct timeval timeout; - fd_set readfds; - ssize_t bytes_read; - int status; - int num_replicas; + int pipefd[2] = {-1, -1}; + pid_t pid = -1; + int ret; + struct timeval timeout; + fd_set readfds; + ssize_t bytes_read; + int status; + int num_replicas; if (NUM_BACKENDS <= 1) { @@ -717,7 +721,7 @@ check_replication_time_lag_with_cmd(void) } /* Capture primary node ID to avoid race conditions during execution */ - int primary_node_id = REAL_PRIMARY_NODE_ID; + int primary_node_id = REAL_PRIMARY_NODE_ID; if (!pool_config->replication_delay_source_cmd || strlen(pool_config->replication_delay_source_cmd) == 0) @@ -746,16 +750,21 @@ check_replication_time_lag_with_cmd(void) PG_TRY(); { const char *base_command = pool_config->replication_delay_source_cmd; - size_t total_len = strlen(base_command) + 1; /* +1 for NUL */ + size_t total_len = strlen(base_command) + 1; /* +1 for NUL */ /* Build command with replica-only arguments (omit primary) */ - /* Calculate total command length including space-separated replica identifiers */ + + /* + * Calculate total command length including space-separated replica + * identifiers + */ for (int i = 0; i < NUM_BACKENDS; i++) { if (i == primary_node_id) - continue; /* Skip primary node */ + continue; /* Skip primary node */ + + char *ident = build_instance_identifier_for_node(i); - char *ident = build_instance_identifier_for_node(i); total_len += 1 /* space */ + strlen(ident); pfree(ident); } @@ -764,13 +773,14 @@ check_replication_time_lag_with_cmd(void) strlcpy(command, base_command, total_len); /* Append replica identifiers */ - size_t current_len = strlen(command); + size_t current_len = strlen(command); + for (int i = 0; i < NUM_BACKENDS; i++) { if (i == primary_node_id) - continue; /* Skip primary node */ + continue; /* Skip primary node */ - char *ident = build_instance_identifier_for_node(i); + char *ident = build_instance_identifier_for_node(i); /* Append space and identifier */ snprintf(command + current_len, total_len - current_len, " %s", ident); @@ -800,16 +810,16 @@ check_replication_time_lag_with_cmd(void) if (pid == 0) { /* Child process */ - close(pipefd[0]); /* Close read end */ + close(pipefd[0]); /* Close read end */ if (dup2(pipefd[1], STDOUT_FILENO) == -1) { fprintf(stderr, "dup2 failed: %s\n", strerror(errno)); exit(1); } - close(pipefd[1]); /* Close write end (duplicated to stdout) */ + close(pipefd[1]); /* Close write end (duplicated to stdout) */ /* Execute command using shell */ - execl("/bin/sh", "sh", "-c", command, (char *)NULL); + execl("/bin/sh", "sh", "-c", command, (char *) NULL); /* If execl fails */ fprintf(stderr, "execl failed: %s\n", strerror(errno)); @@ -817,7 +827,7 @@ check_replication_time_lag_with_cmd(void) } /* Parent process */ - close(pipefd[1]); /* Close write end */ + close(pipefd[1]); /* Close write end */ pipefd[1] = -1; /* Set up timeout for select */ @@ -832,7 +842,8 @@ check_replication_time_lag_with_cmd(void) if (ret == -1) { - int save_errno = errno; + int save_errno = errno; + kill(pid, SIGKILL); waitpid(pid, NULL, 0); pid = -1; @@ -913,11 +924,12 @@ check_replication_time_lag_with_cmd(void) bkinfo->standby_delay_by_time = true; /* Count expected replicas */ - num_replicas = NUM_BACKENDS - 1; /* Total nodes minus primary */ + num_replicas = NUM_BACKENDS - 1; /* Total nodes minus primary */ /* Count tokens in output for validation */ - char *line_copy = pstrdup(line); - char *temp_token = strtok(line_copy, " \t\n"); + char *line_copy = pstrdup(line); + char *temp_token = strtok(line_copy, " \t\n"); + while (temp_token != NULL) { token_count++; @@ -953,7 +965,7 @@ check_replication_time_lag_with_cmd(void) for (int i = 0; i < NUM_BACKENDS && token != NULL; i++) { if (i == primary_node_id) - continue; /* Skip primary - it's not in the output */ + continue; /* Skip primary - it's not in the output */ if (!VALID_BACKEND(i)) { @@ -962,7 +974,8 @@ check_replication_time_lag_with_cmd(void) continue; } - char *endptr; + char *endptr; + delay_ms = strtod(token, &endptr); /* Validate the conversion */ @@ -1002,13 +1015,18 @@ check_replication_time_lag_with_cmd(void) delay_ms, i))); } - /* Convert delay from milliseconds to microseconds for internal storage */ - delay = (uint64)(delay_ms * 1000); + /* + * Convert delay from milliseconds to microseconds for internal + * storage + */ + delay = (uint64) (delay_ms * 1000); bkinfo->standby_delay = delay; bkinfo->standby_delay_by_time = true; /* Log delay if necessary */ - uint64 delay_threshold_by_time = pool_config->delay_threshold_by_time * 1000; /* threshold is in milliseconds, convert to microseconds */ + uint64 delay_threshold_by_time = pool_config->delay_threshold_by_time * 1000; /* threshold is in + * milliseconds, convert + * to microseconds */ if ((pool_config->log_standby_delay == LSD_ALWAYS && delay_ms > 0) || (pool_config->log_standby_delay == LSD_OVER_THRESHOLD && @@ -1026,12 +1044,15 @@ check_replication_time_lag_with_cmd(void) PG_CATCH(); { /* Cleanup in case of error */ - if (pid > 0) { + if (pid > 0) + { kill(pid, SIGKILL); waitpid(pid, NULL, 0); } - if (pipefd[0] != -1) close(pipefd[0]); - if (pipefd[1] != -1) close(pipefd[1]); + if (pipefd[0] != -1) + close(pipefd[0]); + if (pipefd[1] != -1) + close(pipefd[1]); if (line) pfree(line); @@ -1137,6 +1158,9 @@ static RETSIGTYPE my_signal_handler(int sig) restart_request = 1; break; + case SIGCHLD: + break; + default: exit(1); break;