#!/usr/bin/env bash
set -euo pipefail

# ==============================================================
# Script Name: test_autovacuum_prioritization.sh
# Author: Sami Imseih
#
# Purpose:
#   Automates an OLTP + batch workload test to observe and analyze
#   PostgreSQL autovacuum prioritization behavior.
#
# What it does:
#   - Creates synthetic OLTP and batch tables.
#   - Runs concurrent pgbench workloads (updates + inserts).
#   - Monitors pg_stat_all_tables for autovacuum and analyze activity.
#   - Collects PostgreSQL logs and generates a summary report.
#
# Usage:
#   ./test_autovacuum_prioritization.sh /path/to/config_file
#
# Output:
#   All logs and temporary files are stored under $BASE_DIR/tmp/
#   Key outputs include:
#     - oltp.out / batch.out (pgbench logs)
#     - relstats_monitor.log (autovacuum stats over time)
#     - summary_report.txt (final metrics summary)
#     - pg_logs/ (PostgreSQL logs)
#
# Requirements:
#   - PostgreSQL (psql, pgbench, pg_ctl) available in PATH
#   - Sufficient permissions to restart PostgreSQL
#   - Non-production test environment
#
# --------------------------------------------------------------
# Example Config File (test_autovacuum.conf)
# --------------------------------------------------------------
# BASE_DIR="$HOME/scripts/test_autovacuum_prioritization"
# OLTP_TABLES=10
# OLTP_ROWS_START=1000
# OLTP_ROWS_MULTIPLIER=2
# OLTP_MAX_ROWS=5000000
# OLTP_CONNECTIONS=200
# OLTP_RATE=15000
# BATCH_TABLES=2
# BATCH_SIZE=1000
# BATCH_CONNECTIONS=5
# BATCH_SLEEP=20
# TIMEOUT=1800
# BUCKETS=15
# --------------------------------------------------------------
# Example Run:
#   ./test_autovacuum_prioritization.sh ./test_autovacuum.conf
# ==============================================================

CONFIG_FILE="$1"

if [ -z "$CONFIG_FILE" ] || [ ! -f "$CONFIG_FILE" ] || [ ! -r "$CONFIG_FILE" ]; then
    echo "Error: Config file not found or not readable: '$CONFIG_FILE'"
    echo "Usage: $0 /path/to/config_file"
    exit 1
fi

# Load the config
source "$CONFIG_FILE"

# ==========================================
# Temporary and log files. DO NOT TOUCH!!
# ==========================================
TMP_DIR="$BASE_DIR/tmp"
TMP_BATCH_DIR="$TMP_DIR/batch_workload"
TMP_OLTP_DIR="$TMP_DIR/oltp_workload"

OLTP_LOG="$TMP_DIR/oltp.out"
BATCH_LOG="$TMP_DIR/batch.out"
READY_FILE="$TMP_DIR/oltp_ready"
SUMMARY_FILE="$TMP_DIR/summary_report.txt"
PG_LOGS="$TMP_DIR/pg_logs"
CONFIG_LOG="$TMP_DIR/config"
RELSTATS_MONITOR_LOG="$TMP_DIR/relstats_monitor.log"

# ==========================================
# Prepare temp directory
# ==========================================
if [ -n "$TMP_DIR" ] && [ "$TMP_DIR" != "/" ]; then
    echo "Preparing temp directory at $TMP_DIR..."
    rm -rf "$TMP_DIR"
    mkdir -p "$TMP_DIR" "$PG_LOGS"
else
    echo "Invalid TMP_DIR!"
    exit 1
fi

# ==========================================
# Save the config
# ==========================================
cat > "$CONFIG_LOG" <<EOF
BASE_DIR=$BASE_DIR
OLTP_TABLES=$OLTP_TABLES
OLTP_ROWS_START=$OLTP_ROWS_START
OLTP_ROWS_MULTIPLIER=$OLTP_ROWS_MULTIPLIER
OLTP_MAX_ROWS=$OLTP_MAX_ROWS
BATCH_TABLES=$BATCH_TABLES
BATCH_SIZE=$BATCH_SIZE
BATCH_CONNECTIONS=$BATCH_CONNECTIONS
OLTP_CONNECTIONS=$OLTP_CONNECTIONS
TIMEOUT=$TIMEOUT
OLTP_RATE=$OLTP_RATE
BATCH_SLEEP=$BATCH_SLEEP
BUCKETS=$BUCKETS
EOF

# ==========================================
# OLTP Workload
# ==========================================
run_oltp() {
    local NUM_TABLES="$1"
    local NUM_ROWS="$2"
    local READY_FILE="$3"
    local CONNECTIONS="$4"
    local TIMEOUT="$5"
    local MAX_ROWS="$6"
    local MULTIPLIER="$7"
    local TMP_OLTP_DIR="$8"
    local RATE_OPT=""

    if [ -n "${OLTP_RATE:-}" ]; then
        RATE_OPT="-R${OLTP_RATE}"
        echo "Using OLTP rate limit: ${OLTP_RATE} TPS"
    fi

    echo "=== [OLTP] Creating tables ==="
    local CUR_ROWS="$NUM_ROWS"
    local TABLE_NAMES=()

    for i in $(seq 1 "$NUM_TABLES"); do
        local TABLE="table_${i}_${CUR_ROWS}"
        TABLE_NAMES+=("$TABLE")
        echo "Creating table $TABLE with $CUR_ROWS rows..."
        psql <<EOF
CREATE UNLOGGED TABLE $TABLE (
    id BIGSERIAL PRIMARY KEY,
    update_time timestamp,
    col1 TEXT,
    col2 BIGINT,
    col3 TEXT,
    col4 TEXT
);
CREATE INDEX ON $TABLE (update_time);
EOF

        echo "Populating $TABLE..."
        psql <<EOF
ALTER TABLE $TABLE SET (autovacuum_enabled = OFF);
INSERT INTO $TABLE (update_time, col1, col2, col3, col4)
SELECT
    NULL,
    md5(random()::text),
    (random()*1000000)::BIGINT,
    repeat(' ', 1000),
    repeat(' ', 1000)
FROM generate_series(1, $CUR_ROWS);
CREATE INDEX ON $TABLE (col3);
CREATE INDEX ON $TABLE (col4);
EOF

        CUR_ROWS=$((CUR_ROWS * MULTIPLIER))
        if (( CUR_ROWS > MAX_ROWS )); then
            CUR_ROWS="$NUM_ROWS"
        fi
    done

    echo "=== [OLTP] Vacuuming tables ==="
    for TABLE in "${TABLE_NAMES[@]}"; do
        echo "Vacuuming $TABLE..."
        psql -c "VACUUM FREEZE $TABLE;" \
             -c "ALTER TABLE $TABLE SET (autovacuum_enabled = ON);"
    done

    echo "=== [OLTP] Restarting PostgreSQL ==="

    # Run stop command in background
    pg_ctl stop -mf &
    pgctl_pid=$!

    # Wait for postgres to actually stop
    while pgrep -x postgres >/dev/null; do
        echo "  Waiting for shutdown..."
        sleep 1
    done

    # Optionally wait for the pg_ctl background job to finish
    wait $pgctl_pid

    echo "PostgreSQL has stopped."

    pg_ctl start
    until pg_isready -q; do
        echo "  Waiting for startup..."
        sleep 1
    done
    echo "PostgreSQL restarted."

    touch "$READY_FILE"

    echo "=== [OLTP] Generating per-table pgbench scripts ==="
    mkdir -p "$TMP_OLTP_DIR"

    CUR_ROWS="$NUM_ROWS"
    local PG_FILES=()
    for i in $(seq 1 "$NUM_TABLES"); do
        local TABLE="table_${i}_${CUR_ROWS}"
        local TABLE_FILE="$TMP_OLTP_DIR/${TABLE}.sql"

        echo "Creating pgbench script for ${TABLE} with ${CUR_ROWS} rows..."
        cat > "$TABLE_FILE" <<EOF
\set id random(1, $CUR_ROWS)
UPDATE ${TABLE} SET update_time = now() WHERE id = :id;
EOF
        PG_FILES+=("-f" "$TABLE_FILE@1")

        CUR_ROWS=$((CUR_ROWS * MULTIPLIER))
        if (( CUR_ROWS > MAX_ROWS )); then
            CUR_ROWS="$NUM_ROWS"
        fi
    done

    echo "=== [OLTP] Starting pgbench workload ==="
    pgbench -n "${PG_FILES[@]}" -c"$CONNECTIONS" -T"$TIMEOUT" $RATE_OPT
}

# ==========================================
# Batch Workload
# ==========================================
run_batch() {
    local BATCH_SIZE="$1"
    local TMP_DIR="$2"
    local NUM_TABLES="$3"
    local CONNECTIONS="$4"
    local TIMEOUT="$5"
    local RATE_OPT=""

    if [ -n "${BATCH_RATE:-}" ]; then
        RATE_OPT="-R${BATCH_RATE}"
        echo "Using Batch rate limit: ${BATCH_RATE} TPS"
    fi

    mkdir -p "$TMP_DIR"
    echo "=== [Batch] Creating tables ==="
    for i in $(seq 1 "$NUM_TABLES"); do
        local TABLE="table_batch_$i"
        psql -c "CREATE UNLOGGED TABLE IF NOT EXISTS $TABLE (
                     id BIGINT, id2 BIGINT, id3 BIGINT, id4 BIGINT,
                     c1 TEXT, c2 TEXT
                 );"
        psql -c "CREATE INDEX ON $TABLE (id);"
        psql -c "CREATE INDEX ON $TABLE (id2);"
        psql -c "CREATE INDEX ON $TABLE (c1);"
    done

    echo "=== [Batch] Creating SQL files ==="
    local PG_FILES=()
    for i in $(seq 1 "$NUM_TABLES"); do
        local TABLE="table_batch_$i"
        local PG_FILE="$TMP_DIR/${TABLE}.sql"
        cat > "$PG_FILE" <<EOF
INSERT INTO $TABLE (id, id2, id3, id4, c1, c2)
SELECT n, n, n, n, repeat(' ', 100), repeat(' ', 100)
FROM generate_series(1, $BATCH_SIZE) AS n;
\sleep ${BATCH_SLEEP} s
EOF
        PG_FILES+=("-f" "$PG_FILE@1")
    done

    echo "=== [Batch] Starting pgbench inserts ==="
    pgbench -n "${PG_FILES[@]}" -c"$CONNECTIONS" -T"$TIMEOUT"
}

# ==========================================
# Monitor relstats
# ==========================================
monitor_relstats() {
    local LOG_FILE="$1"
    local TIMEOUT="$2"
    local INTERVAL=10
    local START_TIME=$(date +%s)

    while true; do
        CURRENT_TIME=$(date +%s)
        ELAPSED=$((CURRENT_TIME - START_TIME))
        if (( ELAPSED >= TIMEOUT )); then
            echo "Monitor reached timeout (${ELAPSED}s), stopping." >> "$LOG_FILE"
            break
        fi

        psql -X -q -c "
SELECT 'TAB_DATA',
       now() timestamp,
       a.relname,
       b.reltuples::numeric AS reltuples,
       a.n_dead_tup,
       a.autovacuum_count AS av_count,
       a.autoanalyze_count AS aa_count,
       a.total_autovacuum_time AS total_av_time,
       a.total_autoanalyze_time AS total_aa_time,
       a.n_tup_upd,
       a.n_tup_hot_upd,
       a.n_tup_ins,
       -- Average autovacuum time, safely avoiding divide by zero
       ROUND(
           CASE WHEN a.autovacuum_count > 0
                THEN (a.total_autovacuum_time::numeric / a.autovacuum_count)
           ELSE 0
           END, 2
       ) AS avg_av_time,
       -- Average autoanalyze time, safely avoiding divide by zero
       ROUND(
           CASE WHEN a.autoanalyze_count > 0
                THEN (a.total_autoanalyze_time::numeric / a.autoanalyze_count)
           ELSE 0
           END, 2
       ) AS avg_aa_time
FROM pg_stat_all_tables a
JOIN pg_class b ON a.relname = b.relname
WHERE a.relname LIKE 'table_%'
ORDER BY 4 DESC;
        " >> "$LOG_FILE" 2>&1

        sleep "$INTERVAL"
    done
}

# ==========================================
# Start workloads
# ==========================================
echo "=== Starting OLTP workload in background ==="
(run_oltp "$OLTP_TABLES" "$OLTP_ROWS_START" "$READY_FILE" "$OLTP_CONNECTIONS" "$TIMEOUT" "$OLTP_MAX_ROWS" "$OLTP_ROWS_MULTIPLIER" "$TMP_OLTP_DIR" | tee "$OLTP_LOG") &
OLTP_PID=$!

echo "Waiting for OLTP ready file..."
while [ ! -f "$READY_FILE" ]; do sleep 1; done

if [[ "$BATCH_TABLES" -gt 0 ]]; then
    echo "OLTP ready detected! Starting batch workload..."
    (run_batch "$BATCH_SIZE" "$TMP_BATCH_DIR" "$BATCH_TABLES" "$BATCH_CONNECTIONS" "$TIMEOUT" | tee "$BATCH_LOG") &
    BATCH_PID=$!
else
    BATCH_PID=0
fi

echo "=== Starting Monitoring job in background ==="
(monitor_relstats "$RELSTATS_MONITOR_LOG" "$TIMEOUT") &
MONITOR_PID=$!

# ===========================================================
# Main wait loop to terminate tasks on timeout or completion
# ===========================================================
START_TIME=$(date +%s)
LAST_PRINT=0

while true; do
    CURRENT_TIME=$(date +%s)
    ELAPSED=$((CURRENT_TIME - START_TIME))
    REMAINING=$((TIMEOUT - ELAPSED))

    if (( ELAPSED - LAST_PRINT >= 30 )); then
        echo "Time remaining: ${REMAINING}s"
        LAST_PRINT=$ELAPSED
    fi

    if ! kill -0 "$OLTP_PID" 2>/dev/null; then
        echo "OLTP workload completed — stopping batch..."
        if [ "${BATCH_PID:-0}" -gt 0 ]; then
            kill -TERM "$BATCH_PID" 2>/dev/null || true
            wait "$BATCH_PID" 2>/dev/null || true
        fi
        break
    fi

    if [ "${BATCH_PID:-0}" -gt 0 ]; then
        if ! kill -0 "$BATCH_PID" 2>/dev/null; then
            echo "Batch workload completed — stopping OLTP..."
            kill -TERM "$OLTP_PID" 2>/dev/null || true
            wait "$OLTP_PID" 2>/dev/null || true
            break
        fi
    fi

    if (( ELAPSED >= TIMEOUT )); then
        echo "Timeout reached (${ELAPSED}s) — stopping workloads..."
        if [ "${BATCH_PID:-0}" -gt 0 ]; then
            kill -TERM "$OLTP_PID" "$BATCH_PID" 2>/dev/null || true
            wait "$OLTP_PID" 2>/dev/null || true
            wait "$BATCH_PID" 2>/dev/null || true
        else
            kill -TERM "$OLTP_PID" 2>/dev/null || true
            wait "$OLTP_PID" 2>/dev/null || true
        fi
        break
    fi

    sleep 1
done

echo "Both workloads finished."

echo "Stopping relstats monitor..."
kill -TERM "$MONITOR_PID" 2>/dev/null || true
wait "$MONITOR_PID" 2>/dev/null || true

# ==========================================
# Collect PostgreSQL logs
# ==========================================
cp "$PGDATA"/log/* "$PG_LOGS"/.

# ==========================================
# Summary Report
# ==========================================
{
echo "=== Database Settings ==="
psql <<EOF
SELECT name, setting
FROM pg_settings
WHERE name LIKE '%autovac%'
   OR name IN ('shared_buffers', 'max_connections');
EOF

echo "=== Total Activity ==="
psql <<EOF
\x
SELECT
    SUM(c.n_dead_tup) AS total_n_dead_tup,
    SUM(c.n_mod_since_analyze) AS total_n_mod_since_analyze,
    SUM(b.reltuples::numeric) AS total_reltuples,
    SUM(c.autovacuum_count) AS total_autovacuum_count,
    SUM(c.autoanalyze_count) AS total_autoanalyze_count,
    SUM(c.n_tup_upd) AS total_n_tup_upd,
    SUM(c.n_tup_hot_upd) AS total_n_tup_hot_upd,
    SUM(c.n_tup_newpage_upd) AS total_n_tup_newpage_upd,
    SUM(c.n_tup_ins) AS total_n_tup_ins,
    SUM(c.total_autovacuum_time) AS total_total_autovacuum_time,
    SUM(c.total_autoanalyze_time) AS total_total_autoanalyze_time,
    ROUND(
        CASE WHEN SUM(c.autovacuum_count) > 0
             THEN (SUM(c.total_autovacuum_time)::numeric / SUM(c.autovacuum_count)::numeric)
        END, 2
    ) AS avg_autovacuum_time,
    ROUND(
        CASE WHEN SUM(c.autoanalyze_count) > 0
             THEN (SUM(c.total_autoanalyze_time)::numeric / SUM(c.autoanalyze_count)::numeric)
        END, 2
    ) AS avg_autoanalyze_time,
    COUNT(*) AS table_count
FROM pg_class b
JOIN pg_stat_all_tables c ON b.oid = c.relid
WHERE UPPER(b.relname) LIKE '%TABL%'
  AND b.relname NOT LIKE 'pg_%';
EOF
} | tee "$SUMMARY_FILE"

echo "=== Stop PostgreSQL ==="
pg_ctl stop -mi

echo "==== Move logfiles ===="
mv $PGDATA/log/* "$PG_LOGS"/.;
cat $RELSTATS_MONITOR_LOG | grep TAB_DATA | sed 's/^ TAB_DATA |//' > $RELSTATS_MONITOR_LOG.clean;
cat $PG_LOGS/* | grep "LOG:  adding table:" | awk '{print $2}' FS="LOG:  adding table:" | awk '{print $1"|"$2,$3,$4}' FS="," | sed 's/ av=/|/' | sed 's/ aa=/|/' > $TMP_DIR/relstats_scores.clean

# ======================================================
# Load data into analysis database
#
# XXX: Not used in the main script flow, but kept here
# for reference.
# ======================================================
load_for_analysis()
{
    echo "=== Loading relstats data for analysis ==="

    createdb analysis

    psql analysis<<EOF
    CREATE TABLE per_table_stats (
        timestamp TIMESTAMPTZ,
        relname TEXT,
        reltuples BIGINT,
        n_dead_tup BIGINT,
        av_count INT,
        aa_count INT,
        total_av_time DOUBLE PRECISION,
        total_aa_time DOUBLE PRECISION,
        n_tup_upd BIGINT,
	n_tup_hot_upd BIGINT,
	n_tup_ins BIGINT,
        avg_av_time DOUBLE PRECISION,
        avg_aa_time DOUBLE PRECISION
);
EOF
    psql -d analysis<<EOF
    \copy per_table_stats FROM '$RELSTATS_MONITOR_LOG.clean' DELIMITER '|' CSV;
EOF
    echo "=== Loading relstats scoring data for analysis ==="
    psql analysis<<EOF
    CREATE TABLE per_table_scores (
        relname TEXT,
        score DOUBLE PRECISION,
        reason_av INT,
        reason_aa INT
    );
EOF
    psql -d analysis<<EOF
\copy per_table_scores FROM '$TMP_DIR/relstats_scores.clean' DELIMITER '|' CSV;
EOF
}

echo "All tasks completed."
