#!/opt/puppetlabs/puppet/bin/ruby

require_relative '../../ruby_task_helper/files/task_helper.rb'
require 'json'
require 'open3'

# Task that verifies the health of a PE replica.
#
# Success requires both of the following:
#   * every database returned by #get_databases reports a pglogical
#     subscription status of `replicating`, and
#   * every replicated service returned by `puppet-infra status` is in state
#     `running`, has replication status `running`, and carries no alerts of
#     severity `error` or `warning`.
#
# The check is repeated until it passes or until `timeout` seconds have
# elapsed; it is always performed at least once.
class CheckPglogicalReplicaStatus < TaskHelper
  # Prefix used for the `kind` of every TaskHelper::Error raised by this task.
  ERROR_KIND_PREFIX = 'puppetlabs.check-pglogical-replica-status'.freeze

  # Maximum number of seconds to sleep between two consecutive checks.
  POLL_INTERVAL = 10

  # Databases that are checked regardless of the PE version.
  BASE_DATABASES = [
    'pe-activity',
    'pe-classifier',
    'pe-orchestrator',
    'pe-inventory',
    'pe-rbac',
    'pe-hac',
  ].freeze

  # Databases that are only checked from a given PE version onwards
  # (database name => first PE version for which it is checked).
  VERSIONED_DATABASES = {
    'pe-patching' => '2025.0.0',
    'pe-infra-assistant' => '2025.2.0',
    'pe-workflow' => '2025.5.0',
  }.freeze

  # Services whose replication status is inspected.
  # The orchestrator is not running on a replica
  REPLICATED_SERVICES = [
    'activity-service',
    'classifier-service',
    'rbac-service',
    'puppetdb-service',
  ].freeze

  # Alert severities that mark a service as unhealthy.
  BAD_ALERT_SEVERITIES = ['error', 'warning'].freeze

  # Runs a command and returns its stripped standard output.
  #
  # @param command [String, Array<String>] command line; an Array is passed to
  #   Open3.capture3 as separate arguments, a String as a single argument.
  # @param error [String] message used for the raised error.
  # @param errcode [String] suffix appended to the error kind.
  # @return [String] stdout with surrounding whitespace removed.
  # @raise [TaskHelper::Error] if the command cannot be started, exits
  #   unsuccessfully, or prints nothing but whitespace on stdout.
  def grab(command, error, errcode)
    cmd = Array(command)

    begin
      stdout, stderr, status = Open3.capture3(*cmd)
    rescue SystemCallError => e
      # The command could not even be spawned (e.g. missing executable).
      # Report it with the caller's message and error code instead of letting
      # a bare Errno::* escape.
      raise command_error(error, errcode, cmd, exitcode: nil, stdout: '', stderr: e.message)
    end

    result = stdout.strip
    return result if status.success? && !result.empty?

    raise command_error(error, errcode, cmd, exitcode: status.exitstatus, stdout: stdout, stderr: stderr)
  end

  # Lists the databases whose replication should be checked.
  #
  # @param pe_version [String] PE version, parsed with Gem::Version.
  # @return [Array<String>] a new array: the base databases followed by every
  #   versioned database whose minimum version is <= pe_version.
  def get_databases(pe_version)
    installed = Gem::Version.new(pe_version)
    extras = VERSIONED_DATABASES.select { |_db, since| installed >= Gem::Version.new(since) }.keys
    BASE_DATABASES + extras
  end

  # Queries the pglogical subscription status of every database via psql.
  #
  # @param certname [String] host to connect to.
  # @param pg_version [String] PostgreSQL version, used to locate the client
  #   certificate directory.
  # @param pe_version [String] PE version, used to select the databases.
  # @return [Hash{String => String}] database name => stripped psql output.
  # @raise [TaskHelper::Error] if any lookup fails or returns no output.
  def get_db_statuses(certname, pg_version, pe_version)
    pg_certs_dir = "/opt/puppetlabs/server/data/postgresql/#{pg_version}/data/certs"

    get_databases(pe_version).each_with_object({}) do |dbname, statuses|
      connection_string = [
        "dbname=#{dbname}",
        "host=#{certname}",
        'port=5432',
        "sslcert=#{pg_certs_dir}/_local.cert.pem",
        "sslkey=#{pg_certs_dir}/_local.private_key.pem",
        'sslrootcert=/etc/puppetlabs/puppet/ssl/certs/ca.pem',
        'user=pe-ha-replication',
        'sslmode=verify-full',
      ]
      command = [
        '/opt/puppetlabs/server/bin/psql',
        '-tq',
        connection_string.join(' '),
        '-c',
        'select status from pglogical.show_subscription_status();',
      ]
      statuses[dbname] = grab(
        command,
        "Failed to lookup replications status for #{dbname}",
        'pgl-status-lookup-failure',
      )
    end
  end

  # Fetches `puppet-infra status` for the host and keeps only the entries
  # belonging to REPLICATED_SERVICES.
  #
  # @param certname [String] host passed to `--host`.
  # @return [Array<Hash>] status hashes of the replicated services.
  # @raise [TaskHelper::Error] if the command fails or returns no output.
  def get_service_statuses(certname)
    command = [
      '/opt/puppetlabs/bin/puppet-infra',
      'status',
      '--format',
      'json',
      '--host',
      certname,
    ]
    infra_status = JSON.parse(grab(
      command,
      'Failed calling `puppet-infra status`',
      'puppet-infra-status-failure',
    ))

    infra_status.select { |h| REPLICATED_SERVICES.include?(h['service']) }
  end

  # Example of a status hash in a failed state:
  #
  #  {
  #    "service": "rbac-service",
  #    "state": "running",
  #    "status": {
  #      "db_up": true,
  #      "db_pool": {
  #        "state": "ready"
  #      },
  #      "activity_up": true,
  #      "replication": {
  #        "mode": "replica",
  #        "status": "down"
  #      }
  #    },
  #    "display_name": "RBAC",
  #    "server": "kl4beednc7uprav.delivery.puppetlabs.net",
  #    "url": "https://kl4beednc7uprav.delivery.puppetlabs.net:4433/rbac-api",
  #    "type": "rbac",
  #    "alerts": [
  #      {
  #        "severity": "error",
  #        "message": "Database replication for RBAC is currently down."
  #      }
  #    ]
  #  },
  #
  # Decides whether every given service is healthy and replicating.
  #
  # @param services [Array<Hash>] status hashes as shown above.
  # @return [Boolean] true only if the list is non-empty and every service is
  #   in state `running`, has replication status `running`, and has no alert
  #   of severity `error` or `warning`.
  def all_replicating_services_good?(services)
    # An empty list means no replicated service was reported at all; treating
    # that as "all good" would be a vacuous success.
    return false if services.empty?

    services.all? do |s|
      replication_status = (s['status'] || {})['replication'] || {}
      alerts = s['alerts'] || []

      s['state'] == 'running' &&
        replication_status['status'] == 'running' &&
        alerts.none? { |a| BAD_ALERT_SEVERITIES.include?(a['severity']) }
    end
  end

  # Task entry point.
  #
  # @param certname [String, nil] replica certname; looked up with
  #   `puppet config print certname` when nil.
  # @param pg_version [String, nil] PostgreSQL version; looked up with facter
  #   when nil.
  # @param pe_version [String, nil] PE version; looked up with facter when nil.
  # @param timeout [Numeric, nil] seconds to keep retrying a failing check;
  #   0 or nil performs exactly one check.
  # @return [String] JSON document describing the successful check.
  # @raise [TaskHelper::Error] if a lookup command fails, or if the check has
  #   not passed once the timeout is exhausted.
  def task(certname: nil, pg_version: nil, pe_version: nil, timeout: 0, **_kwargs)
    certname ||= grab(
      '/opt/puppetlabs/bin/puppet config print certname',
      'Failed to determine replica certname',
      'config-print-failed',
    )
    pg_version ||= grab(
      '/opt/puppetlabs/bin/facter --no-external-facts -p pe_postgresql_info.installed_server_version',
      'Failed to get postgresql version.',
      'pg-version-check-failed',
    )
    pe_version ||= grab(
      '/opt/puppetlabs/bin/facter --no-external-facts -p pe_server_version',
      'Failed to get PE version.',
      'pe-version-check-failed',
    )
    timeout ||= 0

    db_replication_statuses = {}
    all_db_replicating = false
    service_statuses = {}
    all_replicating_services_good = false
    success = false

    # A monotonic deadline keeps the timeout immune to wall-clock adjustments.
    deadline = monotonic_now + timeout

    loop do
      # check subscription status
      db_replication_statuses = get_db_statuses(certname, pg_version, pe_version)
      all_db_replicating = db_replication_statuses.values.all? { |v| v == 'replicating' }

      # check puppet infra status
      service_statuses = get_service_statuses(certname)
      all_replicating_services_good = all_replicating_services_good?(service_statuses)

      success = all_replicating_services_good && all_db_replicating
      break if success

      # Retry only while time is left, and never sleep past the deadline.
      remaining = deadline - monotonic_now
      break unless remaining > 0

      sleep([remaining, POLL_INTERVAL].min)
    end

    result = {
      success: success,
      all_db_replicating: all_db_replicating,
      database_replication: db_replication_statuses,
      all_replicating_services_good: all_replicating_services_good,
    }

    unless success
      result[:pe_service_statuses] = service_statuses
      result[:timeout] = timeout
      raise TaskHelper::Error.new(
        'One or more PE services failed status check for service state or logical database replication.',
        "#{ERROR_KIND_PREFIX}/failed-status-check",
        result,
      )
    end

    result.to_json
  end

  private

  # Builds the error raised when a command run by #grab fails.
  def command_error(error, errcode, cmd, exitcode:, stdout:, stderr:)
    TaskHelper::Error.new(
      error,
      "#{ERROR_KIND_PREFIX}/#{errcode}",
      {
        command: cmd.join(' '),
        exitcode: exitcode,
        stdout: stdout,
        stderr: stderr,
      },
    )
  end

  # Current reading of the monotonic clock, in seconds.
  def monotonic_now
    Process.clock_gettime(Process::CLOCK_MONOTONIC)
  end
end

# Run the task only when this file is executed directly as a script.
if $PROGRAM_NAME == __FILE__
  CheckPglogicalReplicaStatus.run
end
