#!/opt/puppetlabs/puppet/bin/ruby
# frozen_string_literal: true

# rubocop:disable GetText/DecorateFunctionMessage

require 'optparse'
require 'json'
require 'time'
require 'fileutils'

# This script is intended to be run on a puppet infrastructure node as part of the
# puppet_metrics_collector module.  It will generate system/process statistics in the file format
# that other puppet_metrics_collector stats follow.  Over a time period defined as the file
# interval, it will poll the system for data based on the polling interval, then calculate an
# average from the polling data to put in the file as a single data point at the end of the file
# interval.
# sar_metric.pp will set up a cron job to run this, similar to how pe_metric runs tk_metrics
#
# Example execution
# system_metrics --metric_type system_cpu --file_interval 300 --polling_interval 1
#                --metrics_dir /opt/puppetlabs/puppet-metrics-collector
# or
# system_metrics --metric_type system_processes --file_interval 300 --polling_interval 1
#                --metrics_dir /opt/puppetlabs/puppet-metrics-collector
#                --process_expression <custom expression>

# General namespace for SystemMetrics module
module SystemMetrics
  # Main class for collecting system metrics into a json file
  #
  # @author Randell Pelak
  #
  # @attr [integer] polling_interval Time in seconds between calls to poll the system for data.
  # @attr [integer] file_interval Time in seconds between the creation of each output file.
  # @attr [string] metric_type system_cpu|system_memory|system_processes
  # @attr [string] metrics_dir The puppet_metrics_collector output directory.
  # @attr [string] process_expression Expression to pass to egrep that matches processes to track
  # @attr [boolean] verbose Verbose output
  # @attr [boolean] print_output Also print the generated json to standard output
  # @attr [string] hostname Name of the host the metrics are from. In directory name and json file.
  # @attr [obj] time_stamp_obj Time object to use for generating the filename
  #
  class GenerateSystemMetrics
    # Matches a token that consists solely of an optionally signed integer or decimal number.
    # Used to drop non numeric columns such as "Average:", "all", "-" or a command name.
    NUMERIC_VALUE = %r{\A[-+]?[0-9]*\.?[0-9]+\Z}.freeze

    #
    # Initialize class
    #
    # Looks up the hostname by running the `hostname` command and creates the
    # metrics directory when it does not exist yet.
    #
    # @author Randell Pelak
    #
    # @param [integer] polling_interval Time in seconds between calls to poll the system for data.
    # @param [integer] file_interval Time in seconds between the creation of each output file.
    # @param [string] metric_type system_cpu|system_memory|system_processes
    # @param [string] process_expression Expression to pass to egrep that matches processes to track
    # @param [string] metrics_dir The puppet_metrics_collector output directory.
    # @param [boolean] verbose Verbose output
    # @param [boolean] print_output Also print the generated json to standard output
    #
    # @return [void]
    def initialize(polling_interval, file_interval, metric_type, process_expression, metrics_dir,
                   verbose = false, print_output = false)
      @polling_interval = polling_interval
      @file_interval = file_interval
      @metric_type = metric_type
      @process_expression = process_expression
      @metrics_dir = metrics_dir
      @verbose = verbose
      @print_output = print_output

      @hostname = `hostname`.strip
      puts "Hostname is: #{@hostname}" if @verbose
      FileUtils.mkdir_p(@metrics_dir) unless File.directory?(@metrics_dir)
      # The time object is set after the sar run for consistency
      @time_stamp_obj = nil
    end

    # Run sar to collect the raw system data
    #
    # If the command cannot be started, an error file is written and the script exits.
    #
    # @author Randell Pelak
    #
    # @return [string] raw output from sar
    def run_sar
      # sar inputs are polling interval and how many times to poll
      # -r selects the memory report; without it sar prints the cpu report
      comm_flags = ' -r' if @metric_type == 'system_memory'
      comm = "sar #{comm_flags} #{@polling_interval} #{times_to_poll}"
      puts "sar command is: #{comm}" if @verbose
      run_command(comm)
    end

    # Parse the sar output and extract the metrics data
    #
    # When the header line or the "Average:" line cannot be found, an error file
    # is written and the script exits with status 1.
    #
    # @author Randell Pelak
    #
    # @param [string] sar_output raw output from sar
    #
    # @return [hash] The metrics data. Key: column header, Value: average as a float
    def parse_sar_output(sar_output)
      sar_output_arr = sar_output.split(%r{\n+|\r+}).reject(&:empty?).map(&:split)

      # a column name that only shows up in the header row of the report we asked for
      unique_header_str = @metric_type == 'system_memory' ? '%memused' : '%user'
      headers_line = sar_output_arr.find { |e| e.include?(unique_header_str) }
      if headers_line.nil?
        send_error_to_output_file_and_exit(<<-EOF)
        sar output invalid or missing headers. Failed to find line with #{unique_header_str}.
        Full output:
        #{sar_output}
        EOF
      end

      averages_line = sar_output_arr.find { |e| e.include?('Average:') }
      if averages_line.nil?
        send_error_to_output_file_and_exit(<<-EOF)
        sar output missing "Average:"
        Full output:
        #{sar_output}
        EOF
      end

      puts "sar headers and averages:\n#{headers_line.join(',')}\n#{averages_line.join(',')}" if @verbose

      # example of array data
      # 04:59:13,PM,CPU,%user,%nice,%system,%iowait,%steal,%idle
      # Average:,all,0.58,0.00,0.08,0.00,0.00,99.33
      # The two rows differ in length at the front, so pair the columns up starting
      # from the end and then restore the original order.
      data_hash = headers_line.reverse.zip(averages_line.reverse).reverse.to_h
      # remove anything that doesn't have a number for an average like "Average:" or "all"
      data_hash.select! { |_k, v| NUMERIC_VALUE.match?(v) }
      data_hash.transform_values!(&:to_f)
    end

    # Run pidstat to collect process specific data
    #
    # If the command cannot be started, an error file is written and the script exits.
    #
    # @author Randell Pelak
    #
    # @param [hash] processes_to_monitor Key:PID Value:command(spaces allowed)
    #
    # @return [string] raw output from pidstat
    def run_pidstat(processes_to_monitor)
      process_ids = processes_to_monitor.keys.join(',')
      # pidstat inputs are process_ids polling interval and how many times to poll
      comm = "pidstat -Irud -p #{process_ids}  #{@polling_interval} #{times_to_poll}"
      puts "pidstat command is: #{comm}" if @verbose
      run_command(comm)
    end

    # Use ps and grep to get the process id and command of the processes we want to monitor
    #
    # NOTE(review): the process expression is placed between single quotes in a shell
    # command line, so an expression that itself contains a single quote breaks the command.
    #
    # @author Randell Pelak
    #
    # @return [hash] processes and commands to monitor. Key:PID Value:command(spaces allowed)
    def find_processes_to_monitor
      comm = "ps -eo pid,cmd | grep -E '#{@process_expression}'"
      # we don't want to track PMC scripts or the grep command itself
      comm += "| grep -vE 'puppet-metrics-collector|grep|system_metrics'"
      puts "process grep command is: #{comm}" if @verbose
      process_info = run_command(comm)

      processes_to_monitor = {}
      process_info.each_line do |line|
        # use the PID as key and command with args as value
        # string will be of the form:
        # <pid> <command>
        # Where command can and usually does have spaces in it.
        pid, *command_parts = line.split(' ')
        next if pid.nil? # nothing but whitespace on this line

        processes_to_monitor[pid] = command_parts.join(' ')
      end
      processes_to_monitor
    end

    # Parse the pidstat output and extract the metrics data, add in processes information
    #
    # Output to be parsed should look like this (plus some extra above it)
    #
    # Average:      UID       PID    %usr %system  %guest    %CPU   CPU  Command
    # Average:      996     14537    0.00    0.00    0.00    0.00     -  puma
    # Average:      990      2803    0.00    0.00    0.00    0.00     -  java
    #
    # Average:      UID       PID  minflt/s  majflt/s     VSZ    RSS   %MEM  Command
    # Average:      996     14537      0.00      0.00  630672  35560   0.23  puma
    # Average:      990      2803      1.33      0.00 7634052 1150060   7.34  java
    #
    # Average:      UID       PID   kB_rd/s   kB_wr/s kB_ccwr/s  Command
    # Average:      996     14537      0.00      0.00      0.00  puma
    # Average:      990      2803      0.00      1.33      0.00  java
    #
    # @author Randell Pelak
    #
    # @param [string] pidstat_output raw output from pidstat
    # @param [hash] processes_to_monitor Key:PID Value:command(spaces allowed)
    #
    # @return [hash] The metrics data. Key: command identifier, Value: hash of the numeric
    #   averages plus 'command_pidstat', 'command_full' and 'pid'
    def parse_pidstat_output(pidstat_output, processes_to_monitor)
      data_hash = {}
      fields = Array.new(9, 'bogus') # make it clear something went wrong if this ends up in the file
      pidstat_output.split(%r{\n+|\r+}).reject(&:empty?).each do |line|
        if %r{^Average:\s+UID}.match?(line) # find a header row
          # the first three columns are "Average:", "UID" and "PID"
          fields = line.split(' ').drop(3)
          next
        end
        next unless %r{^Average:\s+\d}.match?(line) # find a row with averages data

        value_array = line.split(' ')
        pid = value_array[2]
        line_data = fields.zip(value_array.drop(3)).to_h
        command_pidstat = line_data['Command']
        # remove anything that doesn't have a number for the value like "-" or "puma"
        line_data.select! { |_k, v| NUMERIC_VALUE.match?(v) }
        line_data.transform_values!(&:to_f)
        line_data['command_pidstat'] = command_pidstat
        # every pid shows up once per report section, so collect the sections in one hash
        (data_hash[pid] ||= {}).merge!(line_data)
      end

      # replace key with unique command identifier
      # add the full command string
      # add the pid that was the key as a data item
      # NOTE(review): only 'unknown' gets a counter. Processes that resolve to the same
      # identifier (for example several postgres processes) overwrite one another here.
      unknown_count = 0
      processes_to_monitor.each do |pid, command_full|
        next unless data_hash.key?(pid)

        comm_identifier = determine_relevant_command_name(command_full,
                                                          data_hash[pid]['command_pidstat'])
        if comm_identifier == 'unknown' # needs to be unique so adding a counter
          unknown_count += 1
          comm_identifier = "#{comm_identifier}_#{unknown_count}"
        end
        data_hash[pid].merge!('command_full' => command_full, 'pid' => pid)
        data_hash[comm_identifier] = data_hash.delete(pid)
      end

      data_hash
    end

    # Determine the relevant command name for the process based on command information
    # If not able, return unknown
    #
    # @author Randell Pelak
    #
    # @param [string] command_full full command text from ps
    # @param [string] command_pidstat pidstat short command string
    #
    # @return [string] The service name or unknown
    def determine_relevant_command_name(command_full, command_pidstat)
      case command_pidstat
      when %r{^java$}
        # several services run on java, the full command line tells them apart
        case command_full
        when %r{puppetserver} then 'puppetserver'
        when %r{puppetdb} then 'puppetdb'
        when %r{console-services} then 'console'
        when %r{orchestration-services} then 'orchestrator'
        else 'unknown'
        end
      when %r{^postgres$} then 'postgres'
      when %r{^pxp-agent$} then 'pxp-agent'
      when %r{^puma$}
        case command_full
        when %r{bolt-server} then 'bolt'
        when %r{ace-server}
          %r{ruby}.match?(command_full) ? 'ace-cleanup' : 'ace'
        else 'unknown'
        end
      when %r{^nginx$} then 'nginx'
      else 'unknown'
      end
    end

    # Generate metrics data for the case where there are no processes to monitor on the host
    #
    # @author Randell Pelak
    #
    # @return [hash] The metrics data
    def generate_no_processes_metrics_data
      warning_msg = 'No processes matching the process_expression found on the host to monitor'
      { warning: warning_msg, process_expression: @process_expression }
    end

    # Create the file and put the json data with the error in it
    # then exit with status 1
    #
    # @author Randell Pelak
    #
    # @param [string] error_msg Error message to put in the json
    #
    # @return [void]
    def send_error_to_output_file_and_exit(error_msg)
      # Time object is created after successful sar run
      # so could be nil when an error occurs. But the file needs a name based on a timestamp
      @time_stamp_obj ||= Time.now
      write_metrics_to_file(metrics_to_json(error: error_msg))
      exit 1
    end

    # Create the metrics json
    #
    # @author Randell Pelak
    #
    # @param [hash] metrics_data the data for the metrics section of the json
    #
    # @return [string] json
    def metrics_to_json(metrics_data)
      # the key used in the json is the hostname with the dots replaced by dashes
      hostkey = @hostname.tr('.', '-')
      metrics_json = {
        'timestamp' => @time_stamp_obj.utc.iso8601,
        'servers' => { hostkey => { @metric_type => metrics_data } },
      }
      JSON.generate(metrics_json)
    end

    # Create the metric file and put the json data in it
    #
    # The file is written to <metrics_dir>/<metric_type>/<hostname>/<utc timestamp>.json
    #
    # @author Randell Pelak
    #
    # @param [string] json_dataset data in json format to put in file
    #
    # @return [void]
    def write_metrics_to_file(json_dataset)
      filename = @time_stamp_obj.utc.strftime('%Y%m%dT%H%M%SZ') + '.json'
      dirname = "#{@metrics_dir}/#{@metric_type}/#{@hostname}"
      file_path = "#{dirname}/#{filename}"
      FileUtils.mkdir_p(dirname) unless File.directory?(dirname)
      puts "Creating json file: #{file_path}" if @verbose
      File.write(file_path, json_dataset)
    end

    # Generate the system data
    #
    # @author Randell Pelak
    #
    # @return [hash] metrics data
    def generate_system_metrics
      sar_output = run_sar
      # time stamp generated after sar run to be consistent with
      # pe metrics from puppet metrics collector
      @time_stamp_obj = Time.now

      parse_sar_output(sar_output)
    end

    # Generate the process data
    #
    # @author Randell Pelak
    #
    # @return [hash] metrics data, or a warning hash when no process matched
    def generate_process_metrics
      processes_to_monitor = find_processes_to_monitor

      if processes_to_monitor.empty?
        # sleep to simulate a normal run so that the file names based on timestamp line up
        sleep @polling_interval * times_to_poll
        @time_stamp_obj = Time.now
        return generate_no_processes_metrics_data
      end

      pidstat_output = run_pidstat(processes_to_monitor)
      # time stamp generated after pidstat run to be consistent with
      # pe metrics from puppet metrics collector
      @time_stamp_obj = Time.now

      parse_pidstat_output(pidstat_output, processes_to_monitor)
    end

    # Get the data and create the metric json file
    #
    # @author Randell Pelak
    #
    # @return [void]
    def generate_metrics
      metrics_data = if @metric_type == 'system_processes'
                       generate_process_metrics
                     else
                       generate_system_metrics
                     end

      metrics_json = metrics_to_json(metrics_data)

      $stdout.puts(metrics_json) if @print_output
      write_metrics_to_file(metrics_json)
    end

    private

    # Number of samples that fit in one file interval
    #
    # @return [integer] how many times to poll
    def times_to_poll
      (@file_interval / @polling_interval).round
    end

    # Run a command and return what it printed on standard output
    #
    # Only StandardError is rescued (for example the command not being found), so
    # signals and exit requests are not turned into an error file.
    #
    # @param [string] comm The command line to run
    #
    # @return [string] raw output of the command
    def run_command(comm)
      `#{comm}`
    rescue StandardError => e
      send_error_to_output_file_and_exit([e.class, e.message].join(' '))
    end
  end
end

# Command line entry point: parse the options, validate them and generate one metrics file.
if $PROGRAM_NAME == __FILE__

  VALID_METRIC_TYPES = ['system_cpu', 'system_memory', 'system_processes'].freeze
  FILE_INTERVAL_DEFAULT = 60 * 5
  POLLING_INTERVAL_DEFAULT = 1
  METRIC_TYPE_DEFAULT = 'system_cpu'
  PROCESS_EXPRESSION_DEFAULT = '/opt/puppetlabs.*(java|postgres)|puma.*/etc/puppetlabs/(ace|bolt)'
  METRICS_DIR_DEFAULT = '/opt/puppetlabs/puppet-metrics-collector'

  DESCRIPTION = <<-DESCRIPTION
    This script is intended to be run on a puppet infrastructure node as part of the
    puppet_metrics_collector module.  It will generate system statistics in the file format that
    other puppet_metrics_collector stats follow.  It will poll the system for data at a given
    interval, and then output the average to a file once per given file interval.
  DESCRIPTION

  DEFAULTS = <<-DEFAULTS
    The following default values are used if the options are not specified:
      * polling_interval (-p, --polling_interval): #{POLLING_INTERVAL_DEFAULT}
      * file_interval (-f, --file_interval): #{FILE_INTERVAL_DEFAULT}
      * metric_type (-t, --metric_type): #{METRIC_TYPE_DEFAULT}
      * process_expression (-e, --process_expression): #{PROCESS_EXPRESSION_DEFAULT}
      * metrics_dir (-m, --metrics_dir): #{METRICS_DIR_DEFAULT}
      * verbose (-v, --verbose): False
      * Print to STDOUT (--[no-]print): False
  DEFAULTS

  options = {}

  parser = OptionParser.new do |opts|
    opts.banner = 'Usage: system_metrics [options]'

    opts.on('-h', '--help', 'Display the help text') do
      puts DESCRIPTION
      puts opts
      puts DEFAULTS
      exit
    end

    opts.on('-p', '--polling_interval seconds', Integer,
            'Time in seconds between calls to poll the system for data.') do |interval|
      options[:polling_interval] = interval
    end
    opts.on('-f', '--file_interval seconds', Integer,
            'Time in seconds between the creation of each output file.') do |interval|
      options[:file_interval] = interval
    end
    opts.on('-t', '--metric_type type', String,
            "One of: #{VALID_METRIC_TYPES.join(', ')}") do |type|
      options[:metric_type] = type.downcase
    end
    opts.on('-e', '--process_expression expression_list', String,
            'Expression to pass to egrep that matches processes to track') do |expression_list|
      # Passed on untouched: grep matches case sensitively and changing the case
      # would also change the meaning of escapes such as \S or \W.
      options[:process_expression] = expression_list
    end
    opts.on('-m', '--metrics_dir dir_path', String,
            'The puppet_metrics_collector output directory') do |metrics_dir|
      options[:metrics_dir] = metrics_dir
    end
    # plain switch, it takes no argument
    opts.on('-v', '--verbose', 'Enable Verbose output') { options[:verbose] = true }
    opts.on('--[no-]print', 'Print to STDOUT') { |p| options[:print] = p }
  end
  parser.parse!

  polling_interval = options[:polling_interval] || POLLING_INTERVAL_DEFAULT
  file_interval = options[:file_interval] || FILE_INTERVAL_DEFAULT
  metric_type = options[:metric_type] || METRIC_TYPE_DEFAULT
  process_expression = options[:process_expression] || PROCESS_EXPRESSION_DEFAULT
  metrics_dir = options[:metrics_dir] || METRICS_DIR_DEFAULT
  verbose = options[:verbose] || false
  print_output = options[:print] || false

  unless VALID_METRIC_TYPES.include?(metric_type)
    raise "Invalid metric type #{metric_type}. Must be one of: #{VALID_METRIC_TYPES.join(', ')}."
  end

  # The number of polls is file_interval / polling_interval, so a polling interval
  # of zero would end in a division by zero later on.
  raise 'Polling interval must be greater than zero' unless polling_interval.positive?
  raise 'Polling interval must be less than file interval' unless polling_interval < file_interval

  if verbose
    option_settings = <<-SETTINGS
      The following are the resulting options settings:
        * polling_interval: #{polling_interval}
        * file_interval: #{file_interval}
        * metric_type: #{metric_type}
        * process_expression #{process_expression}
        * metrics_dir: #{metrics_dir}
        * verbose: #{verbose}
        * print: #{print_output}
    SETTINGS
    puts option_settings
  end

  obj = SystemMetrics::GenerateSystemMetrics.new(polling_interval, file_interval, metric_type,
                                                 process_expression, metrics_dir, verbose, print_output)
  obj.generate_metrics
end
