#!/usr/bin/python -u
import os, sys, signal, time, subprocess, logging
from optparse import OptionParser
import common
from autotest_lib.scheduler import babysitter_logging_config
from autotest_lib.client.common_lib import error, global_config, utils
from autotest_lib.client.common_lib import logging_manager
from autotest_lib.scheduler import scheduler_logging_config
from autotest_lib.scheduler import monitor_db

# Seconds to sleep between successive health checks of monitor_db.
PAUSE_LENGTH = 60
# Seconds the scheduler log may stay the same size before monitor_db is
# declared stalled (2 hours).
STALL_TIMEOUT = 2*60*60

# Command line handling: the only option is -r, which requests recovery mode.
parser = OptionParser()
parser.add_option("-r", dest="recover", action="store_true")
options, args = parser.parse_args()

# All paths are derived from the parent of the directory holding this script.
autodir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
results_dir = os.path.join(autodir, 'results')
monitor_db_path = os.path.join(autodir, 'scheduler/monitor_db.py')
# options.recover is None unless -r was given; normalize it to a real bool.
recover = bool(options.recover)

# Set up logging with the babysitter's own logging configuration.
logging_manager.configure_logging(
        babysitter_logging_config.BabysitterLoggingConfig())

# No positional arguments are accepted: print the usage text and exit with
# status 1 if any were supplied.
if args:
    print("Usage: %s [options]" % __file__)
    print("  -r    Run recovery mode.  (Note: recovery is implicit after")
    print("        any crash!)")
    print("")
    sys.exit(1)


def run_banner_output(cmd):
    """Run cmd and return its output underneath a banner naming the command.

    The returned string consists of cmd centered in a 60 column line of
    dashes, a newline, the command's stdout followed by its stderr, and two
    trailing newlines.  The command is run through utils.run with a 30 second
    timeout and its exit status is ignored; if utils.run raises
    error.CmdError the output portion is the text 'Timed out'.

    cmd: the command line to run, as a string.
    """
    try:
        cmd_out = utils.run(cmd, ignore_status=True, timeout=30)
    except error.CmdError:
        command_output = 'Timed out'
    else:
        command_output = cmd_out.stdout + cmd_out.stderr

    # Format in a single step.  The command text must never be used as part
    # of a format string: a '%' inside cmd (e.g. in a database password)
    # would otherwise break or corrupt the second substitution.
    return '%s\n%s\n\n' % (cmd.center(60, '-'), command_output)


def kill_monitor():
    """Stop monitor_db: request a shutdown first, then signal it again.

    SIGINT is sent through utils.signal_program.  If the program is still
    reported alive afterwards, wait 30 seconds and signal it once more using
    the helper's default signal.
    """
    logging.info("Killing monitor_db")
    pid_prefix = monitor_db.PID_FILE_PREFIX

    # Polite request first.
    utils.signal_program(pid_prefix, sig=signal.SIGINT)
    if not utils.program_is_alive(pid_prefix):
        # Already gone, nothing more to do.
        return

    # Still alive: allow it some time to shut down, then kill it.
    time.sleep(30)
    # NOTE(review): the first signal goes through utils.signal_program while
    # this one calls utils.signal_process with the same PID file prefix --
    # confirm that utils.signal_process exists and accepts a prefix.
    utils.signal_process(pid_prefix)


def handle_sigterm(signum, frame):
    """Signal handler: take monitor_db down, drop our PID file and exit.

    signum, frame: standard signal handler arguments; neither is used.
    """
    logging.info('Caught SIGTERM')
    kill_monitor()
    babysitter_prefix = monitor_db.BABYSITTER_PID_FILE_PREFIX
    utils.delete_pid_file_if_exists(babysitter_prefix)
    # Leave with exit status 1.
    raise SystemExit(1)

# Route SIGTERM to handle_sigterm so that terminating the babysitter also
# stops monitor_db and removes the babysitter PID file.
signal.signal(signal.SIGTERM, handle_sigterm)


# Base class for MonitorProc, looked up from the optional site module.
# NOTE(review): presumably utils.import_site_class falls back to the last
# argument (object) when the site module or class is absent -- confirm.
SiteMonitorProc = utils.import_site_class(
    __file__, 'autotest_lib.scheduler.site_monitor_db_babysitter',
    'SiteMonitorProc', object)


class MonitorProc(SiteMonitorProc):
    """One launch of monitor_db together with the state used to watch it.

    Creating an instance kills any monitor_db that is already running and
    prepares the command line.  start() launches the process and
    is_running() reports whether it is still alive and still writing to its
    log file.
    """

    def __init__(self, do_recovery=False):
        """Kill any running monitor_db and prepare a new one for launch.

        do_recovery: when true, '--recover-hosts' is passed to monitor_db.

        Sets self.args (the command line), self.log_path (the scheduler log
        that is_running() watches), self.log_size and self.last_log_change.
        Also exports the log name to the child through the
        AUTOTEST_SCHEDULER_LOG_NAME environment variable.
        """
        args = [monitor_db_path]
        if do_recovery:
            args.append("--recover-hosts")
        args.append(results_dir)

        kill_monitor()
        scheduler_config = scheduler_logging_config.SchedulerLoggingConfig
        log_name = scheduler_config.get_log_name()
        os.environ['AUTOTEST_SCHEDULER_LOG_NAME'] = log_name
        scheduler_log_dir = scheduler_config.get_server_log_dir()
        self.log_path = os.path.join(scheduler_log_dir, log_name)

        # Stall detection state: last observed log size and the time at
        # which it last changed.
        self.log_size = 0
        self.last_log_change = time.time()

        logging.info("STARTING monitor_db with log file %s", self.log_path)
        self.args = args

        # Allow site specific code to run, set environment variables and
        # modify self.args if desired.
        super(MonitorProc, self).__init__()


    def start(self):
        """Launch monitor_db with its stdout discarded; sets self.proc."""
        devnull = open(os.devnull, 'w')
        try:
            self.proc = subprocess.Popen(self.args, stdout=devnull)
        finally:
            # The child holds its own copy of the descriptor.  Closing ours
            # avoids leaking one file descriptor per monitor_db restart.
            devnull.close()


    def is_running(self):
        """Return True if monitor_db is alive and its log is still growing.

        Returns False when the process has exited, or when the size of the
        log file has not changed for more than STALL_TIMEOUT seconds; in the
        latter case diagnostic information is collected first.

        Raises OSError (from os.path.getsize) if the log file cannot be
        stat'ed, e.g. because it does not exist.
        """
        if self.proc.poll() is not None:
            logging.info("monitor_db DIED")
            return False

        old_size = self.log_size
        new_size = os.path.getsize(self.log_path)
        if old_size != new_size:
            logging.info("Log was touched")
            self.log_size = new_size
            self.last_log_change = time.time()
        elif self.last_log_change + STALL_TIMEOUT < time.time():
            logging.info("monitor_db STALLED")
            self.collect_stalled_info()
            return False

        return True


    def collect_stalled_info(self):
        """Write system diagnostics to '<log_path>.stall_info'.

        Runs a fixed list of commands and, when the BACKUP user and password
        are present in the global config, a mysqladmin processlist as well.
        Each command's output is written under a banner by
        run_banner_output.  The file is overwritten on every call.
        """
        commands = ['uptime',
                    'ps auxwww',
                    'iostat -k -x 2 4',
                   ]
        db_cmd = '/usr/bin/mysqladmin --verbose processlist -u%s -p%s'
        config = global_config.global_config
        try:
            user = config.get_config_value("BACKUP", "user")
            password = config.get_config_value("BACKUP", "password")
            db_cmd %= (user, password)
            commands.append(db_cmd)
        except global_config.ConfigError:
            # No database credentials configured: skip the database report.
            pass

        stall_log_path = self.log_path + '.stall_info'
        log = open(stall_log_path, "w")
        try:
            for cmd in commands:
                log.write(run_banner_output(cmd))
        finally:
            # Always release the file, even if a command or write fails.
            log.close()


# ---- Script body: sanity checks, PID file registration, supervision loop.
logging.info("initializing")

# Refuse to run as root.
if os.getuid() == 0:
    logging.critical("running as root, aborting!")
    sys.exit(1)

# Allow only one babysitter: abort if one is reported alive under the
# babysitter PID file prefix, otherwise register through utils.write_pid.
if utils.program_is_alive(monitor_db.BABYSITTER_PID_FILE_PREFIX):
    logging.critical("monitor_db_babysitter already running, aborting!")
    sys.exit(1)
utils.write_pid(monitor_db.BABYSITTER_PID_FILE_PREFIX)

# Supervision loop: launch monitor_db, check it every PAUSE_LENGTH seconds and
# relaunch it as soon as is_running() reports that it died or stalled.
while True:
    proc = MonitorProc(do_recovery=recover)
    proc.start()
    # Wait one interval before the first health check.
    time.sleep(PAUSE_LENGTH)
    while proc.is_running():
        logging.info("Tick")
        time.sleep(PAUSE_LENGTH)
    # The -r request only applies to the first launch; later restarts are
    # always started with do_recovery=False.
    recover = False
