package LRMS_Condor;

use strict;
use warnings;

BEGIN {
    use base 'Exporter';

    # Set the version for version checking.
    our $VERSION = '1.000';

    # Generated by :r !awk '/^sub (lrms|nord|condor_location)/ { print $2 }' % | sort
    our @EXPORT = qw(
        condor_config
        condor_location
        condor_rank
        lrms_get_job_executionnodes
        lrms_get_job_status
        lrms_get_localids
        lrms_get_queued
        lrms_get_queue_status
        lrms_get_total
        nordugrid_authuser_freecpus
        nordugrid_authuser_queuelength
        nordugrid_cluster_cpudistribution
        nordugrid_cluster_lrms_type
        nordugrid_cluster_lrms_version
        nordugrid_cluster_totalcpus
        nordugrid_cluster_usedcpus
        nordugrid_job_lrmscomment
        nordugrid_job_queuerank
        nordugrid_job_reqcput
        nordugrid_job_usedcputime
        nordugrid_job_usedmem
        nordugrid_job_usedwalltime
        nordugrid_queue_defaultcputime
        nordugrid_queue_gridqueued
        nordugrid_queue_gridrunning
        nordugrid_queue_maxcputime
        nordugrid_queue_maxqueuable
        nordugrid_queue_maxrunning
        nordugrid_queue_maxuserrun
        nordugrid_queue_mincputime
        nordugrid_queue_queued
        nordugrid_queue_running
    );
}

#
# Total number of CPUs available for running jobs.
#
sub nordugrid_cluster_totalcpus {
    return (condor_status_total())[1];
}

#
# Number of CPUs that are busy, either in a job or interactive use.
#
sub nordugrid_cluster_usedcpus {
    my @total = condor_status_total();
    return $total[2] + $total[3];
}

#
# CPU distribution string (e.g., '1cpu:5 2cpu:1').
#
sub nordugrid_cluster_cpudistribution {
    # List all machines in the pool.  Machines with multiple CPUs are listed
    # one time for each CPU, with a prefix such as 'vm1@', 'vm2@', etc.
    my ($out,$err,$ret) = condor_run('bin/condor_status -format "%s\n" Name');

    # Count the number of CPUs for all machines in the pool.
    my %machines;
    for (split /\n/, $out) {
        next if /^\s*$/;
        my ($hostname) = $_ =~ /([^@]+)$/;
        $machines{$hostname}++;
    }

    # Count number of machines with one CPU, number with two, etc.
    my %dist;
    for (keys %machines) {
        $dist{$machines{$_}}++;
    }

    # Generate CPU distribution string.
    my $diststr = '';
    for (sort { $a <=> $b } keys %dist) {
        $diststr .= ' ' unless $diststr eq '';
        $diststr .= "${_}cpu:$dist{$_}";
    }

    return $diststr;
}

#
# Text string containing the LRMS type.
#
sub nordugrid_cluster_lrms_type {
    return 'Condor';
}

#
# String containing LRMS version.  ('UNKNOWN' in case of errors.)
#
sub nordugrid_cluster_lrms_version {
    my ($out, $err, $ret) = condor_run('bin/condor_version');
    return 'UNKNOWN' if $ret != 0;
    $out =~ /\$CondorVersion:\s+(\S+)/;
    return $1 || 'UNKNOWN';
}

#
# (Used to compute 'nordugrid-cluster-queuedjobs'.)
# Returns the number of queued jobs (idle and held) in the LRMS pool.  Counts
# both Grid jobs and jobs submitted directly to the LRMS by local users.
#
sub lrms_get_queued {
    my ($out, $err, $ret) = condor_run('bin/condor_q -global');
    return 0 if $ret != 0;
    return condor_get_queued($out);
}

#
# Returns the number of queued jobs (idle and held) in the LRMS pool.  Counts
# both Grid jobs and jobs submitted directly to the LRMS by local users.
# TODO: handle multiple queues!
#
sub nordugrid_queue_queued {
    return lrms_get_queued();
}

#
# Returns the number of queued jobs (idle and held) in the LRMS pool.  Counts
# only Grid jobs.
# TODO: handle multiple queues!
#
sub nordugrid_queue_gridqueued {
    my ($out, $err, $ret) = condor_run('bin/condor_q');
    return 0 if $ret != 0;
    return condor_get_queued($out);
}

#
# Helper function to lrms_get_queued() and lrms_get_gridqueued().
# Takes one argument, the output of a condor_q command.
# Returns the number of queued jobs (idle + held).
#
sub condor_get_queued {
    my $out = shift;
    my $sum = 0;
    $sum += $_ for map { /(\d+) idle.*?(\d+) held/ && $1 + $2 }
                    grep { /^\d+ jobs;/ } split /\n/, $out;
    return $sum;
}

#
# (Used to compute 'nordugrid-cluster-totaljobs'.)
# Returns the number of active jobs (running) in the LRMS pool.  Counts both
# Grid jobs and jobs submitted directly to the LRMS by local users.
#
sub lrms_get_total {
    my ($out, $err, $ret) = condor_run('bin/condor_q -global');
    return 0 if $ret != 0;
    return condor_get_running($out);
}

#
# Takes one argument:  the output from condor_q.
# Counts all running jobs listed in this output.
#
sub condor_get_running {
    my $sum = 0;
    $sum += $_ for $_[0] =~ /^\d+ jobs;.*?(\d+) running,/gm;
    return $sum;
}

#
# (Used to set 'nordugrid-queue-status'.)
# Returns status of the Condor queue (active/inactive).
# TODO: handle multiple queues!
#
sub lrms_get_queue_status {
    my ($out, $err, $ret) = condor_run('bin/condor_status -total');
    return $ret == 0 ? 'active' : 'inactive';
}

#
# Returns a list of the values in the 'Total' line in condor_status -total.
# (0:'Total' 1:Machines 2:Owner 3:Claimed 4:Unclaimed 5:Matched 6:Preempting).
# On errors: (Total 0 0 0 0 0 0).
#
sub condor_status_total {
    my @failval = qw(Total 0 0 0 0 0 0);
    my ($out, $err, $ret) = condor_run('bin/condor_status -total');
    return @failval if $ret != 0;
    $out =~ /^\s*(Total.*)$/m or return @failval;
    return split ' ', $1;
}

#
# Returns total number of CPUs claimed by jobs.  (This is equal to the number
# of running jobs.)
# TODO: handle multiple queues!
#
sub nordugrid_queue_running {
    return lrms_get_total();
}

#
# Returns number of running jobs on Condor that came from the Grid.  Since
# condor_q by default lists only the jobs submitted from the machine where
# condo_q is running, and only the Grid Manager is allowed to submit from
# there, we can easily tell how many jobs belong to the Grid.
#
sub nordugrid_queue_gridrunning {
    my ($out, $err, $ret) = condor_run('bin/condor_q');
    return 0 if $ret != 0;
    return condor_get_running($out);
}

#
# Returns the number of CPUs in the Condor pool, which is always equal to the
# maximum number of running jobs.
#
sub nordugrid_queue_maxrunning {
    return nordugrid_cluster_totalcpus();
}

#
# Returns 4 * maxrunning, which is an arbitrary number.  There is (as far as I
# know) no limit on the number of queued jobs.
#
sub nordugrid_queue_maxqueuable {
    return 4 * nordugrid_queue_maxrunning();
}

#
# Returns the maximum number of jobs that a single user can run at once.
# TODO: I don't know the details as to how Condor handles this.
# TODO: handle multiple queues!
#
sub nordugrid_queue_maxuserrun {
    return nordugrid_queue_maxrunning();
}

#
# Returns ten million minutes (~19 years).  There's no limit on the CPU time in
# Condor.  (TODO: or is it?)
#
sub nordugrid_queue_maxcputime {
    return 10_000_000;
}

#
# Always returns 0, since there's no limit on the CPU time in Condor.
# (TODO: or is it?)
#
sub nordugrid_queue_mincputime {
    return 0;
}

#
# Always returns maxcputime, since there's no limit on the CPU time in Condor.
# (TODO: or is it?)
#
sub nordugrid_queue_defaultcputime {
    return nordugrid_queue_maxcputime();
}

#
# Takes one argument, the LRMS job id as represented in the GM.  (In Condor
# terms, it's <cluster>.condor.  <proc> is not included, since only one job is
# submitted at a time, so <proc> is always zero.)  It's safe to pass arguments
# to condor_q by adding them before the job ID in this argument.
#
# Returns a list of fields:
#
# [0] ID: <cluster>.<proc>
# [1] Owner: <username>
# [2] Submitted: <month>/<day> <hour>:<min>
# [3] Run time: <days>+<hours>:<minutes>:<seconds>
# [4] Status: 'U' | 'H' | 'R' | 'I' | 'C' | 'X'
# [5] Priority: integer in range [-20, 20]; high numbers = high priority.
# [6] Size: virtual image size of executable in MB
# [7] Command line: <command> <arguments>...
#
sub condor_get_job_fields {
    my ($args) = $_[0] =~ /(.*)\.condor/;
    my ($out, $err, $ret) = condor_run("bin/condor_q $args");
    my @fail = ('0.0', 'nobody', '0/0 00:00', '0+0:0:0', '?', 0, 0, '');
    return @fail if $ret != 0;
    $out =~ /.+\n(.+?)$/s;
    my @fields = split ' ', $1;
    return @fail if "$args.0" !~ /(?:^|\s)$fields[0]$/;
    return ($fields[0], $fields[1], "$fields[2] $fields[3]", $fields[4],
            $fields[5], $fields[6], $fields[7],
            join ' ', @fields[8 .. $#fields]);
}

#
# (Used in 'nordugrid-job-status'.)
# Takes two arguments:
# 1. The LRMS job id as represented in the GM.  (In Condor terms,
#    it's <cluster>.condor.  <proc> is not included, since only
#    one job is submitted at a time, so <proc> is always zero.)
# 2. The 'controldir' attribute from nordugrid.conf.
#
# Returns the current status of the job.  U = unexpanded (never been run),
# H = on hold, R = running, Q = idle (waiting for a machine to execute on),
# C = completed, X = removed, and S = suspended.
#
sub lrms_get_job_status {
    my $st = (condor_get_job_fields($_[0]))[4];
    if ($st eq 'R') {
        $st = 'S' if condor_job_suspended(@_);
    } elsif ($st eq 'I') {
        $st = 'Q';
    }
    return $st;
}

#
# Takes one argument, the LRMS job id as represented in the GM.  (In Condor
# terms, it's <cluster>.condor.  <proc> is not included, since only one job is
# submitted at a time, so <proc> is always zero.)
#
# Queue rank is not defined in Condor, only job priority, so I just return n/a
# for now.  (TODO: use priority?)
#
sub nordugrid_job_queuerank {
    return 'n/a';
}

#
# Takes one argument, the LRMS job id as represented in the GM.  (In Condor
# terms, it's <cluster>.condor.  <proc> is not included, since only one job is
# submitted at a time, so <proc> is always zero.)
#
# Returns number of minutes of CPU time the job has consumed, rounded to the
# nearest minute.  (I.e., rounds down for seconds [0, 29] and up for [30, 59].)
#
sub nordugrid_job_usedcputime {
    my $time = (condor_get_job_fields("-cputime $_[0]"))[3];
    my ($days, $hours, $mins, $secs) = $time =~ /(\d+)\+(\d+):(\d+):(\d+)/;
    $mins += ($secs >= 30 ? 1 : 0);
    return $days * 1440 + $hours * 60 + $mins;
}

#
# Takes one argument, the LRMS job id as represented in the GM.  (In Condor
# terms, it's <cluster>.condor.  <proc> is not included, since only one job is
# submitted at a time, so <proc> is always zero.)
#
# Returns number of minutes the job has been running, rounded to the nearest
# minute.  (I.e., rounds down for seconds [0, 29] and up for [30, 59].)
#
sub nordugrid_job_usedwalltime {
    my $time = (condor_get_job_fields($_[0]))[3];
    my ($days, $hours, $mins, $secs) = $time =~ /(\d+)\+(\d+):(\d+):(\d+)/;
    $mins += ($secs >= 30 ? 1 : 0);
    return $days * 1440 + $hours * 60 + $mins;
}

#
# Takes one argument, the LRMS job id as represented in the GM.  (In Condor
# terms, it's <cluster>.condor.  <proc> is not included, since only one job is
# submitted at a time, so <proc> is always zero.)
#
# Returns virtual image size of executable in KB.
#
sub nordugrid_job_usedmem {
    return int(1000 * (condor_get_job_fields($_[0]))[6]);
}

#
# Takes one argument, the LRMS job id as represented in the GM.  (In Condor
# terms, it's <cluster>.condor.  <proc> is not included, since only one job is
# submitted at a time, so <proc> is always zero.)
#
# Returns 'n/a' always, since Condor has no notion of "requested CPU time".
# (TODO: or has it?)
#
sub nordugrid_job_reqcput {
    return 'n/a';
}

#
# (Used in 'nordugrid-job-executionnodes'.)
# Takes one argument, the LRMS job id as represented in the GM.  (In Condor
# terms, it's <cluster>.condor.  <proc> is not included, since only one job is
# submitted at a time, so <proc> is always zero.)
#
# Returns the node the job runs on, or last ran on in case the job is not
# currently running.  Only returns one node, since we don't support MPI
# jobs.
#
sub lrms_get_job_executionnodes {
    my ($cluster) = $_[0] =~ /(.*)\.condor/;
    my ($out, $err, $ret) = condor_run("bin/condor_q -long $cluster");
    my ($host) = $out =~ /^RemoteHost = "(.*)"$/m;
    return $host if $host;
    my ($lasthost) = $out =~ /^LastRemoteHost = "(.*)"$/m;
    return $lasthost if $lasthost;
    return 'UNKNOWN';
}

#
# Takes one argument, the LRMS job id as represented in the GM.  (In Condor
# terms, it's <cluster>.condor.  <proc> is not included, since only one job is
# submitted at a time, so <proc> is always zero.)
#
# Returns no useful comment yet, but I'll improve this in the future.
#
sub nordugrid_job_lrmscomment {
    return '';
}

#
# Returns a list of job IDs corresponding to 'localid' in the GM.
#
sub lrms_get_localids {
    my ($out, $err, $ret) = condor_run("bin/condor_q");
    $out =~ s/.*?CMD[ \t]*\n//s;
    $out =~ s/\n\n.*//s;
    my @localids;
    for (split /\n/, $out) {
        /^\s*(\d+)/;
        push @localids, "$1.condor";
    }
    return @localids;
}

#
# Returns number of free CPUs.
#
sub nordugrid_authuser_freecpus {
    return (nordugrid_cluster_totalcpus() - nordugrid_cluster_usedcpus());
}

#
# Returns number of jobs queued by Grid.
#
sub nordugrid_authuser_queuelength {
    return nordugrid_queue_gridqueued();
}

my %condor_runcache;

#
# Takes one argument, which is a path to an executable (relative to
# CONDOR_LOCATION) that is appended to CONDOR_LOCATION, plus optional
# arguments.  The next time this function is called with exactly the
# same argument, the return value is fetched from %condor_runcache.
#
# Returns a list of three values:
#
# [0] String containing stdout.
# [1] String containing stderr.
# [2] Program exit code ($?) that was returned to the shell.
#
sub condor_run {
    my $condorloc = condor_location();
    if (not -e $ENV{CONDOR_CONFIG}) {
        $ENV{CONDOR_CONFIG} = "$condorloc/etc/condor_config";
    }
    my $program = "$condorloc/$_[0]";
    return @{$condor_runcache{$program}} if $condor_runcache{$program};
    my $stderr_file = "/tmp/condor_run.$$";
    my $stdout = `$program 2>$stderr_file`;
    my $ret = $? >> 8;
    local *ERROR;
    open ERROR, "<$stderr_file"
      or return @{$condor_runcache{$program} = [$stdout, '', $ret]};
    local $/;
    my $stderr = <ERROR>;
    close ERROR;
    unlink $stderr_file;
    return @{$condor_runcache{$program} = [$stdout, $stderr, $ret]};
}

#
# Takes two arguments, the Condor job id and the 'controldir' attribute from
# nordugrid.conf.  This function searches controldir for the grami file that
# belongs to the given Condor job, and extracts the location of the Condor job
# from it.  This log is parsed to see if the job has been suspended.  (condor_q
# reports 'R' for running even when the job is suspended, so we need to parse
# the log to be sure that 'R' actually means running.)
#
# Returns true if the job is suspended, and false if it's running.
#
sub condor_job_suspended {
    my ($localid, $controldir) = @_;
    my $logfile = `. \$(grep -l '^joboption_jobid=$localid\$' \\
                        $controldir/job.*.grami); echo \$condor_log`;
    return 0 if !$logfile;
    chomp $logfile;
    local *LOGFILE;
    open LOGFILE, "<$logfile" or return 0;
    my $suspended = 0;
    while (my $line = <LOGFILE>) {
        $suspended = 1 if $line =~ /Job was suspended\.$/;
        $suspended = 0 if $line =~ /Job was unsuspended\.$/;
    }
    close LOGFILE;
    return $suspended;
}

{
    my ($progdir) = $0 =~ m#(.*)/#;

    # Cached location so that subsequent calls are free.
    my $location;

    sub condor_location {
        return $location if defined $location;

        my $exe;

        # Extract condor_location from nordugrid.conf.
        my $nordugrid_conf = $ENV{NORDUGRID_CONFIG} || '/etc/nordugrid.conf';
        if (-r $nordugrid_conf) {
            $location = `eval "\$(egrep '^[[:blank:]]*condor_location=' \\
                                 $nordugrid_conf)"; echo "\$condor_location"`;
            chomp $location;
            return $location if -x "$location/bin/condor_submit";
        }

        # Search for condor_submit in PATH.
        -x ($exe = "$_/condor_submit") and last for split /:/, $ENV{PATH};
        ($location) = $exe =~ m{(.*)/bin/condor_submit$} if -x $exe;
        return $location if $location;

        # Search for CONDOR_LOCATION in /etc/sysconfig/condor.
        if (-f '/etc/sysconfig/condor') {
            $location = `. /etc/sysconfig/condor; echo -n \$CONDOR_LOCATION`;
            return $location if -x "$location/bin/condor_submit";
        }

        # Use condor_master_location, if installed.
        if (-x "$progdir/condor_master_location") {
            ($location) = `$progdir/condor_master_location` =~ m{(.*)/sbin$};
            return $location if -x "$location/bin/condor_submit";
        }

        return $location = '';
    }

    my $config;

    sub condor_config {
        return $config if defined $config;

        my $nordugrid_conf = $ENV{NORDUGRID_CONFIG} || '/etc/nordugrid.conf';
        if (-r $nordugrid_conf) {
            $config = `eval "\$(egrep '^[[:blank:]]*condor_config=' \\
                               $nordugrid_conf)"; echo "\$condor_config"`;
            chomp $config;
            return $config if -r $config;
        }

        $config = condor_location() . "/etc/condor_config";
        return $config if -r $config;

        $config = $ENV{CONDOR_LOCATION} || '';
        return $config if -r $config;

        return $config = '';
    }

    sub condor_rank {
        my $nordugrid_conf = $ENV{NORDUGRID_CONFIG} || '/etc/nordugrid.conf';
        return undef if !-r $nordugrid_conf;
        my $rank = `eval "\$(egrep '^[[:blank:]]*condor_rank=' \\
                            $nordugrid_conf)"; echo "\$condor_rank"`;
        chomp $rank;
        return $rank;
    }
}

1;
