package LRMS_Condor;

use strict;
use warnings;

BEGIN {
    use base 'Exporter';

    # Set the version for version checking.
    our $VERSION = '1.000';

    # This export list can be generated with the following Vim command:
    # :r !awk '/\<sub (lrms|nord|condor_)/ { print "       ", $2 }' % | sort
    our @EXPORT = qw(
        condor_config
        condor_get_job_field
        condor_get_queued
        condor_get_running
        condor_job_suspended
        condor_location
        condor_rank
        condor_run
        condor_status_total
        lrms_get_job_executionnodes
        lrms_get_job_executionnodes_completed
        lrms_get_job_status
        lrms_get_jobinfo_logged
        lrms_get_localids_running
        lrms_get_queued
        lrms_get_total
        nordugrid_authuser_freecpus
        nordugrid_authuser_queuelength
        nordugrid_cluster_cpudistribution
        nordugrid_cluster_lrms_type
        nordugrid_cluster_lrms_version
        nordugrid_cluster_totalcpus
        nordugrid_cluster_usedcpus
        nordugrid_job_lrmscomment
        nordugrid_job_queuerank
        nordugrid_job_reqcput
        nordugrid_job_reqwalltime
        nordugrid_job_usedcputime
        nordugrid_job_usedmem
        nordugrid_job_usedwalltime
        nordugrid_queue_defaultcputime
        nordugrid_queue_gridqueued
        nordugrid_queue_gridrunning
        nordugrid_queue_maxcputime
        nordugrid_queue_maxqueuable
        nordugrid_queue_maxrunning
        nordugrid_queue_maxuserrun
        nordugrid_queue_mincputime
        nordugrid_queue_queued
        nordugrid_queue_running
        nordugrid_queue_status
    );
}

#
# Total number of CPUs available for running jobs.
#
sub nordugrid_cluster_totalcpus {
    return (condor_status_total())[1];
}

#
# Number of CPUs that are busy, either in a job or interactive use.
#
sub nordugrid_cluster_usedcpus {
    my @total = condor_status_total();
    return $total[2] + $total[3];
}

#
# CPU distribution string (e.g., '1cpu:5 2cpu:1').
#
sub nordugrid_cluster_cpudistribution {
    # List all machines in the pool.  Machines with multiple CPUs are listed
    # one time for each CPU, with a prefix such as 'vm1@', 'vm2@', etc.
    my ($out,$err,$ret) = condor_run('bin/condor_status -format "%s\n" Name');

    # Count the number of CPUs for all machines in the pool.
    my %machines;
    for (split /\n/, $out) {
        next if /^\s*$/;
        my ($hostname) = $_ =~ /([^@]+)$/;
        $machines{$hostname}++;
    }

    # Count number of machines with one CPU, number with two, etc.
    my %dist;
    for (keys %machines) {
        $dist{$machines{$_}}++;
    }

    # Generate CPU distribution string.
    my $diststr = '';
    for (sort { $a <=> $b } keys %dist) {
        $diststr .= ' ' unless $diststr eq '';
        $diststr .= "${_}cpu:$dist{$_}";
    }

    return $diststr;
}

#
# Text string containing the LRMS type.
#
sub nordugrid_cluster_lrms_type {
    return 'Condor';
}

#
# String containing LRMS version.  ('UNKNOWN' in case of errors.)
#
sub nordugrid_cluster_lrms_version {
    my ($out, $err, $ret) = condor_run('bin/condor_version');
    return 'UNKNOWN' if $ret != 0;
    $out =~ /\$CondorVersion:\s+(\S+)/;
    return $1 || 'UNKNOWN';
}

#
# (Used to compute 'nordugrid-cluster-queuedjobs'.)
# Returns the number of queued jobs (idle and held) in the LRMS pool.  Counts
# both Grid jobs and jobs submitted directly to the LRMS by local users.
#
sub lrms_get_queued {
    my ($out, $err, $ret) = condor_run('bin/condor_q -global');
    return 0 if $ret != 0;
    return condor_get_queued($out);
}

#
# Returns the number of queued jobs (idle and held) in the LRMS pool.  Counts
# both Grid jobs and jobs submitted directly to the LRMS by local users.
# TODO: handle multiple queues!
#
sub nordugrid_queue_queued {
    return lrms_get_queued();
}

#
# Returns the number of queued jobs (idle and held) in the LRMS pool.  Counts
# only Grid jobs.
# TODO: handle multiple queues!
#
sub nordugrid_queue_gridqueued {
    my ($out, $err, $ret) = condor_run('bin/condor_q');
    return 0 if $ret != 0;
    return condor_get_queued($out);
}

#
# Helper function to lrms_get_queued() and lrms_get_gridqueued().
# Takes one argument, the output of a condor_q command.
# Returns the number of queued jobs (idle + held).
#
sub condor_get_queued {
    my $out = shift;
    my $sum = 0;
    $sum += $_ for map { /(\d+) idle.*?(\d+) held/ && $1 + $2 }
                    grep { /^\d+ jobs;/ } split /\n/, $out;
    return $sum;
}

#
# (Used to compute 'nordugrid-cluster-totaljobs'.)
# Returns the number of active jobs (running) in the LRMS pool.  Counts both
# Grid jobs and jobs submitted directly to the LRMS by local users.
#
sub lrms_get_total {
    my ($out, $err, $ret) = condor_run('bin/condor_q -global');
    return 0 if $ret != 0;
    return condor_get_running($out);
}

#
# Takes one argument:  the output from condor_q.
# Counts all running jobs listed in this output.
#
sub condor_get_running {
    my $sum = 0;
    $sum += $_ for $_[0] =~ /^\d+ jobs;.*?(\d+) running,/gm;
    return $sum;
}

#
# Takes one optional argument:  A string that, if defined, is placed on
# condor_status' command line, just after -total.
#
# Returns a list of the values in the 'Total' line in condor_status -total.
# (0:'Total' 1:Machines 2:Owner 3:Claimed 4:Unclaimed 5:Matched 6:Preempting).
# On errors: (Total 0 0 0 0 0 0).
#
sub condor_status_total {
    my $extra_args = (defined $_[0] ? " $_[0]" : '');
    my @failval = qw(Error 0 0 0 0 0 0);
    my ($out, $err, $ret) = condor_run("bin/condor_status -total$extra_args");
    return @failval if $ret != 0;
    $out =~ /^\s*(Total.*)$/m or return @failval;
    return split ' ', $1;
}

#
# Returns 'inactive' if condor_status fails.
#
sub nordugrid_queue_status {
    my @total = condor_status_total();
    return $total[0] eq 'Error' ? 'inactive' : 'active';
}

#
# Returns total number of CPUs claimed by jobs.  (This is equal to the number
# of running jobs.)
# TODO: handle multiple queues!
#
sub nordugrid_queue_running {
    return lrms_get_total();
}

#
# Returns number of running jobs on Condor that came from the Grid.  Since
# condor_q by default lists only the jobs submitted from the machine where
# condo_q is running, and only the Grid Manager is allowed to submit from
# there, we can easily tell how many jobs belong to the Grid.
#
sub nordugrid_queue_gridrunning {
    my ($out, $err, $ret) = condor_run('bin/condor_q');
    return 0 if $ret != 0;
    return condor_get_running($out);
}

#
# Returns the number of CPUs in the Condor pool, which is always equal to the
# maximum number of running jobs.
#
sub nordugrid_queue_maxrunning {
    return nordugrid_cluster_totalcpus();
}

#
# Returns 2 * maxrunning, which is an arbitrary number.  There is (as far as I
# know) no limit on the number of queued jobs.
#
sub nordugrid_queue_maxqueuable {
    return 2 * nordugrid_queue_maxrunning();
}

#
# Returns the maximum number of jobs that a single user can run at once.
# TODO: I don't know the details as to how Condor handles this.
# TODO: handle multiple queues!
#
sub nordugrid_queue_maxuserrun {
    return nordugrid_queue_maxrunning();
}

#
# There's no limit on the CPU time in Condor, so leave this blank.
#
sub nordugrid_queue_maxcputime {
    return '';
}

#
# There's no limit on the CPU time in Condor, so leave this blank.
#
sub nordugrid_queue_mincputime {
    return '';
}

#
# Always returns maxcputime, since there's no limit on the CPU time in Condor.
#
sub nordugrid_queue_defaultcputime {
    return nordugrid_queue_maxcputime();
}

#
# Takes two arguments:
#
#  1. A Condor ClassAd attribute name.
#  2. A Condor job ID (<ClusterId>.condor).
#
# Returns the value of the attribute named in the first argument for the job
# specified in the second argument.
#
{
    my %jobdata;

    sub condor_get_job_field {
        my $field = $_[0];
        my ($cluster) = $_[1] =~ /(.*)\.condor/;

        if (%jobdata) {
            return $jobdata{$cluster}{$field};
        }

        %jobdata = ();
        my ($out, $err, $ret) = condor_run("bin/condor_q -long");
        for my $jobdata (split /\n\n/, $out) {
            my ($clusterid) = $jobdata =~ /^ClusterId = (.*)$/m;
            for my $line (split /\n/, $jobdata) {
                my ($field, $value) = $line =~ /^(.*?) = (.*)$/m or next;
                $jobdata{$clusterid}{$field} = $value;
            }
        }
        return $jobdata{$cluster}{$field};
    }
}

#
# (Used in 'nordugrid-job-status'.)
# Takes two arguments:
# 1. The LRMS job id as represented in the GM.  (In Condor terms,
#    it's <cluster>.condor.  <proc> is not included, since only
#    one job is submitted at a time, so <proc> is always zero.)
# 2. The 'controldir' attribute from arc.conf.
#
# Returns the current status of the job by mapping Condor's JobStatus
# integer into corresponding one-letter codes used by ARC:
#
#   1 (Idle)       --> Q (job is queuing, waiting for a node, etc.)
#   2 (Running)    --> R (running on a host controlled by the LRMS)
#   2 (Suspended)  --> S (an already running job in a suspended state)
#   3 (Removed)    --> E (finishing in the LRMS)
#   4 (Completed)  --> E (finishing in the LRMS)
#   5 (Held)       --> O (other)
#
# If the job couldn't be found, E is returned since it is probably finished.
#
sub lrms_get_job_status {
    my %num2letter = qw(1 Q 2 R 3 E 4 E 5 O);
    my $s = condor_get_job_field('JobStatus', $_[0]);
    return 'E' if !defined $s;
    $s = $num2letter{$s};
    if ($s eq 'R') {
        $s = 'S' if condor_job_suspended(@_);
    }
    return $s;
}

#
# There's no easy way to define the job's queue "position" in Condor.
#
sub nordugrid_job_queuerank {
    return '';
}

#
# Takes one argument, the LRMS job id as represented in the GM.  (In Condor
# terms, it's <cluster>.condor.  <proc> is not included, since only one job is
# submitted at a time, so <proc> is always zero.)
#
# Returns number of minutes of CPU time the job has consumed, rounded to the
# nearest minute.
#
sub nordugrid_job_usedcputime {
    my $time = condor_get_job_field('RemoteUserCpu', $_[0]);
    return 0 if !defined $time;
    return sprintf "%.0f", $time / 60;
}

#
# Takes one argument, the LRMS job id as represented in the GM.  (In Condor
# terms, it's <cluster>.condor.  <proc> is not included, since only one job is
# submitted at a time, so <proc> is always zero.)
#
# Returns number of minutes the job has been allocated to a machine, rounded to
# the nearest minute.
#
sub nordugrid_job_usedwalltime {
    my $time = condor_get_job_field('RemoteWallClockTime', $_[0]);
    return 0 if !defined $time;
    return sprintf "%.0f", $time / 60;
}

#
# Takes one argument, the LRMS job id as represented in the GM.  (In Condor
# terms, it's <cluster>.condor.  <proc> is not included, since only one job is
# submitted at a time, so <proc> is always zero.)
#
# Returns virtual image size of executable in KB.
#
sub nordugrid_job_usedmem {
    my $size = condor_get_job_field('ImageSize', $_[0]);
    return defined $size ? $size : 0;
}

#
# Condor has no "requested CPU time" attribute.
#
sub nordugrid_job_reqcput {
    return '';
}

#
# Condor has no "requested walltime" attribute.
#
sub nordugrid_job_reqwalltime {
    return '';
}

#
# (Used in 'nordugrid-job-executionnodes'.)
# Takes one argument, the LRMS job id as represented in the GM.  (In Condor
# terms, it's <cluster>.condor.  <proc> is not included, since only one job is
# submitted at a time, so <proc> is always zero.)
#
# Returns the node the job runs on, or last ran on in case the job is not
# currently running.  Only returns one node, since we don't support MPI
# jobs.
#
sub lrms_get_job_executionnodes {
    my $tmp = condor_get_job_field('RemoteHost', $_[0]);
    if ($tmp) {
        my ($host) = $tmp =~ /^"(.+)"$/;
        return $host if $host;
    }
    $tmp = condor_get_job_field('LastRemoteHost', $_[0]);
    if ($tmp) {
        my ($host) = $tmp =~ /^"(.+)"$/;
        return $host if $host;
    }
    return 'UNKNOWN';
}

#
# Like lrms_get_job_executionnodes(), but this version looks at the
# job.ID.errors file to find out where completed jobs ran.
#
sub lrms_get_job_executionnodes_completed {
    my $gmlog = shift;
    local *GMLOG;
    open GMLOG, "<$gmlog" or return '';
    local $/;
    my $logdata = <GMLOG>;
    close GMLOG;
    my ($exechost) = $logdata =~ /.+Job executing on host: <([^:]+)/s;
    return $exechost || '';
}

sub lrms_get_jobinfo_logged {
    my ($jobinfo, $ctldir) = @_;

    my $e1 = '----- starting finish-condor-job -----';
    my $e2 = '.* Job executing on host: <.*>';
    my $e3 = 'Allocation/Run time:.*';
    my $e4 = 'Total Remote CPU Time:.*';
    my $tmp = `egrep -H '($e1|finish-condor-job: ($e2|$e3|$e4))\$' \\
                 $ctldir/job.*.errors`;

    for my $chunk (split /^.*:\Q$e1\E\n/m, $tmp) {
        # Currently only fetching the last host the job executed on.
        my ($exechost) =
          $chunk =~ /.*finish-condor-job:[^\n]+executing on host: <([^:]+)/s;
        next if !$exechost;

        # The GM job id.
        my ($id) = $chunk =~ /job\.([^.]+)\.errors:/;

        # Strings in the form <days> <hours>:<minutes>:<seconds>.
        my ($walltstr) = $chunk =~ m{.*Allocation/Run time:\s+([^\n]+)}s;
        my ($cputstr) = $chunk =~ m{Total Remote CPU Time:\s+([^\n]+)};

        # Convert wallclock time string to minutes.
        my ($d, $h, $m, $s) = $walltstr =~ /(\d+) (\d\d):(\d\d):(\d\d)/;
        {
            no warnings 'uninitialized';
            $m += $d * 24 * 60 + $h * 60 + $s / 60;
            $m = sprintf '%.0f', $m;
        }
        $jobinfo->{$id}{WallTime} = $m;

        # Convert CPU time string to minutes.
        ($d, $h, $m, $s) = $cputstr =~ /(\d+) (\d\d):(\d\d):(\d\d)/;
        {
            no warnings 'uninitialized';
            $m += $d * 24 * 60 + $h * 60 + $s / 60;
            $m = sprintf '%.0f', $m;
        }
        $jobinfo->{$id}{CpuTime} = $m;

        # Execution host.
        $jobinfo->{$id}{exec_host} = $exechost;

        # Required CPU time.
        $jobinfo->{$id}{reqcputime} = '';
    }
}

#
# Takes one argument, the LRMS job id as represented in the GM.  (In Condor
# terms, it's <cluster>.condor.  <proc> is not included, since only one job is
# submitted at a time, so <proc> is always zero.)
#
# Returns no useful comment yet, but I'll improve this in the future.
#
sub nordugrid_job_lrmscomment {
    return '';
}

#
# Returns a list of job IDs corresponding to 'localid' in the GM.
# Only lists currently running jobs.
#
sub lrms_get_localids_running {
    my ($out, $err, $ret) = condor_run("bin/condor_q");
    $out =~ s/.*?CMD[ \t]*\n//s;
    $out =~ s/\n\n.*//s;
    my @localids;
    for (split /\n/, $out) {
        /^\s*(\d+)/;
        push @localids, "$1.condor";
    }
    return @localids;
}

#
# Currently set to the number of free nodes.
#
sub nordugrid_authuser_freecpus {
    return nordugrid_cluster_totalcpus() - nordugrid_cluster_usedcpus();
}

#
# Returns number of jobs queued by Grid.
#
sub nordugrid_authuser_queuelength {
    return nordugrid_queue_gridqueued();
}

my %condor_runcache;

#
# Takes one argument, which is a path to an executable (relative to
# CONDOR_LOCATION) that is appended to CONDOR_LOCATION, plus optional
# arguments.  The next time this function is called with exactly the
# same argument, the return value is fetched from %condor_runcache.
#
# Returns a list of three values:
#
# [0] String containing stdout.
# [1] String containing stderr.
# [2] Program exit code ($?) that was returned to the shell.
#
sub condor_run {
    my $condorloc = condor_location();
    if (not -e $ENV{CONDOR_CONFIG}) {
        $ENV{CONDOR_CONFIG} = "$condorloc/etc/condor_config";
    }
    my $program = "$condorloc/$_[0]";
    return @{$condor_runcache{$program}} if $condor_runcache{$program};
    my $stderr_file = "/tmp/condor_run.$$";
    my $stdout = `$program 2>$stderr_file`;
    my $ret = $? >> 8;
    local *ERROR;
    open ERROR, "<$stderr_file"
      or return @{$condor_runcache{$program} = [$stdout, '', $ret]};
    local $/;
    my $stderr = <ERROR>;
    close ERROR;
    unlink $stderr_file;
    return @{$condor_runcache{$program} = [$stdout, $stderr, $ret]};
}

#
# Takes two arguments, the Condor job id and the 'controldir' attribute from
# arc.conf.  This function searches controldir for the grami file that
# belongs to the given Condor job, and extracts the location of the Condor job
# from it.  This log is parsed to see if the job has been suspended.  (condor_q
# reports 'R' for running even when the job is suspended, so we need to parse
# the log to be sure that 'R' actually means running.)
#
# Returns true if the job is suspended, and false if it's running.
#
{
    my $initialized_condor_log_db = 0;
    my %condor_log_db;

    sub condor_job_suspended {
        my ($localid, $controldir) = @_;

        # The first time condor_job_suspended() is called, the log database
        # must be initialized.
        if (!$initialized_condor_log_db) {
            $initialized_condor_log_db = 1;
            my @out = `egrep -H '^(joboption_jobid|condor_log)=' \\
                         $controldir/job.*.grami`;
            my $i = 0;
            while ($i + 1 < @out) {
                my $joboptline = $out[$i];  # joboption_jobid=...
                my $logline = $out[$i + 1]; # condor_log=...
                my ($grami, $id) =
                  $joboptline =~ /^(.+\.grami):joboption_jobid=(.*)/;
                if (!$grami || $out[$i + 1] !~ /^\Q$grami\E:/) {
                    # Grami didn't have both joboption_jobid and condor_log;
                    # Should not happen, but you never know!
                    $i++;
                    next;
                }
                my ($log) = $logline =~ /^\Q$grami\E:condor_log=(.*)/;
                $condor_log_db{$id} = $log;                
                $i += 2;
            }
        }

        my $logfile = $condor_log_db{$localid};
        return 0 if !$logfile;
        local *LOGFILE;
        open LOGFILE, "<$logfile" or return 0;
        my $suspended = 0;
        while (my $line = <LOGFILE>) {
            $suspended = 1 if $line =~ /Job was suspended\.$/;
            $suspended = 0 if $line =~ /Job was unsuspended\.$/;
        }
        close LOGFILE;
        return $suspended;
    }
}

{
    my ($progdir) = $0 =~ m#(.*)/#;

    # Cached location so that subsequent calls are free.
    my $location;

    # Determine location of arc.conf, using the environment variable
    # ARC_CONFIG, and if that fails, tries /etc/arc.conf.  For backwards
    # compatibility, it also looks for the old environment variable
    # NORDUGRID_CONFIG and the old /etc/nordugrid.conf.
    sub arc_conf {
        for ($ENV{ARC_CONFIG}, '/etc/arc.conf',
             $ENV{NORDUGRID_CONFIG}, '/etc/nordugrid.conf') {
            defined $_ && -r $_ && return $_;
        }
        return undef;
    }

    sub condor_location {
        return $location if defined $location;

        my $exe;

        my $arc_conf = arc_conf();
        if (defined $arc_conf) {
            $location = `eval "\$(egrep '^[[:blank:]]*condor_location=' \\
                                 $arc_conf)"; echo "\$condor_location"`;
            chomp $location;
            return $location if -x "$location/bin/condor_submit";
        }

        # Search for condor_submit in PATH.
        -x ($exe = "$_/condor_submit") and last for split /:/, $ENV{PATH};
        ($location) = $exe =~ m{(.*)/bin/condor_submit$} if -x $exe;
        return $location if $location;

        # Search for CONDOR_LOCATION in /etc/sysconfig/condor.
        if (-f '/etc/sysconfig/condor') {
            $location = `. /etc/sysconfig/condor; echo -n \$CONDOR_LOCATION`;
            return $location if -x "$location/bin/condor_submit";
        }

        # Use condor_master_location, if installed.
        if (-x "$progdir/condor_master_location") {
            ($location) = `$progdir/condor_master_location` =~ m{(.*)/sbin$};
            return $location if -x "$location/bin/condor_submit";
        }

        return $location = '';
    }

    my $config;

    sub condor_config {
        return $config if defined $config;

        my $arc_conf = arc_conf();
        if (defined $arc_conf) {
            $config = `eval "\$(egrep '^[[:blank:]]*condor_config=' \\
                               $arc_conf)"; echo "\$condor_config"`;
            chomp $config;
            return $config if -r $config;
        }

        $config = condor_location() . "/etc/condor_config";
        return $config if -r $config;

        $config = $ENV{CONDOR_LOCATION} || '';
        return $config if -r $config;

        return $config = '';
    }

    sub condor_rank {
        my $arc_conf = arc_conf();
        return undef if !defined $arc_conf;
        my $rank = `eval "\$(egrep '^[[:blank:]]*condor_rank=' \\
                            $arc_conf)"; echo "\$condor_rank"`;
        chomp $rank;
        return $rank;
    }
}

1;
