package SGE;

use File::Basename;
use lib dirname($0);
@ISA = ('Exporter');
@EXPORT_OK = ('cluster_info',
	      'queue_info',
	      'jobs_info',
	      'users_info');
use LogUtils ( 'start_logging', 'error', 'warning', 'debug' ); 
use strict;

# Changelog:
#   
#    2005-10 Juha
#        Major rewrite
#

##########################################
# Saved private variables
##########################################

our (%lrms_queue);

##########################################
# Private subs
##########################################

sub sge_env ($) {
    my ($path) = shift;
    my ($sge_root,$sge_arch) = split '/bin/', $path;
    if ( not $ENV{SGE_ROOT} ) {
#	debug("Environment variable SGE_ROOT is not set.");
#	debug("Cron jobs often have impoverished set of environment variables.");
	$ENV{SGE_ROOT} = $sge_root;
#	warning("Making a guess: SGE_ROOT=$ENV{SGE_ROOT}");
    }
    if ( not $ENV{SGE_ARCH} ) {
#	debug("Environment variable SGE_ARCH is not set.");
#	debug("Cron jobs often have impoverished set of environment variables.");
	$ENV{SGE_ARCH} = $sge_arch;
#	warning("Making a guess: SGE_ARCH=$ENV{SGE_ARCH}");
    }
}

sub type_and_version ($) {
    my ($path) = shift;

    my ($type, $version);
    my ($command) = "$path/qstat -help";

    unless ( open QSTAT, "$command 2>/dev/null |" ) {
	error("$command failed.");}
    my ($line) = <QSTAT>;
    ($type, $version) = split " ", $line;
    close QSTAT;
    unless ( ( $type =~ /GE/ ) and ( $version =~ /^6/ ) ) {
	warning("Unsupported LRMS type or version: $type $version.");
    }
    return $type, $version;
}

sub queues ($) {
    my ($path) = shift;
    my (@queue, $line);
    my ($command) = "$path/qconf -sql";
    unless ( open QCONF, "$command 2>/dev/null |" ) {
	error("$command failed.");}
    while ( $line = <QCONF> ) {
	chomp $line;
	push @queue, $line;
    }
    close QCONF;
    return @queue;
}

sub cpudist (@) {
    my (@cpuarray) = @_;
    my (%cpuhash) = ();
    my ($cpudistribution) ="";
    while ( @cpuarray ) {
	$cpuhash{ pop(@cpuarray) }++; }
    while ( my ($cpus,$count)  = each %cpuhash ) {
	if ( $cpus > 0 ) { 
	    $cpudistribution .=  $cpus . 'cpu:' . $count . " ";
	}
    }
    chop $cpudistribution;
    return $cpudistribution;
}

sub slots ($) {
    my ($path) = shift;
    my ($totalcpus, $distr, $usedslots, $queued);
    my ($foo, $line, @slotarray);

    # Number of slots in execution hosts

    my ($command) = "$path/qconf -sep";

    unless ( open QQ, "$command 2>/dev/null |" ) {
	error("$command failed.");
    }
    while ( $line = <QQ> ) {
	if ( $line =~ /^HOST/ || $line =~ /^=+/ ) { next; }
	if ( $line =~ /^SUM\s+(\d+)/ ) {
	    $totalcpus = $1;
	    next;
	}
	my ($name, $ncpu, $arch ) = split " ", $line;
	push @slotarray, $ncpu;
    }
    close QQ;

    $distr = cpudist(@slotarray);

    # Used slots in all queues
    $command = "$path/qstat -g c";
    unless ( open QSTAT, "$command 2>/dev/null |" ) {
	error("$command failed.");}
    $usedslots = 0;
    while ( $line = <QSTAT> ) {
	if ( $line =~ /^CLUSTER QUEUE/ || $line =~ /^-+/) { next; }
	my ($name, $cqload, $used, $avail, $total, $aoACDS, $cdsuE )
	    = split " ", $line;
	$usedslots += $used;
    }
    close QSTAT;

    # Pending (queued) jobs

    $command = "$path/qstat -s p";
    unless ( open QSTAT, "$command 2>/dev/null |" ) {
	error("$command failed.");}
    $queued = 0;
    while ( $line = <QSTAT> ) {
	if ( ! $line =~ /^\s*\d+\s+/ ) { next; }
	my ( @tmp ) = split " ", $line;
	$queued += $tmp[-1];
    }
    close QSTAT;
    return ($totalcpus, $distr, $usedslots, $queued);
}

sub req_limits ($) {
    my ($s) = shift;
    my ($reqcputime, $reqwalltime);

    # required cputime 
    if ($s =~ /h_cpu=(\d+)/ ) { # it's in seconds?
	$reqcputime=$1/60;
    } elsif ($s =~ /s_cpu=(\d+)/ ) { # it's in seconds?
	$reqcputime=$1/60;
    } else {
	$reqcputime="";
    }

    # required walltime 
    if ($s =~ /h_rt=(\d+)/ ) { # it's in seconds?
	$reqwalltime=$1/60;
    } elsif ($s =~ /s_rt=(\d+)/ ) { # it's in seconds?
	$reqwalltime=$1/60;
    } else {
	$reqwalltime="";
    }

    return ($reqcputime, $reqwalltime);
}

############################################
# Public subs
#############################################

sub cluster_info ($) {
    my ($path) = shift;

    my (%lrms_cluster);

    sge_env( $path );

    # Figure out SGE type and version

    ( $lrms_cluster{lrms_type}, $lrms_cluster{lrms_version} )
	= type_and_version( $path );

    # Count used/free CPUs and queued jobs in the cluster
    
    # Note: SGE has the concept of "slots", which roughly corresponds to
    # concept of "cpus" in ARC (PBS) LRMS interface.

    ( $lrms_cluster{totalcpus},
      $lrms_cluster{cpudistribution},
      $lrms_cluster{usedcpus},
      $lrms_cluster{queuedcpus})
	= slots ( $path );

    # List LRMS queues
    
    @{$lrms_cluster{queue}} = queues( $path );

    return %lrms_cluster;
}

sub queue_info ($$) {
    my ($path) = shift;
    my ($qname) = shift;

    # status
    # running
    # totalcpus

    my ($command) = "$path/qstat -g c";
    unless ( open QSTAT, "$command 2> /dev/null |" ) {
	error("$command failed.");}
    my ($line, $used);
    while ( $line = <QSTAT> ) {
	if ( $line =~ /^(CLUSTER QUEUE)/  || $line =~ /^-+$/ ) {next}
	my (@a) = split " ", $line;
	if ( $a[0] eq $qname ) {
	    $lrms_queue{status} = $a[3];
	    $lrms_queue{running} = $a[2];
	    $lrms_queue{totalcpus} = $a[4];
	    last;
	}
    }
    close QSTAT;

    # Number of available (free) cpus can not be larger that
    # free cpus in the whole cluster
    my ($totalcpus, $distr, $usedslots, $queued) = slots ( $path );
    if ( $lrms_queue{status} > $totalcpus-$usedslots ) {
	$lrms_queue{status} = $totalcpus-$usedslots;
    }
    # reserve negative numbers for error states
    if ($lrms_queue{status}<0) {
	warning("lrms_queue{status} = $lrms_queue{status}")}

    # maxcputime

    # Grid Engine has hard and soft limits for both CPU time and
    # wall clock time. Nordugrid schema only has CPU time.

    $command = "$path/qconf -sq $qname";
    unless ( open QCONF, "$command 2>/dev/null |" ) {
	error("$command failed.");}
    my ($timestr);
    while ( $line = <QCONF> ) {
	if ( $line =~ /^h_rt\s+(.*)/ && ( ! $lrms_queue{maxcputime} ) ) {
	    $timestr = $1;
	}
	if ( $line =~ /^s_rt\s+(.*)/) {
	    $timestr=$1;
	    last;
	}
    }
    close QCONF;
    if ($timestr eq 'INFINITY') {
	$lrms_queue{maxcputime} = 43200;
    } else {
	my (@a) = split ":", $timestr;
	if (@a == 3) {
	    $lrms_queue{maxcputime} = 60*$a[0]+$a[1]+int($a[2]);
	} elsif (@a == 4) {
	    $lrms_queue{maxcputime} = 24*60*$a[0]+60*$a[1]+$a[2]+int($a[3]);
	} else {
	    warning("Could not determine maxcputime for queue $qname.");
	    $lrms_queue{maxcputime} = "";
	}
    }


    # queued

    # Grid Engine puts queueing jobs in single "PENDING" state pool,
    # so here we report the total number queueing jobs in the cluster.
    # NOTE: We also violate documentation and report number
    #       or queueing slots instead of jobs

    $lrms_queue{queued} = $queued;
      
    # Is there limit for these in Grid Engine?
    # nordugrid-queue-maxrunning
    # nordugrid-queue-maxqueuable
    # nordugrid-queue-maxuserrun
    # nordugrid-queue-mincputime
    # nordugrid-queue-defaultcputime
    $lrms_queue{maxrunning} = "";
    $lrms_queue{maxqueuable} = "";
    $lrms_queue{maxuserrun} = "";
    $lrms_queue{mincputime} = "";
    $lrms_queue{defaultcput} = "";

    return %lrms_queue;
}


sub jobs_info ($$@) {
    my ($path) = shift;
    my ($qname) = shift;
    my ($jids) = shift;

    my (%lrms_jobs);

    # lrms_jobs{$id}{status}
    # lrms_jobs{$id}{node}

    # Running jobs
    my ($command) = "$path/qstat -s r -q $qname";
    unless ( open QSTAT, "$command 2>/dev/null |" ) {
	error("$command failed.");}
    my ($line);
    while ( $line = <QSTAT>) {
	# assume that all lines beginning with an integer are describing jobs
	if ( ! $line =~ /^\s*\d+\s+/ ) {next}
	my ($id, $prior, $name, $user, $state, $submitstart,
	    $at, $queue, $slots ) = split " ", $line;
	if (grep /^$id$/, @{$jids}) {
	    $lrms_jobs{$id}{status} = 'R';
	    my ($cluster_queue, $exec_host) = split '@', $queue;
	    $lrms_jobs{$id}{node} = $exec_host; # master node for parellel runs
	    # used walltime could be calculated from $submitstart
	    # field...
	    $lrms_jobs{$id}{walltime} = "";
	    $lrms_jobs{$id}{rank} = "";
	}
    }
    close QSTAT;


    # lrms_jobs{$id}{status}
    # lrms_jobs{$id}{rank}

    # Pending (queued) jobs
    # NOTE: Counting rank based on all queues.
    $command = "$path/qstat -s p";
    unless ( open QSTAT, "$command 2>/dev/null |" ) {
	error("$command failed.");}
    my ($rank) = 0;
    while ( $line = <QSTAT> ) {
	# assume that all lines beginning with an integer are describing jobs
	if ( ( $line =~ /^job-ID/) || ( $line =~ /^-+/) ) {next}
	$rank++;
	my ($id, $prior, $name, $user, $state, $submitstart,
	    $at, $queue, $slots ) = split " ", $line;
	if (grep { $id == $_ } @$jids) {
	    $lrms_jobs{$id}{rank} = $rank;
	    if ( $state =~ /E/ ) {
		$lrms_jobs{$id}{status} = 'O';
	    } else { # Normally queued
		$lrms_jobs{$id}{status} = 'Q';
	    }
	}
    }
    close QSTAT;
    
    # lrms_jobs{$id}{mem}
    # lrms_jobs{$id}{walltime}
    # lrms_jobs{$id}{cputime}
    # lrms_jobs{$id}{reqwalltime}
    # lrms_jobs{$id}{reqcputime}
    # lrms_jobs{$id}{comment}

    my (@running, @queueing, @otherlrms, @notinlrms);
    foreach my $jid ( @{$jids} ) {
	if ($lrms_jobs{$jid}{status} eq 'R') {
	    push @running, $jid;
	} elsif ($lrms_jobs{$jid}{status} eq 'Q') {
	    push @queueing, $jid;
	} elsif ($lrms_jobs{$jid}{status} eq 'O') {
	    push @otherlrms, $jid;
	} else {
	    push @notinlrms, $jid;
	}
    }

    my ($jid);
    
    # Running jobs

    my ($jidstr) = join ',', @running;
    $command = "$path/qstat -j $jidstr";
    $jid = "";

    unless ( open QSTAT, "$command 2>/dev/null |" ) {
	debug("Command $command failed.")}

    while ( $line = <QSTAT>) {

	if ( $line =~ /^job_number:\s+(\d+)/) {
	    $jid=$1;
	    $lrms_jobs{$jid}{comment} = [ "" ];
	    next;
	}

	if ( $line =~ /^usage/) {
	    # Memory usage in kB
	    # SGE reports mem, vmem and maxvmem
	    # maxvmem chosen here
	    $line =~ /maxvmem=(\d+)\.?(\d*)\s*(\w+)/;
	    my ($mult) = 1024; 
	    if ($3 eq "M") {$mult = 1024} 
	    if ($3 eq "G") {$mult = 1024*1024} 
	    $lrms_jobs{$jid}{mem} = $mult*$1 + $2*$mult/1000;
	    # used cpu time
	    $line =~ /cpu=((\d+:?)*)/;
	    my (@a) = $1 =~ /(\d+):?/g;
	    if ( @a == 4 ) {
		$lrms_jobs{$jid}{cputime} =
		    60 * ( $a[0]*24 + $a[1] ) + $a[2] ;
	    } else {
		$lrms_jobs{$jid}{cputime} =
		    60 * $a[0] + $a[1];}
	    next;
	}
	
	if ($line =~ /^hard resource_list/) {
	    ( $lrms_jobs{$jid}{reqcputime},
	      $lrms_jobs{$jid}{reqwalltime} ) =
		  req_limits($line);
	}
	next;
    }
    close QSTAT;

    # Normally queueing job

    $jidstr = join ',', @queueing;
    $command = "$path/qstat -j $jidstr";
    $jid = "";
    
    unless ( open QSTAT, "$command 2>/dev/null |" ) {
	debug("Command $command failed.")}

    while ( $line = <QSTAT>) {

	if ( $line =~ /^job_number:\s+(\d+)/) {
	    $jid=$1;
	    next;
	}

	if ($line =~ /^hard resource_list/) {
	    ( $lrms_jobs{$jid}{reqcputime},
	      $lrms_jobs{$jid}{reqwalltime} ) =
		  req_limits($line);
	    next;
	}

	# Reason for beeing held in queue
	if ( $line =~ /^\s*(cannot run because.*)/ ) {
	    if ( exists $lrms_jobs{$jid}{comment} ) {
		push @{ $lrms_jobs{$jid}{comment} }, "LRMS: $1";
	    } else {
		$lrms_jobs{$jid}{comment} = [ "LRMS: $1" ];
	    }
	}
	next;
    }
    close QSTAT;

    # Other LRMS state, often jobs pending in error state 'Eqw'

    $jidstr = join ',', @otherlrms;
    $command = "$path/qstat -j $jidstr";
    $jid = "";

    unless ( open QSTAT, "$command 2>/dev/null |" ) {
	debug("Command $command failed.")}
    
    while ( $line = <QSTAT>) {
	
	if ( $line =~ /^job_number:\s+(\d+)/) {
	    $jid=$1;
	    $lrms_jobs{$jid}{node} = "";
	    $lrms_jobs{$jid}{mem} = "";
	    $lrms_jobs{$jid}{cputime} = "";
	    $lrms_jobs{$jid}{walltime} = "";
	    next;
	}

	if ($line =~ /^hard resource_list/) {
	    ( $lrms_jobs{$jid}{reqcputime},
	      $lrms_jobs{$jid}{reqwalltime} ) =
		  req_limits($line);
	    next;
	}

	# Error reason nro 1

	if ($line =~ /^error reason\s*\d*:\s*(.*)/ ) {
	    if ( exists $lrms_jobs{$jid}{comment} ) {
		push @{$lrms_jobs{$jid}{comment}}, "LRMS: $1";
	    } else {
		$lrms_jobs{$jid}{comment} = [ "LRMS: $1" ];
	    }
	    next;
	}

	# Let's say it once again ;-)
	if ($line =~ /(job is in error state)/ ) {
	    if ( exists $lrms_jobs{$jid}{comment} ) {
		push @{ $lrms_jobs{$jid}{comment} }, "LRMS: $1";
	    } else {
		$lrms_jobs{$jid}{comment} = [ "LRMS: $1" ];
	    }
	}
	next;
    }

# If qstat does not match, job has probably finished already
# Querying accounting system is slow, so we skip it for now ;-)
#	    my ($command) = $path/qacct -j $jid";

    return %lrms_jobs;

}

sub users_info($$@) {
    my ($path) = shift;
    my ($qname) = shift;
    my ($accts) = shift;

    my (%lrms_users);

    # freecpus
    # queue length
    #
    # This is nearly impossible to implement generally for
    # complex system such as grid engine. Using simple 
    # estimate.

    if ( ! exists $lrms_queue{status} ) {
	%lrms_queue = queue_info( $path, $qname );
    }

    foreach my $u ( @{$accts} ) {
	$lrms_users{$u}{freecpus} = $lrms_queue{status};
	$lrms_users{$u}{queuelength} = "$lrms_queue{queued}";
    }
    return %lrms_users;
}
	      


1;
