#!/bin/sh
# set -xv
#
# Periodically read log files of PBS and put mark files
# for jobs which have finished.
# If log files are not available, scan for finished (absent) jobs
# in PBS and put mark files for jobs which have finished.
#
#   Adapted to SGE
#   Use only 2nd method: scan for finished jobs
#
#   Guenter Duckeck <gduckeck@lmu.de>
#   Juha Lento   <Juha.Lento@csc.fi>
#   Olli Tourunen <olli.tourunen@csc.fi>
#
# usage: scan_sge_job control_dir ...



# Set variables:
#   SGE_BIN_PATH
#
# The SGE environment is loaded from
# ${NORDUGRID_LOCATION}/libexec/configure-sge-env.sh. The script aborts with
# exit status 1 when NORDUGRID_LOCATION is empty/unset or when that file does
# not exist.
if [ -z "${NORDUGRID_LOCATION}" ] ; then
    echo "NORDUGRID_LOCATION not set." 1>&2
    exit 1
fi
if [ ! -f "${NORDUGRID_LOCATION}/libexec/configure-sge-env.sh" ] ; then
    echo "${NORDUGRID_LOCATION}/libexec/configure-sge-env.sh not found." 1>&2
    exit 1
fi
# Use the POSIX "." builtin: this script runs under /bin/sh, and "source" is
# a bash extension that strict POSIX shells do not provide. Without it the
# variables used by the qstat calls below would silently stay unset.
. "${NORDUGRID_LOCATION}/libexec/configure-sge-env.sh"

# Files created by this script: owner read/write, everybody else read-only.
umask 022

# At least one (non-empty) control directory argument is required.
[ -n "$1" ] || exit 1


# first control_dir is used for storing own files

# The date substitution is left unquoted on purpose so the log line keeps
# exactly the same whitespace as before.
echo $(date)" : control_dir=$1" 1>&2 #FIXME

control_dir=$1

# Collect every positional argument into one space-separated list, then
# consume the arguments.
control_dirs=
for arg in "$@" ; do
  control_dirs="${control_dirs} ${arg}"
done
shift $#

my_id=$(id -u)


# GD: no attempt to look for SGE Manager logfiles, restrict to job logs.


# Get all running and queued jobs
#
# Result: $pids holds one SGE job id per line (running jobs first, then
# queued jobs). It is empty apart from a newline when qstat prints nothing
# or cannot be run (its stderr is discarded on purpose).

# scan_sge_job_ids
#   Reads a qstat listing on stdin and prints, one per line, the first
#   whitespace-separated field of every line whose first field consists only
#   of digits. Header and separator lines are therefore dropped, and the
#   amount of blank padding in front of the job id does not matter.
scan_sge_job_ids() {
  awk '$1 ~ /^[0-9]+$/ { print $1 }'
}

# first running jobs, grep for MASTER to avoid slave procs
pids=`"${SGE_BIN_PATH}/qstat" -s r 2>/dev/null | grep MASTER | scan_sge_job_ids`

# now add queued jobs; join with a newline (not a space) so that every id,
# including the first queued one, sits at the start of its own line
pids="$pids
`"${SGE_BIN_PATH}/qstat" -s p 2>/dev/null | scan_sge_job_ids`"


# Go through directories
#
# For every control directory in $control_dirs, find the jobs whose status
# file is not DELETED/FINISHED, take the LRMS id recorded in their
# job.*.local file, and handle every id that is no longer listed in $pids:
#   * if <sessiondir>.diag contains an exitcode= line, write that exit code
#     to job.<gridid>.lrms_done and copy <sessiondir>.comment next to it;
#   * otherwise keep a retry counter in job.<gridid>.lrms_job and, once it
#     exceeds 5, write exit code 1 to job.<gridid>.lrms_done.
# Reads the globals $control_dirs, $pids and $SGE_BIN_PATH.

# scan_sge_id_listed ID LIST
#   Returns 0 when ID is equal to one whole whitespace-separated word of
#   LIST, 1 otherwise. An exact comparison is required: a prefix match would
#   make job 12 look alive for as long as job 123 is listed.
scan_sge_id_listed() {
  for scan_sge_word in $2 ; do
    if [ "$scan_sge_word" = "$1" ] ; then return 0 ; fi
  done
  return 1
}

# scan_sge_local_files CTR_DIR ID
#   Prints the path of every job.*.local file below CTR_DIR that has a line
#   starting with "localid=" whose value (text up to the first blank or tab)
#   is exactly ID. Each file is printed at most once.
scan_sge_local_files() {
  find "$1" -name 'job.*.local' -print0 2>/dev/null |
    xargs -0 awk -v want="$2" '
      index($0, "localid=") == 1 && !(FILENAME in seen) {
        value = substr($0, 9)
        sub(/[ \t].*$/, "", value)
        if (value == want) {
          seen[FILENAME] = 1
          print FILENAME
        }
      }' 2>/dev/null
}

# scan_sge_control_dir CTR_DIR
#   Performs the scan described above for one control directory.
#   Prints one "Job ... finished ..." line on stdout per job marked done.
scan_sge_control_dir() {
  ctr_dir=$1
  # Obtain ids of pending/running jobs stored in job.*.local
  rjobs=`find "${ctr_dir}" -name 'job.*.status' -print0 2>/dev/null | xargs -0 grep -E -l -v 'DELETED|FINISHED' 2>/dev/null | sed 's/status$/local/'`
  echo `date`" : rjobs: $rjobs" 1>&2 #FIXME
  if [ -z "$rjobs" ] ; then return 0 ; fi
  ids=`grep -h '^localid=' $rjobs 2>/dev/null | sed 's/^localid=\([^ ]*\)/\1/'`
  if [ -z "$ids" ] ; then return 0 ; fi
  # compare them to running jobs and find missing
  bids=
  for id in $ids ; do
    if ! scan_sge_id_listed "$id" "$pids" ; then
      bids="$bids $id"
    fi
  done
  # go through missing ids
  for id in $bids ; do
    # find grid job(s) corresponding to current local id; the same local id
    # may be recorded in more than one job file, so each candidate is checked
    for jobfile in $(scan_sge_local_files "${ctr_dir}" "$id") ; do
      # extract grid id
      gridid=`basename "$jobfile" '.local' | sed 's/^job\.//'`
      donefile="${ctr_dir}/job.${gridid}.lrms_done"
      if [ -f "$donefile" ] ; then continue ; fi
      statusfile="${ctr_dir}/job.${gridid}.status"
      if [ ! -f "$statusfile" ] ; then continue ; fi
      status=`cat "$statusfile"`
      if [ ! "$status" = "INLRMS" ] ; then continue ; fi
      # get session directory of this job
      session=`grep -h '^sessiondir=' "$jobfile" | sed 's/^sessiondir=\(.*\)/\1/'`
      #GD FIXME, occasionally sessiondir in jobfile missing; fall back to a
      # fixed, site-specific session root
      session=${session:-/usr/local/sys/nordugrid/nordugrid/sessiondir/$gridid}
      if [ -d "$session" ] ; then
        # have chance to obtain exit code
        diagfile="${session}.diag"
        exitcode=`grep '^exitcode=' "$diagfile" | sed 's/^exitcode=//'`
        if [ -n "$exitcode" ] ; then
          # job finished and exit code is known
          echo "$exitcode" > "$donefile"
          echo "Job $gridid finished with exit code $exitcode"
          #GD make copy of job script stdout
          commentfile="${ctr_dir}/job.${gridid}.comment"
          cp -p "${session}.comment" "$commentfile"
          continue
        fi
      fi
      # job has probably finished and exit code is not known
      exitcode=1
      countfile="${ctr_dir}/job.${gridid}.lrms_job"
      counter=0
      if [ -f "$countfile" ] ; then
        counter=`cat "$countfile"`
        # an empty or corrupt counter file restarts the count instead of
        # breaking the arithmetic and the numeric test below
        case "$counter" in
          ''|*[!0-9]*) counter=0 ;;
        esac
        counter=$(( counter + 1 ))
      fi
      if [ "$counter" -gt 5 ] ; then
        rm -f "$countfile"
        echo "$exitcode" > "$donefile"
        echo "Job $gridid finished with unknown exit code"
      else
        # test again for job existence, only count if not known
        if ! "${SGE_BIN_PATH}/qstat" -j "$id" > /dev/null 2>&1 ; then
          echo "$counter" > "$countfile"
        fi
      fi
    done
  done
}

for ctr_dir in $control_dirs ; do
  scan_sge_control_dir "$ctr_dir"
done
# Pause for 60 seconds, then report success.
# NOTE(review): presumably the pause throttles how often the caller re-runs
# this scan — confirm against whatever invokes the script.
sleep 60
exit 0

