#!/bin/sh
#
#   Periodically reads the PBS server log files and writes a mark file
# for every job that has finished.
#   If the log files are not available, scans PBS for finished (absent)
# jobs instead and writes mark files for them.
#
# usage: scan_pbs_job control_dir ...

# At least one control directory must be given on the command line.
# Say so on stderr instead of failing silently; the exit status stays 1.
if [ -z "$1" ] ; then
  echo "usage: $0 control_dir ..." 1>&2
  exit 1
fi


# Site configuration.  Every value may be supplied through the environment;
# the fallback given here applies when the variable is unset or empty.
: "${PBS_BIN_PATH:=/usr/bin}"                    # where the PBS commands live
: "${PBS_LOG_PATH:=/var/spool/pbs/server_logs}"  # where the PBS server logs live
: "${TMP_DIR:=/tmp}"                             # where to store temporary files

# Directory that is scanned for PBS server log files
pbs_log_dir=${PBS_LOG_PATH}


# quote_for_eval string
#   Prints the argument wrapped in single quotes, with every embedded single
#   quote rewritten as '\'' , so that the result can be pasted into
#       eval "set -- ..."
#   and comes back as exactly one, unmodified word.  (Double quotes would
#   still let the shell expand $, backticks and backslashes.)
quote_for_eval () {
  printf "'%s'" "$(printf '%s\n' "$1" | sed "s/'/'\\\\''/g")"
}

# first control_dir is used for storing own files
control_dir=$1
# Plain sh has no arrays, so the whole list of control directories is kept
# as one eval-ready string and turned back into "$@" where it is needed.
control_dirs=
while [ $# -gt 0 ] ; do
  control_dirs="${control_dirs} $(quote_for_eval "$1")"
  shift
done

# Numeric id of the user running the scan
my_id=`id -u`

# Per-user file remembering how far the log scan got: "<log name> <records>"
state_file=$control_dir/pbs_log_scan.`id -un`

# load_scan_state
#   Reads the first line of $state_file and sets
#     ldt, lines   the two raw fields as found in the file
#     ldate        first field as a decimal number, 0 if missing or invalid
#     lines_skip   second field as a decimal number, 0 if missing or invalid
#   The fields are checked to be plain digit strings (leading zeros are
#   dropped) before they are used anywhere as numbers, so a damaged state
#   file can neither abort the script nor be evaluated as an expression.
load_scan_state () {
  ldt=
  lines=
  if [ -r "$state_file" ] ; then
    read -r ldt lines state_rest < "$state_file"
  fi
  ldate=`printf '%s\n' "$ldt" | sed -n 's/^0*\([0-9][0-9]*\)$/\1/p'`
  lines_skip=`printf '%s\n' "$lines" | sed -n 's/^0*\([0-9][0-9]*\)$/\1/p'`
  if [ -z "$lines_skip" ] ; then lines_skip='0' ; fi
  if [ -z "$ldate" ] ; then ldate='0' ; fi
}
load_scan_state

# process_log_file
#   Scans the PBS server log $lname for job termination records and, for
#   each of them, writes the exit code into <base>.lrms_done next to the
#   first matching job.*.local file that does not have such a mark yet.
#   Records without an Exit_status value are marked with -1.
#
# Globals read:
#   control_dirs  eval-ready, quoted list of control directories
#   lname         path of the log file to scan
#   cname         name of the log file; recorded in the state file
#   lines_skip    number of leading matching records to skip
#   my_id         numeric user id; unless it is 0, only job files owned by
#                 the caller are considered
#   state_file    receives "<cname> <number of matching records read>"
#
# The record loop is one side of a pipeline, so the counters changed in it
# may not be visible to the caller afterwards; the state file is the result.
process_log_file () {
eval "set -- $control_dirs"
# extract log lines with code 0010 (job exited) and 0008 (job killed)
grep -E '^[^;]*;0010;[^;]*;Job;|^[^;]*;0008;[^;]*;Job;[^;]*;Exit_status=|^[^;]*;0008;[^;]*;Job;[^;]*;Job deleted' "${lname}" | \
{
  lines_processed=0
  # Records are parsed by ';'.  IFS is changed for the read alone, so every
  # other command in the loop keeps the normal word splitting.
  while IFS=';' read -r pbs_time pbs_code pbs_server pbs_job job_id job_message rest_line ; do
    lines_processed=$(( lines_processed + 1 ))
    # records already handled by an earlier run are only counted
    if [ "$lines_skip" -gt 0 ] ; then
      lines_skip=$(( lines_skip - 1 ))
      continue
    fi
    exit_code=`printf '%s\n' "$job_message" | sed -n 's/^.*Exit_status=\([-0-9]*\).*/\1/p'`
    if [ -z "$exit_code" ] ; then
      exit_code='-1'
    fi
    # keep only the first two dot-separated parts of the job id
    job_id=`printf '%s\n' "$job_id" | awk '{split($0,field,".");print field[1]"."field[2]}'`
    # look for this id in job.ID.local
    for ctr_dir in "$@" ; do
      grep -F -l "localid=$job_id" "${ctr_dir}"/job.*.local 2>/dev/null
    done | while IFS= read -r name ; do
      if [ -z "$name" ] ; then break ; fi
      # anybody but root may only touch own jobs
      if [ "$my_id" != '0' ] && [ ! -O "$name" ] ; then continue ; fi
      case "$name" in
        *.local) base_name=${name%.local} ;;
        *) continue ;;
      esac
      if [ -z "${base_name}" ] ; then continue ; fi
      if [ -f "${base_name}.lrms_done" ] ; then continue ; fi
      echo "$exit_code" > "${base_name}.lrms_done"
      # one mark per log record
      break
    done
  done
  echo "$cname $lines_processed" > "$state_file"
}
}

# scan_existing_logs
#   Walks the entries of $pbs_log_dir whose names consist of digits only,
#   in the order the shell lists them, and calls process_log_file for every
#   readable one that is not older than $ldate.
#   Sets:
#     readable_logs  yes if at least one such file could be read, else no
#     cname, lname   name and path of the last digit-named entry seen
#     lines_skip     reset to 0 for files newer than $ldate
#     last_modified  "Modify" line of stat for the last processed file
#   The directory is globbed instead of parsing ls output, and every path
#   is quoted, so a log directory containing blanks works as well.
scan_existing_logs () {
  readable_logs=no
  for log_path in "${pbs_log_dir}"/* ; do
    case "${log_path##*/}" in
      ''|*[!0-9]*) continue ;;
    esac
    cname=${log_path##*/}
    lname="${pbs_log_dir}/$cname"
    if [ ! -r "$lname" ] ; then continue ; fi
    readable_logs=yes
    if [ "$cname" -lt "$ldate" ] ; then
      continue
    elif [ "$cname" -gt "$ldate" ] ; then
      # a newer file than the one in the state file starts from its top
      lines_skip=0
    fi
    echo "Date: " "$cname"
    last_modified=`stat "$lname" | grep Modify`
    process_log_file
  done
}
scan_existing_logs

# Logs are readable: keep an eye on the last log file for a while and
# process it again whenever the "Modify" line reported by stat changes.
# The loop ends after 61 rounds of 10 seconds and the script then exits,
# so the job-list based fallback further down is never reached in this case.
if [ "$readable_logs" = 'yes' ] ; then
  time_count=0
  while true ; do
    new_modified=`stat "$lname" | grep Modify`
    if [ "$new_modified" != "$last_modified" ] ; then
      last_modified="$new_modified"
      # Reload the position written by the previous process_log_file run.
      # Both fields must be plain digit strings before they are used as
      # numbers; anything else counts as 0.
      ldt=
      lines=
      if [ -r "$state_file" ] ; then
        read -r ldt lines state_rest < "$state_file"
      fi
      ldate=`printf '%s\n' "$ldt" | sed -n 's/^0*\([0-9][0-9]*\)$/\1/p'`
      lines_skip=`printf '%s\n' "$lines" | sed -n 's/^0*\([0-9][0-9]*\)$/\1/p'`
      if [ -z "$lines_skip" ] ; then lines_skip='0' ; fi
      if [ -z "$ldate" ] ; then ldate='0' ; fi
      process_log_file
    fi
    sleep 10
    time_count=$(( time_count + 1 ))
    if [ "$time_count" -gt 60 ] ; then break ; fi
  done
  exit 0
fi

# mark_vanished_jobs control_dir ...
#   Fallback used when no server log could be read.  For every local job id
#   found in <control_dir>/job.*.local that does not start any line of
#   $pids, and whose job.<gridid>.status contains INLRMS:
#     - if <sessiondir>.diag holds an exitcode= line, that value is written
#       to job.<gridid>.lrms_done;
#     - otherwise a counter in job.<gridid>.lrms_job is advanced, and once
#       it exceeds 5 the job is marked done with exit code 1.
#   Jobs that already have an lrms_done file are left alone.
#   Globals read: pids (newline separated ids of the jobs PBS still lists).
#   NOTE(review): an empty $pids - also the result of a failing qstat -
#   makes every job look vanished; the counter is the only safeguard.
#   NOTE(review): ids are compared as literal line prefixes; if qstat
#   abbreviates job ids this never matches - confirm with the deployed PBS.
mark_vanished_jobs () {
  nl='
'
  # Go through directories
  for ctr_dir in "$@" ; do
    # Obtain ids stored in job.*.local
    ids=`grep -h '^localid=' "${ctr_dir}"/job.*.local 2>/dev/null | sed 's/^localid=\([^ ]*\)/\1/'`
    if [ -z "$ids" ] ; then continue ; fi
    for id in $ids ; do
      # skip ids which PBS still lists (literal match at the start of a line)
      case "${nl}${pids}" in
        *"${nl}${id}"*) continue ;;
      esac
      # find grid job corresponding to current local id
      jobfile=`grep -F -l "localid=$id" "${ctr_dir}"/job.*.local 2>/dev/null`
      if [ -z "$jobfile" ] ; then continue ; fi
      # extract grid id
      gridid=`basename "$jobfile" '.local' | sed 's/^job\.//'`
      donefile="${ctr_dir}/job.${gridid}.lrms_done"
      if [ -f "$donefile" ] ; then continue ; fi
      statusfile="${ctr_dir}/job.${gridid}.status"
      if [ ! -f "$statusfile" ] ; then continue ; fi
      status=`cat "$statusfile"`
      if [ ! "$status" = "INLRMS" ] ; then continue ; fi
      # get session directory of this job
      session=`grep -h '^sessiondir=' "$jobfile" | sed 's/^sessiondir=\(.*\)/\1/'`
      if [ ! -z "$session" ] ; then
        # have chance to obtain exit code
        diagfile="${session}.diag"
        exitcode=
        if [ -r "$diagfile" ] ; then
          exitcode=`grep '^exitcode=' "$diagfile" | sed 's/^exitcode=//'`
        fi
        if [ ! -z "$exitcode" ] ; then
          # job finished and exit code is known
          echo "$exitcode" > "$donefile"
          echo "Job $gridid finished with exit code $exitcode"
          continue
        fi
      fi
      # job has probably finished and exit code is not known
      exitcode=1
      countfile="${ctr_dir}/job.${gridid}.lrms_job"
      counter=0
      if [ -f "$countfile" ] ; then
        counter=`cat "$countfile"`
        # anything but a plain number restarts the count
        case "$counter" in
          ''|*[!0-9]*) counter=0 ;;
        esac
        counter=$(( counter + 1 ))
      fi
      if [ "$counter" -gt 5 ] ; then
        rm -f "$countfile"
        echo "$exitcode" > "$donefile"
        echo "Job $gridid finished with unknown exit code"
      else
        echo "$counter" > "$countfile"
      fi
    done
  done
}

eval "set -- $control_dirs"
# Get all running jobs
pids=`"${PBS_BIN_PATH}/qstat" -a 2>/dev/null | grep '^[0-9]*\.' | sed 's/^\([^ ]*\).*/\1/'`
mark_vanished_jobs "$@"
# The fallback pass is complete: wait a minute, then end with success.
# NOTE(review): presumably the pause keeps the caller from starting the next
# scan right away - confirm against whatever invokes this script.
sleep 60
exit 0

