#!/bin/bash
#
#   Periodically read the PBS log files and create mark files
# for jobs which have finished.
#   If the log files are not available, scan PBS for finished (absent) jobs
# and create mark files for those jobs instead.
#
# usage: scan_pbs_job control_dir ...

# At least one control directory is required: the first one is where this
# script keeps its own state, and all of them are searched for job files.
if [ -z "$1" ] ; then
  echo "usage: scan_pbs_job control_dir ..." 1>&2
  exit 1
fi


# Locations of the PBS commands, the PBS server logs and of temporary files.
# Each of them may be preset in the environment; the values below are only
# fallbacks.
PBS_BIN_PATH=${PBS_BIN_PATH:-/usr/bin}
PBS_LOG_PATH=${PBS_LOG_PATH:-/var/spool/pbs/server_logs}
TMP_DIR=${TMP_DIR:-/tmp}

pbs_log_dir=$PBS_LOG_PATH

#
#  Print all arguments as one string of shell-quoted words, each of them
#  preceded by a space, so that
#      eval "set -- $string"
#  restores exactly the original arguments whatever characters they contain
#  (spaces, quotes, '$', backslashes, ...).
#
quote_control_dirs () {
  local dir word quoted=
  for dir in "$@" ; do
    printf -v word '%q' "$dir"
    quoted="${quoted} ${word}"
  done
  printf '%s' "$quoted"
}

# first control_dir is used for storing own files
control_dir=$1
# All control directories, kept as an eval-ready string because the list
# has to be turned back into positional parameters later on.
control_dirs=$(quote_control_dirs "$@")
# The arguments are consumed at this point.
set --

my_id=$(id -u)

# The scan position is kept per user inside the first control directory.
state_file=$control_dir/pbs_log_scan.$(id -un)

#
#  Load the scan position from $state_file into the globals
#    ldate      - name (date) of the log file which was processed last
#    lines_skip - number of matching lines of that log to skip
#  The file holds a single line "<date> <count>". A missing or unreadable
#  file and empty or non-numeric fields all yield 0, without any error
#  output.
#
load_scan_state () {
  local saved_date= saved_count= ignored=
  if [ -f "$state_file" ] && [ -r "$state_file" ] ; then
    read -r saved_date saved_count ignored < "$state_file"
  fi
  case "$saved_date" in
    ''|*[!0-9]*) ldate=0 ;;
    # 10# keeps a value with leading zeros from being read as octal
    *) ldate=$(( 10#$saved_date )) ;;
  esac
  case "$saved_count" in
    ''|*[!0-9]*) lines_skip=0 ;;
    *) lines_skip=$(( 10#$saved_count )) ;;
  esac
}

#
#  Find a stat invocation which accepts a file argument and store it in
#  STAT_USERNAME (its output is later searched for 'uid (<name>)').
#  Arguments: $1 - existing file used for probing
#  Returns:   0 if STAT_USERNAME was set, 1 if no candidate worked
#
probe_stat_username () {
  local candidate
  for candidate in 'stat -u' 'stat -c "uid (%U)"' ; do
    # The file name is passed by reference, so eval never re-splits it.
    if eval "$candidate \"\$1\"" >/dev/null 2>&1 ; then
      STAT_USERNAME=$candidate
      return 0
    fi
  done
  return 1
}

load_scan_state

if ! probe_stat_username "$0" ; then
  echo "Can't find useable stat utility"
  sleep 60
  exit 1
fi

#
#  Main function for processing one PBS log.
#  Extracts log lines with code 0010 (job exited) and 0008 (job killed),
#  looks up the job.ID.local file holding the matching local id and writes
#  the outcome of that job into job.ID.lrms_done.
#
#  Globals read:
#    control_dirs  - eval-ready list of control directories to search
#    lname         - path of the PBS log file to process
#    cname         - name of that log file, stored in the state file
#    lines_skip    - number of matching log lines to skip
#    state_file    - receives "<cname> <number of lines processed>"
#    my_id         - numeric user id; '0' enables the stat/su code paths
#    STAT_USERNAME - command printing the owner of a file (used when my_id is 0)
#    ARC_LOCATION  - prefix under which libexec/gm-kick is called
#
process_log_file () {
  local job_lines
  job_lines='^[^;]*;0010;[^;]*;Job;|^[^;]*;0008;[^;]*;Job;[^;]*;Exit_status=|^[^;]*;0008;[^;]*;Job;[^;]*;Job deleted'
  # The control directories become the positional parameters of this function.
  eval "set -- $control_dirs"
  grep -E "$job_lines" "${lname}" | \
  {
    # The counter starts at -1, so the stored value is one less than the
    # number of lines read and the last line is read again by the next call.
    lines_processed='-1'
    # skip already processed lines
    while [ "$lines_skip" -gt '0' ] ; do
      read -r rest_line || break
      lines_skip=$(( lines_skip - 1 ))
      lines_processed=$(( lines_processed + 1 ))
    done
    # pass on the remaining lines without their leading (date) field
    while read -r rest_line ; do
      echo "$rest_line" | sed 's/^[^;]*;//'
      lines_processed=$(( lines_processed + 1 ))
    done
    if [ "$lines_processed" -lt '0' ] ; then
      lines_processed=0
    fi
    echo "$cname $lines_processed" > "$state_file"
  } | \
  sort -u | \
  {
    # parse by ;
    IFS=';'
    # split line into fields
    while read -r pbs_code pbs_server pbs_job job_id job_message rest_line ; do
      # Try to extract exit code of PBS (note: if executable fails its code goes to PBS)
      exit_code=$(echo "$job_message" | sed 's/^.*Exit_status=\([-0-9]*\).*/\1/;t leave;s/.*//;:leave')
      # keep only the first two dot-separated parts of the PBS job id
      job_id=$(echo "$job_id" | awk '{split($0,field,".");print field[1]"."field[2]}')
      # look for this id in job.ID.local
      {
        for ctr_dir in "$@" ; do
          find "${ctr_dir}" -name 'job.*.local' -print0 | xargs -0 grep -F -l "localid=$job_id" 2>/dev/null
        done
      } | {
        while true ; do
          read -r name
          if [ -z "$name" ] ; then break ; fi
          # unless running as root only own jobs are handled
          if [ "$my_id" != '0' ] ; then
            if [ ! -O "$name" ] ; then continue ; fi
          fi
          base_name=$(echo "$name" 2>/dev/null | sed 's/\.local$//;t leave;s/.*//;:leave')
          if [ -z "${base_name}" ] ; then continue ; fi
          # check if job already reported
          if [ -f "${base_name}.lrms_done" ] ; then continue ; fi
          # So far only PBS exit code is available
          # It would be nice to have exit code of main executable
          exitcode=''
          # get session directory of this job
          session=$(grep -h '^sessiondir=' "${base_name}.local" | sed 's/^sessiondir=\(.*\)/\1/')
          diagfile="${session}.diag"
          commentfile="${session}.comment"
          if [ "$my_id" = '0' ] ; then
            # the file name is passed by reference, so eval does not re-split it
            username=$(eval "$STAT_USERNAME \"\$name\"" | grep 'uid' | sed 's/[^(]*(\([^(]*\))/\1/;t leave;s/.*//;:leave')
          else
            username=
          fi
          # NOTE(review): the su commands below paste the session path into a
          # shell command line unquoted; a path with spaces or shell
          # metacharacters will not be read correctly - confirm session
          # directories can never contain such characters.
          if [ ! -z "$session" ] ; then
            # have chance to obtain exit code
            if [ -z "$username" ] ; then
              exitcode=$(grep '^exitcode=' "$diagfile" | sed 's/^exitcode=//')
            else
              exitcode=$(su "${username}" -c "grep '^exitcode=' $diagfile" | sed 's/^exitcode=//')
            fi
          fi
          # Try to obtain message from PBS if any
          if [ -z "$username" ] ; then
            pbs_comment=$(tail -n 1 "$commentfile")
          else
            pbs_comment=$(su "${username}" -c "tail -n 1 $commentfile")
          fi
          if [ -z "$exitcode" ] ; then
            # No exit code of job means job was most probably killed
            if [ -z "$exit_code" ] ; then exit_code='-1'; fi
            if [ "$exit_code" == '0' ] ; then
              echo "Job $job_id failed but PBS have not noticed that" 1>&2
              echo "-1 Job failed but PBS reported 0 exit code." > "${base_name}.lrms_done"
            elif [ -z "$pbs_comment" ] ; then
              echo "Job $job_id failed with PBS exit code $exit_code" 1>&2
              echo "$exit_code Job was killed by PBS." > "${base_name}.lrms_done"
            else
              echo "Job $job_id failed with PBS exit code $exit_code" 1>&2
              echo "$exit_code $pbs_comment" > "${base_name}.lrms_done"
            fi
          else
            if [ -z "$exit_code" ] ; then exit_code='-1'; fi
            if [ ! "$exitcode" = 0 ] ; then
              # a failed executable is never reported with code 0
              if [ "$exit_code" == '0' ] ; then exit_code='-1'; fi
              echo "Job $job_id failed with exit code $exitcode, PBS reported $exit_code." 1>&2
              echo "$exit_code Job failed with exit code $exitcode." > "${base_name}.lrms_done"
            else
              if [ ! "$exit_code" == '0' ] ; then
                echo "Job finished properly but PBS reported $exit_code." 1>&2
                if [ -z "$pbs_comment" ] ; then
                  echo "$exit_code Job was killed by PBS." > "${base_name}.lrms_done"
                else
                  echo "$exit_code $pbs_comment" > "${base_name}.lrms_done"
                fi
              else
                # echo "Job finished without errors." 1>&2
                echo "0" > "${base_name}.lrms_done"
              fi
            fi
          fi
          # wake up GM
          "${ARC_LOCATION}/libexec/gm-kick" "${base_name}.status"
          # only the first suitable job file is handled for each log line
          break
        done
      }
    done
  }
}

# Go through the PBS server logs. Only files whose names consist of digits
# are taken as logs; they are visited in name order, which is date order as
# long as all names have the same number of digits.
readable_logs=no
if [ ! -z "${pbs_log_dir}" ] ; then
  for log_path in "${pbs_log_dir}"/* ; do
    log_name=${log_path##*/}
    case "$log_name" in
      ''|*[!0-9]*) continue ;;
    esac
    if [ ! -r "$log_path" ] ; then continue ; fi
    # process_log_file works on the globals cname and lname. They are set for
    # readable logs only, so that after the loop they always refer to a log
    # which the polling loop below is able to read.
    cname=$log_name
    lname=$log_path
    readable_logs=yes
    if [ "$cname" -lt "$ldate" ] ; then
      # older than the log recorded in the state file - already done
      continue
    elif [ "$cname" -gt "$ldate" ] ; then
      # newer than the recorded log - nothing of it was processed yet
      lines_skip=0
    fi
    echo "Date: " "$cname"
    last_modified=$(stat "$lname" | grep Modify)
    process_log_file
  done
fi

# With readable logs available keep watching the last one: every 10 seconds
# its modification time is compared and new content is processed. The script
# ends itself after 61 rounds.
if [ "$readable_logs" = 'yes' ] ; then
  time_count=0
  while true ; do
    new_modified=$(stat "$lname" | grep Modify)
    if [ "$new_modified" != "$last_modified" ] ; then
      last_modified="$new_modified"
      # reload the position stored by the previous call of process_log_file
      ldt=
      lines=
      if [ -f "$state_file" ] && [ -r "$state_file" ] ; then
        read -r ldt lines ignored_rest < "$state_file"
      fi
      lines_skip=$(( lines + 0 ))
      ldate=$(( ldt + 0 ))
      process_log_file
    fi
    sleep 10
    time_count=$(( time_count + 1 ))
    if [ "$time_count" -gt 60 ] ; then break ; fi
  done
  exit 0
fi

# If no PBS logs found try ordinary 'qstat'
eval "set -- $control_dirs"

# Get all running jobs: the output of 'qstat -a' is collected in a temporary
# file and reduced to the leading number of every job id.
if ! pidslist=$(mktemp "$TMP_DIR/qstat.XXXXXX") ; then
  # FS problems ?
  sleep 60
  exit 1
fi
if ! "${PBS_BIN_PATH}/qstat" -a 2>/dev/null 1>"$pidslist" ; then
  rm -f "$pidslist"
  # PBS server down ?
  sleep 60
  exit 1
fi
pids=$(grep '^[0-9]*\.' "$pidslist" | sed 's/^\([0-9]*\).*/\1/')
rm -f "$pidslist"

# Go through directories
for ctr_dir in "$@" ; do
  # Obtain ids stored in job.*.local
  ids=$(find "${ctr_dir}" -name 'job.*.local' -print0 | xargs -0 grep -h "^localid=" 2>/dev/null | sed 's/^localid=\([0-9]*\).*/\1/')
  if [ -z "$ids" ] ; then continue ; fi
  # compare them to running jobs and find missing
  bids=
  for id in $ids ; do
    if ! echo "$pids" | grep -q "^$id\$" ; then
      bids="$bids $id"
    fi
  done
  # go through missing ids
  for id in $bids ; do
    # find grid job corresponding to current local id
    jobfile=$(find "${ctr_dir}" -name 'job.*.local' -print0 | xargs -0 grep -F -l "localid=$id." 2>/dev/null)
    if [ -z "$jobfile" ] ; then continue ; fi
    # extract grid id
    gridid=$(basename "$jobfile" '.local' | sed 's/^job\.//')
    donefile="${ctr_dir}/job.${gridid}.lrms_done"
    # already reported
    if [ -f "$donefile" ] ; then continue ; fi
    statusfile="${ctr_dir}/job.${gridid}.status"
    if [ ! -f "$statusfile" ] ; then continue ; fi
    status=$(cat "$statusfile")
    # only jobs recorded as INLRMS are of interest
    if [ ! "$status" = "INLRMS" ] ; then continue ; fi
    # get session directory of this job
    session=$(grep -h '^sessiondir=' "$jobfile" | sed 's/^sessiondir=\(.*\)/\1/')
    if [ ! -z "$session" ] ; then
      # have chance to obtain exit code
      diagfile="${session}.diag"
      if [ "$my_id" = '0' ] ; then
        # the file name is passed by reference, so eval does not re-split it
        username=$(eval "$STAT_USERNAME \"\$jobfile\"" | grep 'uid' | sed 's/[^(]*(\([^(]*\))/\1/;t leave;s/.*//;:leave')
      else
        username=
      fi
      if [ -z "$username" ] ; then
        exitcode=$(grep '^exitcode=' "$diagfile" | sed 's/^exitcode=//')
      else
        # NOTE(review): the session path is pasted into the su command line
        # unquoted; a path with spaces or shell metacharacters will not be
        # read correctly - confirm session directories never contain them.
        exitcode=$(su "${username}" -c "grep '^exitcode=' $diagfile" | sed 's/^exitcode=//')
      fi
      if [ ! -z "$exitcode" ] ; then
        # job finished and exit code is known
        echo "$exitcode Executable finished with exit code $exitcode" > "$donefile"
        "${ARC_LOCATION}/libexec/gm-kick" "$statusfile"
        echo "Job $gridid finished with exit code $exitcode"
        continue
      fi
    fi
    # job has probably finished and exit code is not known
    exitcode='-1'
    # The job is reported as lost only after it has been found missing on
    # several consecutive scans; the count of those scans is kept in a file.
    countfile="${ctr_dir}/job.${gridid}.lrms_job"
    counter=0
    if [ -f "$countfile" ] ; then
      counter=$(cat "$countfile")
      counter=$(( $counter + 1 ))
    fi
    if [ "$counter" -gt 5 ] ; then
      rm -f "$countfile"
      echo "$exitcode Job was lost with unknown exit code" > "$donefile"
      "${ARC_LOCATION}/libexec/gm-kick" "$statusfile"
      echo "Job $gridid finished with unknown exit code"
    else
      echo "$counter" > "$countfile"
    fi
  done
  # go through existing ids
  for id in $pids ; do
    # find grid job corresponding to current local id
    jobfile=$(find "${ctr_dir}" -name 'job.*.local' -print0 | xargs -0 grep -F -l "localid=$id." 2>/dev/null)
    if [ -z "$jobfile" ] ; then continue ; fi
    gridid=$(basename "$jobfile" '.local' | sed 's/^job\.//')
    countfile="${ctr_dir}/job.${gridid}.lrms_job"
    # reset failure counter
    rm -f "$countfile"
  done
done
sleep 60
exit 0

