#!/bin/bash

# Name of this script (used as a prefix in log messages) and the directory
# it was started from (where the configuration parser is expected to live).
progname=$(basename "$0")
progdir=$(dirname "$0")

# The configuration parser is mandatory; bail out early without it.
if [ ! -f "${progdir}/config_parser.sh" ] ; then
    echo "${progdir}/config_parser.sh not found." 1>&2
    exit 1
fi
# Quoted so that an installation directory containing whitespace or glob
# characters is still sourced correctly.
source "${progdir}/config_parser.sh"

# config_parse and config_reveal_block come from config_parser.sh.
# NOTE(review): presumably they load the configuration and expose the named
# blocks as CONFIG_* variables (CONFIG_condor_location and CONFIG_controldir
# are read further down) -- confirm against config_parser.sh.
config_parse
config_reveal_block "common"
config_reveal_block "infosys"
config_reveal_block "grid-manager"

# We are installed in place of /bin/mail, so Condor invokes us as:
#   /bin/mail -s '[Condor] Condor Job <job-ID>' <email-address>
# The job-ID is taken from the subject, i.e. the second argument.
#
# NOTE: Unlike the job log, the format of this email is not guaranteed to
# stay the same in future Condor versions.  We need the job id to locate
# the log file, though, so there is no way around parsing it. :-(
mail_subject=$2
# Everything after the last "Condor Job " in the subject.
job_token=${mail_subject##*Condor Job }
# Condor's own id: the token up to the first space.
condorid=${job_token%% *}
# LRMS id: the part before the first dot, with ".condor" appended.
lrmsid=${job_token%%.*}.condor

CONDOR_LOCATION=$CONFIG_condor_location

# Set variable "controldir" from GM config.
controldir=$CONFIG_controldir

# Find the proper GRAMI file: the one containing a line that is exactly
# "joboption_jobid=<lrmsid>".
# -F/-x match the id as a literal whole line.  Used as a regular expression,
# the "." in "<n>.condor" (or any other metacharacter taken from the mail
# subject) could match the id of a different job.
grami=$(grep -lxF -- "joboption_jobid=$lrmsid" "$controldir"/job.*.grami)

# Zero matches leave $grami empty and several matches make it a multi-line
# string; neither is an existing file, so both end up here.
if [[ ! -f $grami ]]; then
    echo "$progname: No GRAMI file for job $lrmsid could be found." >&2
    exit 1
fi

# Grid Manager log for this job: same base name as the GRAMI file, with the
# extension ".errors", inside the control directory.
gmlog=$controldir/$(basename "$grami" .grami).errors

{
    # IMPORTANT: Never change the format of this line!
    # It is used in LRMS_Condor.pm to delimit job info.
    echo "----- starting $progname -----"

    # Record how we were called.
    echo "arg 0 $0"
    echo "arg 1 $1"
    echo "arg 2 $2"
    echo "arg 3 $3"
} >>"$gmlog"

# The Condor log path is whatever follows "condor_log=" in the GRAMI file.
condor_log=$(sed -n 's/^condor_log=//p' "$grami")

# Should never happen, but if that file does not exist, fall back to
# /dev/null and say so in the Grid Manager log.
{
    if [[ -f $condor_log ]]; then
        :
    else
        printf '%s\n' \
            "$progname: couldn't find Condor log file ($condor_log)" \
            "$progname: using /dev/null as log file"
        condor_log=/dev/null
    fi
} >>"$gmlog" 2>&1

# Dump mail body and Condor log into gmlog, every line prefixed with the
# program name.
#
# The mail body arrives on stdin and is prefixed straight from there.  No
# intermediate copy is made, so no file with a predictable name is created
# in /tmp (which another local user could pre-create or symlink).
{
    echo "$progname: ----- begin condor job completion message -----"
    sed "s/^/$progname: /"
    echo "$progname: ----- end condor job completion message -----"
    echo "$progname: ----- begin condor log ($condor_log) -----"
    sed "s/^/$progname: /" "$condor_log"
    echo "$progname: ----- end condor log ($condor_log) -----"
} >>"$gmlog" 2>&1

echo "$progname: searching for exit code in condor_history for ID $condorid" >> "$gmlog" 2>&1

# Per-job files in the control directory share the GRAMI file's base name:
#   <base>.local      job description; holds the "sessiondir=" line
#   <base>.lrms_done  where the final "<exitcode> <message>" line is written
jobbase=$(basename "$grami" .grami)
jobfile="${controldir}/${jobbase}.local"
sessiondir=$(sed -n 's/^sessiondir=\(.*\)/\1/p' "$jobfile")
lrms_done="${controldir}/${jobbase}.lrms_done"

###########
#echo condor_log=$condor_log
#echo controldir=$controldir
#echo grami=$grami
#echo jobfile=$jobfile
#echo sessiondir=$sessiondir
#echo lrms_done=$lrms_done
###########

echo "Sessiondir: $sessiondir" >> "$gmlog" 2>&1
echo "lrms_done : $lrms_done" >> "$gmlog" 2>&1

# The quotes matter: with an empty value the unquoted test collapses to
# `[ ! -d ]`, which is false, so a missing sessiondir would go unnoticed.
if [ ! -d "$sessiondir" ]; then
    echo "No sessiondir!?!?" >> "$gmlog" 2>&1
    exit 1
fi

#######################################
# Exit handler: removes the temporary condor_history dump and writes the
# closing delimiter line to the Grid Manager log.
# Globals:
#   tmphist  (read) path of the temporary history file; may be empty/unset
#   progname (read) name used in the delimiter line
#   gmlog    (read) log file the delimiter line is appended to
# Arguments:
#   None
#######################################
function cleanup {
  # Quoted so a path with whitespace is removed as one file; skipped when
  # no temporary file was ever created.
  if [ -n "${tmphist:-}" ]; then
    rm -f -- "$tmphist"
  fi
  # Don't remove or modify next line.
  echo "----- exiting $progname -----" >>"$gmlog"
}

# From here on, run cleanup whenever the script exits, so the temporary
# history file is removed and the closing log delimiter is written.
trap cleanup EXIT

#######################################
# Runs a command and, while it keeps failing, re-runs it after each of the
# given delays.
# Arguments:
#   $1    - command line, executed with eval
#   $2... - delays (as accepted by sleep) to wait before each further attempt
# Returns:
#   0 as soon as one attempt succeeds; otherwise the exit status of the
#   last attempt that was made.
#######################################
function retry() {
  local retry_cmd=$1
  local retry_delay
  local retry_status

  eval "$retry_cmd" && return 0
  retry_status=$?

  shift
  for retry_delay in "$@"; do
    # An unusable delay skips that attempt rather than retrying at once.
    sleep "$retry_delay" || continue
    eval "$retry_cmd" && return 0
    retry_status=$?
  done

  # All attempts failed.  Iterating over the delays (instead of shifting
  # until nothing is left) avoids a final `sleep` without operand, which
  # printed an error and replaced the command's status.
  return "$retry_status"
}

# Dump the job's condor_history record into a temporary file, polling until
# the command succeeds and the file holds more than one line.
# NOTE(review): presumably the record can take a while to show up after the
# completion mail is sent -- confirm.
tmphist=$(mktemp /tmp/hist.XXXXX)
# Attempts happen at 0, 10, 30 and 60 seconds.  The wait comes *before* each
# attempt so that no time is spent sleeping after the last one has failed.
for to in 0 10 20 30; do
    sleep "$to"
    if "$CONDOR_LOCATION"/bin/condor_history -l "$condorid" > "$tmphist" \
        && [ -s "$tmphist" ] && [ "$(wc -l < "$tmphist")" -gt 1 ]; then
        break
    fi
done
# Turn the condor_history dump into the job's .lrms_done and .diag files.
#
# The dump counts as usable when it is non-empty and longer than one line.
# ($? is deliberately not consulted here: at this point it would be the
# status of the polling loop, not of condor_history.)
if [ -s "$tmphist" ] && [ "$(wc -l < "$tmphist")" -gt 1 ]; then
    { echo "$progname: ----- begin condor history message -----"
      cat "$tmphist"
      echo "$progname: ----- end condor history message -----"
    } >> "$gmlog" 2>&1

    # Job attributes, "Name = value" lines in the history dump.
    # Times keep only the part before the decimal point; string attributes
    # are stripped of their surrounding double quotes.
    exitcode=$(sed -n 's/^ExitCode *= *//p' "$tmphist")
    WallTime=$(sed -n 's/^RemoteWallClockTime *= *\([^.]*\).*/\1/p' "$tmphist")
    KernelTime=$(sed -n 's/^RemoteSysCpu *= *\([^.]*\).*/\1/p' "$tmphist")
    UserTime=$(sed -n 's/^RemoteUserCpu *= *\([^.]*\).*/\1/p' "$tmphist")
    ImageSize=$(sed -n 's/^ImageSize *= *//p' "$tmphist")
    ExitStatus=$(sed -n 's/^ExitStatus *= *//p' "$tmphist")
    JobStatus=$(sed -n 's/^JobStatus *= *//p' "$tmphist")
    ExitSignal=$(sed -n 's/^ExitSignal *= *//p' "$tmphist")
    NodeName=$(sed -n 's/^LastRemoteHost *= *"\(.*\)"[^"]*$/\1/p' "$tmphist")
    RemoveReason=$(sed -n 's/^RemoveReason *= *"\(.*\)"[^"]*$/\1/p' "$tmphist")
    ExitReason=$(sed -n 's/^ExitReason *= *"\(.*\)"[^"]*$/\1/p' "$tmphist")
############
#echo from history
#echo exitcode=$exitcode
#echo WallTime=$WallTime
#echo KernelTime=$KernelTime
#echo UserTime=$UserTime
#echo ImageSize=$ImageSize
#echo ExitStatus=$ExitStatus
#echo JobStatus=$JobStatus
#echo ExitSignal=$ExitSignal
#echo NodeName=$NodeName
#echo RemoveReason=$RemoveReason
#echo ExitReason=$ExitReason
###############

    # CPU usage in percent of wall time; 0 when any input is missing.
    if [ -n "$WallTime" ] && [ -n "$KernelTime" ] \
    && [ -n "$UserTime" ] && [ "$WallTime" -gt 0 ]; then
        CPUUsage=$((100*(UserTime+KernelTime)/WallTime))
    else
        CPUUsage=0
    fi

    # Exit code and message for .lrms_done.  Without an exit code in the
    # history the job is reported as failed (271) with the best reason
    # available.
    if [ -n "$exitcode" ] && [ "$exitcode" != "None" ]; then
      if [ "$exitcode" = 0 ]; then
        message=
      else
        message="Job finished with non-zero exit code"
      fi
    else
      exitcode=271
      if [ -n "$RemoveReason" ] && [ "$RemoveReason" != "None" ]; then
        message=$RemoveReason
      elif [ -n "$ExitReason" ] && [ "$ExitReason" != "None" ]; then
        message=$ExitReason
      else
        message="Unknown Condor error"
      fi
    fi

    # Check for exceeded resources limits
    # Only done for jobs that were removed.  A resource counts as exceeded
    # when usage is above 95% of the request.
    if [ -s "$grami" ] && [ -n "$RemoveReason" ] && [ "$RemoveReason" != "None" ]; then
      # Start from a clean slate so nothing is inherited from the environment.
      overlimit=
      used_memory=
      used_cputime=
      used_walltime=$WallTime

      # NOTE(review): eval executes text taken from the GRAMI file.  It is
      # kept because the values are presumably stored shell-quoted and eval
      # is what strips the quoting -- confirm that the GRAMI file can only
      # be written by the Grid Manager.
      eval "req_memory=$(sed -n 's/^joboption_memory=//p' "$grami")"
      eval "req_cputime=$(sed -n 's/^joboption_cputime=//p' "$grami")"
      eval "req_walltime=$(sed -n 's/^joboption_walltime=//p' "$grami")"

      if [ -n "$UserTime" ] && [ -n "$KernelTime" ]; then
        used_cputime=$(( UserTime + KernelTime ))
      fi
      if [ -n "$ImageSize" ]; then
        used_memory="$ImageSize"
      fi
      # used_memory is divided by 1024 before being compared with req_memory
      # (logged below as kB and Mb respectively).
      # When several limits are exceeded the last one checked wins.
      if [ -n "$used_memory" ] && [ -n "$req_memory" ] && [ "$req_memory" -gt 0 ] \
      && [ $(( 100*used_memory/1024/req_memory )) -gt 95 ]; then
        overlimit="memory"
      fi
      if [ -n "$used_cputime" ] && [ -n "$req_cputime" ] && [ "$req_cputime" -gt 0 ] \
      && [ $(( 100*used_cputime/req_cputime )) -gt 95 ]; then
        overlimit="cputime"
      fi
      if [ -n "$used_walltime" ] && [ -n "$req_walltime" ] && [ "$req_walltime" -gt 0 ] \
      && [ $(( 100*used_walltime/req_walltime )) -gt 95 ]; then
        overlimit="walltime"
      fi
      { echo "++++++++++++++++++++++++++"
        echo "Resources:"
        echo "++++++++++++++++++++++++++"
        echo "req_memory=$req_memory Mb"
        echo "req_cputime=$req_cputime"
        echo "req_walltime=$req_walltime"
        echo "used_memory=$used_memory kB"
        echo "used_cputime=$used_cputime"
        echo "used_walltime=$used_walltime"
        if [ -n "$overlimit" ]; then
          echo "overlimit=$overlimit"
        fi
        echo "++++++++++++++++++++++++++"
      } >> "$gmlog"

      # An exceeded limit overrides whatever exit code/message was chosen.
      case "$overlimit" in
        memory)
          exitcode=271
          message="job killed: vmem"
          ;;
        cputime)
          exitcode=271
          message="job killed: cput"
          ;;
        walltime)
          exitcode=271
          message="job killed: wall"
          ;;
      esac

    fi

    # "<exitcode>" or "<exitcode> <message>".  Quoted so that glob
    # characters or runs of blanks in Condor's reason text are written
    # verbatim instead of being expanded by the shell.
    printf '%s\n' "$exitcode${message:+ $message}" > "$lrms_done"

    { echo "WallTime=${WallTime}s"
      echo "KernelTime=${KernelTime}s"
      echo "UserTime=${UserTime}s"
      echo "CPUUsage=${CPUUsage}"
      echo "UsedMemory=${ImageSize}kB"
      echo "NodeName=${NodeName}"
      echo
      echo "exitstatus=${ExitStatus}"
      echo "jobstatus=${JobStatus}"
      echo "removereason=${RemoveReason}"
      echo "exitsignal=${ExitSignal}"
      echo "exitreason=${ExitReason}"
      echo
      echo "exitcode=$exitcode"
    } > "${sessiondir}.diag"

else
      # No usable history: report the job as failed.
      echo "No condor_history for Condor ID $condorid" >> "$gmlog"
      echo "271 No condor_history for Condor ID $condorid" > "$lrms_done"
fi

exit 0
