#!/bin/bash
#
# NOTE: This script uses bash-only syntax ([[ ... ]] tests and (( ... ))
# arithmetic), so it has to be run by bash.  A strictly POSIX /bin/sh such as
# dash cannot execute it.

# Name under which this script was invoked; used as a prefix in every message
# it writes.
progname=$(basename "$0")

# This script stands in for /bin/mail, so it gets invoked like this:
#   /bin/mail -s '[Condor] Condor Job <job-ID>' <email-address>
# The job-ID is therefore taken from the subject string, i.e. argument 2.
#
# NOTE: Unlike the job log, the wording of this email is not guaranteed to
# stay the same in future versions of Condor.  We depend on it anyway, because
# the job id is needed to locate the log file. :-(
mail_subject=$2

# Keep whatever follows the last "Condor Job ", cut that at its first dot,
# and append ".condor".
condor_jobid=${mail_subject##*Condor Job }
lrmsid=${condor_jobid%%.*}.condor

# Locate the ARC configuration file.  The candidates are tried in order and
# the first readable one is kept in ARC_CONFIG:
#   1. $ARC_CONFIG as inherited from the environment
#   2. /etc/arc.conf
#   3. $NORDUGRID_CONFIG    } backwards compatibility: arc.conf used to be
#   4. /etc/nordugrid.conf  } called nordugrid.conf
config_found=
for candidate in "$ARC_CONFIG" /etc/arc.conf \
                 "$NORDUGRID_CONFIG" /etc/nordugrid.conf; do
    if [[ -r $candidate ]]; then
        ARC_CONFIG=$candidate
        config_found=yes
        break
    fi
done

# Without a configuration file there is no way to find the control directory.
if [[ -z $config_found ]]; then
    echo "$progname: Couldn't find arc.conf!" >&2
    exit 1
fi

# Set variable "controldir" from GM config: every line of the form
# "controldir=..." (optionally indented) is executed as a shell assignment.
# NOTE(review): eval runs the matched config text as shell code, so this
# trusts the contents of $ARC_CONFIG completely.
controldir_assignment=$(grep '^[[:blank:]]*controldir=' "$ARC_CONFIG")
eval "$controldir_assignment"

# Find the proper GRAMI file: the job.*.grami file in the control directory
# that contains the line "joboption_jobid=<lrmsid>".
#
# The job id is derived from the mail subject, so it is matched as a fixed
# string against whole lines (-F -x) rather than being spliced into a regular
# expression, where characters such as "." or "[" would be interpreted as
# regex syntax.
grami=$(grep -l -x -F "joboption_jobid=$lrmsid" "$controldir"/job.*.grami)

# grep -l prints one file name per matching file; unless that output names
# exactly one existing regular file, give up.
if [[ ! -f $grami ]]; then
    echo "$progname: No GRAMI file for job $lrmsid could be found." >&2
    exit 1
fi

# Logfile used by Grid Manager for this job: it lives in the control
# directory and is named like the GRAMI file, with ".errors" in place of the
# ".grami" suffix.
grami_stem=$(basename "$grami" .grami)
gmlog=$controldir/$grami_stem.errors

# IMPORTANT: Never change the format of this line!
# LRMS_Condor.pm uses it to delimit job info.
printf '%s\n' "----- starting $progname -----" >>"$gmlog"

# Find the Condor log: its path is the value of the "condor_log=" line in the
# GRAMI file.
condor_log=$(sed -n 's/^condor_log=//p' "$grami")

# Use /dev/null if we couldn't find the log.  Should never happen.
# Both messages below are appended to the Grid Manager log.
[[ -f $condor_log ]] || {
    echo "$progname: couldn't find Condor log file ($condor_log)"
    echo "$progname: using /dev/null as log file"
    condor_log=/dev/null
} >>"$gmlog" 2>&1

# Save the mail body (stdin) in a temporary file so that it can be read more
# than once: it is dumped into gmlog right below, and parsed again further
# down if the Condor log yields no exit code.
#
# The file is created with mktemp, i.e. atomically and with an unpredictable
# name.  A fixed /tmp/mailbody.<pid> name could be guessed and pre-created
# (for instance as a symlink) by another local user.
if mbody=$(mktemp /tmp/mailbody.XXXXXX); then
    # Remove the temporary file on every exit path.
    trap 'rm -f -- "$mbody"' EXIT
else
    # Carry on without the message; the Condor log is consulted first for
    # the exit code anyway.  No cleanup trap is installed in this case, since
    # /dev/null must never be removed.
    echo "$progname: mktemp failed; mail body will be discarded" >>"$gmlog"
    mbody=/dev/null
fi

# Dump mail body and Condor log into gmlog, every line prefixed with the
# program name.
{
    cat >"$mbody" || echo "$progname: failed to write $mbody"
    echo "$progname: ----- begin condor job completion message -----"
    sed "s/^/$progname: /" "$mbody"
    echo "$progname: ----- end condor job completion message -----"
    echo "$progname: ----- begin condor log ($condor_log) -----"
    sed "s/^/$progname: /" "$condor_log"
    echo "$progname: ----- end condor log ($condor_log) -----"
} >>"$gmlog" 2>&1

# Print the job's exit code as recorded in the Condor log file $1.
#
# The first "Normal termination" line supplies the return value.  If that
# gives nothing, the first "Abnormal termination" line supplies a signal
# number, which is reported as 128 + signal.
#
# Arguments: $1 - path of the Condor log
# Outputs:   the exit code on stdout; nothing if none was found
# Returns:   0 if an exit code was printed, 1 otherwise
condor_log_exitcode() {
    local log=$1 code

    code=$(sed -n '/Normal termination/{s/.*value \([0-9]*\).*/\1/p;q;}' \
            "$log")

    if [[ -z $code ]]; then
        code=$(sed -n \
                '/Abnormal termination/{s/.*signal \([0-9]*\).*/\1/p;q;}' \
                "$log")
        [[ $code ]] && ((code = code + 128))
    fi

    [[ $code ]] || return 1
    printf '%s\n' "$code"
}

# Like every other progress message, this one belongs in the Grid Manager log.
echo "$progname: searching for exit code in $condor_log" >>"$gmlog"
failure_reason=
i=0
maxtries=4

# Try up to $maxtries times to extract the program exit code from the Condor
# log.  On success the loop is left early with i < maxtries; if every attempt
# fails, i == maxtries afterwards and exitcode is empty.
while ((i < maxtries)); do
    exitcode=$(condor_log_exitcode "$condor_log")

    if [[ $exitcode ]]; then
        break
    fi

    ((i = i + 1))
    echo "$progname: failed to get exit code (attempt $i/$maxtries)"

    # Pause before the next attempt (presumably the log may not be completely
    # written yet).  There is nothing to wait for after the last attempt.
    if ((i < maxtries)); then
        sleep 15
    fi
done >>"$gmlog" 2>&1

# All $maxtries attempts on the Condor log were used up without finding an
# exit code, so fall back to the job completion message saved in $mbody.
# A "has exited ... with status N" line gives N; otherwise a
# "has died on signal N" line gives 128 + N.
if ((i == maxtries)); then
    echo "$progname: giving up on log; trying condor job completion message"

    status_expr='s/.*has exited.*with status \([0-9]*\).*/\1/p'
    signal_expr='s/.*has died on signal \([0-9]*\).*/\1/p'

    exitcode=$(sed -n "$status_expr" "$mbody")
    if [[ ! $exitcode ]]; then
        exitcode=$(sed -n "$signal_expr" "$mbody")
        if [[ -n $exitcode ]]; then
            ((exitcode = exitcode + 128))
        fi
    fi
fi >>"$gmlog" 2>&1

# Settle on the final exit code and failure reason; all messages go to gmlog.
if [[ -z $exitcode ]]; then
    # Neither source produced an exit code: report a generic failure.
    echo "$progname: failed to get exit code; using 255 as fallback"
    failure_reason='Failed to get exit code of job'
    exitcode=255
else
    echo "$progname: job $lrmsid finished with status $exitcode"

    # The session directory is read from the "sessiondir=" line of the job's
    # .local file; if it is known, the exit code is appended to the
    # <sessiondir>.diag file.
    job_local=$controldir/$(basename "$grami" .grami).local
    sessiondir=$(sed -n 's/^sessiondir=//p' "$job_local")
    [[ -z $sessiondir ]] || echo "exitcode=$exitcode" >>"$sessiondir.diag"

    # failure_reason stays empty only for exit code 0.
    [[ $exitcode == 0 ]] || failure_reason='Job finished with non-zero exit code'
fi >>"$gmlog" 2>&1

# Signal that we're done by writing "<exit code> <failure reason>" to
# job.ID.lrms_done in the control directory.
lrms_done=$controldir/$(basename "$grami" .grami).lrms_done
printf '%s\n' "$exitcode $failure_reason" >"$lrms_done"

# Closing marker in the Grid Manager log.
printf '%s\n' "----- exiting $progname -----" >>"$gmlog"
exit 0
