This configuration is not meant to be used as is. You will need to uncomment and edit the settings as needed.
# Grid Proxy Certificate and VOMS Attributes
# ==========================================
[gridproxy]
# The default VOMS to use. You can override this for specific probes by
# setting "voms" under the corresponding section.
#default_voms = ops
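# For example (the VO name below is illustrative), the job-submission checks
# could use a different VO by adding the following to the [arcce] section:
#
#   [arcce]
#   voms = dteam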
# Alternative 1: Use an externally generated proxy certificate. You can either
# export X509_USER_PROXY or point to it with
#user_proxy = /var/cache/nagios/gridproxy.pem
# Alternative 2: Let the probe generate a proxy certificate on demand from
# a robot certificate.
#user_cert = /etc/grid-security/robotcert.pem
#user_key = /etc/grid-security/robotkey.pem
# Checking Storage Elements
# =========================
[gridstorage]
# Base directory for temporary files and runtime state information.
#plugins_spooldir = /var/spool/nagios/plugins
# The ARC commands will store some files under $HOME/.arc/. Since the home
# directory may not be set to something usable, set an appropriate value here
# to instruct the Nagios plugins to override $HOME at startup.
#home_dir = /var/spool/nagios
# The log-level to use for this probe. The valid values, in order of
# decreasing verbosity, are DEBUG, INFO, WARNING, ERROR, CRITICAL, and FATAL.
#loglevel = WARNING
# Checking Compute Elements: Information System
# =============================================
[arcinfosys]
# Same as for [gridstorage].
#plugins_spooldir = /var/spool/nagios/plugins
#home_dir = /var/spool/nagios
# The log-level for this probe as described under [gridstorage].
# It may be useful to set this to INFO.
#loglevel = WARNING
# The glue2 entry point
# ---------------------
#
# These are also provided as command-line options.
# Use this GLUE2 schema instead of querying the CE.
#glue2_schema =
# Warn if there are no objects of these classes.
#warn_if_missing = GLUE2AdminDomain,GLUE2Service,GLUE2Endpoint
# Report critical status if there are no objects of these classes.
#critical_if_missing =
# A comma-separated list of foreign key attribute types which should be
# reflected in the DIT.
#hierarchical_foreign_keys =
# Require that all foreign keys which represent aggregation or composition
# are reflected in the DIT.
#hierarchical_aggregates =
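# For example (reusing the classes named above), missing core objects could
# be escalated to a critical status:
#
#   critical_if_missing = GLUE2AdminDomain,GLUE2Service
#   warn_if_missing = GLUE2Endpoint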
# The aris and egiis entry points
# -------------------------------
#
# Use the command-line options.
# Example tests for the aris entry point
# --------------------------------------
# Usage: --cluster-test=cache_free
[arcinfosys.aris.cache_free]
type = limit
value = float(cache_free)/cache_total
critical.min = 0.01
warning.min = 0.1
# Usage: --cluster-test=topphys
[arcinfosys.aris.topphys]
type = regex
variable = runtimeenvironment
critical.pattern = APPS/HEP/ATLAS-TOPPHYS
critical.message = Missing TOPPHYS.
# Usage: --queue-test=queue-active
[arcinfosys.aris.queue-active]
type = regex
variable = status
critical.pattern = ^active$
critical.message = Inactive queue
# Checking Compute Elements: Job Submission
# =========================================
[arcce]
# Same as for [gridstorage].
#plugins_spooldir = /var/spool/nagios/plugins
#home_dir = /var/spool/nagios
# The log-level for this probe as described under [gridstorage].
#loglevel = WARNING
[arcce.connection_urls]
# This section can be used to force specific flavours and connection URLs for
# individual CEs. Each line takes the form
#
# ce.example.org = FLAVOUR:URL
#
# where the right hand side corresponds to the -c argument of arcsub(1).
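# For example (the flavour and URL shown are illustrative; see arcsub(1) for
# the accepted values):
#
#   ce-00.example.org = ARC1:https://ce-00.example.org:443/arex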
# Example Scripted Job Tests
# --------------------------
#
# These checks are enabled by passing "--test NAME" to the submit command,
# where NAME is the section name without the "arcce." prefix. They inject
# pieces of shell code into the remote job script and check the output using
# regular expression patterns.
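# For example, "check_arcce -H ce.example.org submit --test python" enables
# the [arcce.python] check below; the result is reported passively to the
# service named by its service_description, here "ARCCE Python version".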
[arcce.python]
jobplugin = scripted
required_programs = python
script_line = python -V >python.out 2>&1
output_file = python.out
output_pattern = Python\s+(?P<version>\S+)
status_ok = Found Python version %(version)s.
status_critical = Python version not found in output.
service_description = ARCCE Python version
[arcce.perl]
jobplugin = scripted
required_programs = perl
script_line = perl -v >perl.out 2>&1
output_file = perl.out
output_pattern = This is perl, v(?P<version>\S+)
status_ok = Found Perl version %(version)s.
status_critical = Perl version not found in output.
service_description = ARCCE Perl version
[arcce.gcc]
jobplugin = scripted
required_programs = gcc
script_line = gcc -v >gcc.out 2>&1
output_file = gcc.out
output_pattern = gcc version (?P<version>\S+)
status_ok = Found GCC version %(version)s.
status_critical = GCC version not found in output.
service_description = ARCCE GCC version
[arcce.csh]
jobplugin = scripted
required_programs = csh
script_line = echo >csh-test.csh '#! /bin/csh'; echo >>csh-test.csh 'env >csh.out'; chmod +x csh-test.csh; ./csh-test.csh
output_file = csh.out
output_pattern = ^PATH=
status_ok = Found working csh.
status_critical = Did not find $PATH in csh environment.
service_description = ARCCE csh usability
# Example Storage Job Checks
# --------------------------
#
# These checks are also enabled by passing the second component of the
# section name to the --test option. This adds the specified staging to
# the job description. Input files must exist in advance. Output files
# will be removed after checking that they exist.
[arcce.stage_srm]
jobplugin = staging
staged_inputs = srm://srm.example.org/somedir/testfile
staged_outputs = srm://srm.example.org/somedir/srm-%(hostname)s-%(epoch_time)s
service_description = ARCCE SRM Result
[arcce.stage_gridftp]
jobplugin = staging
staged_inputs = gsiftp://srm.example.org/somedir/testfile
staged_outputs = gsiftp://srm.example.org/somedir/gsiftp-%(hostname)s-%(epoch_time)s
service_description = ARCCE GridFTP Result
[arcce.stage_lfc]
jobplugin = staging
staged_inputs = lfc://lfc.example.org/lfcdir/testfile-lfc
staged_outputs = lfc://srm://srm.example.org/somedir/lfc-%(hostname)s-%(epoch_time)s@lfc.example.org/lfcdir/lfc-%(hostname)s-%(epoch_time)s
service_description = ARCCE LFC Result
This configuration is not meant to be used as is. It is an example which illustrates how to use the entry points of the check_arcce probe and how to define the associated passive services. The other probes are omitted here, since they are configured as independent services, much like commonly available Nagios probes.
# --------------------------------------------------------------------------
# This is an example Nagios configuration for the ARC-CE probes, meant for
# documentation purposes. It cannot be used as-is.
# --------------------------------------------------------------------------
# Contacts and Contact Groups
# ===========================
# You probably already have contacts defined in your Nagios configuration, so
# you can skip these and substitute your own below.
define contactgroup {
contactgroup_name nagios-operators
members jdoe
}
define contact {
use generic-contact
contact_name jdoe
email jdoe@example.org
}
# Commands Definitions
# ====================
# This is a dummy command for passive services. You may already have something
# like it in your Nagios configuration.
define command {
command_name check_passive
command_line /bin/true
}
# This command monitors running jobs and collects those which have terminated,
# reporting passive results.
define command {
command_name check_arcce_monitor
command_line $USER1$/check_arcce -H $HOSTNAME$ monitor
}
# A job submission check including sub-tests which are defined in the plugin
# configuration in separate sections. The results of the sub-tests are
# reported passively to the services named in the same configuration.
define command {
command_name check_arcce_submit
command_line $USER1$/check_arcce \
-H $HOSTNAME$ submit \
--test python --test perl --test csh --test gcc
}
# A job submission check with staging. The files given to the --stage-input
# options must already exist. The files given to the --stage-output options
# will be overwritten and deleted on termination. This command is not used
# below. To use it, add an active service and a passive service named
# "ARCCE SRM Job Termination", as sketched after this command definition.
define command {
command_name check_arcce_submit_staging
# Passed explicitly:
# command_line $USER1$/check_arcce \
# -H $HOSTNAME$ submit --job-tag srm \
# --termination-service 'ARCCE SRM Job Termination' \
# --stage-input srm.txt=srm://srm.example.org/nagios/readable.txt \
# --stage-output srm://srm.example.org/nagios/srm-$HOSTNAME$-$TIMET$.txt \
# Using a predefined job-test:
# command_line $USER1$/check_arcce \
# -H $HOSTNAME$ submit --job-tag srm \
# --termination-service 'ARCCE SRM Job Termination' \
# --test stage_srm
}
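# A sketch of the two services mentioned above for check_arcce_submit_staging.
# The passive service must be named "ARCCE SRM Job Termination" to match the
# --termination-service option; the other names are illustrative. If the
# predefined stage_srm test is used, a passive "ARCCE SRM Result" service is
# needed as well.
#define service {
# use arcce-submission-service
# service_description ARCCE SRM Job Submission
# hostgroup_name arcce-hosts
# check_command check_arcce_submit_staging
#}
#define service {
# use arcce-passive-service
# service_description ARCCE SRM Job Termination
# hostgroup_name arcce-hosts
#}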
# Host Groups and Host Templates
# ==============================
# You need one host definition to which the monitoring service is assigned.
# This is typically the Nagios host itself, for which you probably already
# have a definition.
define host {
name nagios-host
use generic-host
max_check_attempts 10
contact_groups nagios-operators
register 0
}
# The following host group and template will be used for all CEs.
define hostgroup {
hostgroup_name arcce-hosts
alias ARCCE Hosts
}
define host {
name arcce-host
use generic-host
max_check_attempts 10
contact_groups nagios-operators
hostgroups arcce-hosts
register 0
}
# Service Groups and Service Templates
# ====================================
define servicegroup {
servicegroup_name arcce-services
alias ARCCE Services
}
define service {
name arcce-service
use generic-service
servicegroups arcce-services
check_period 24x7
max_check_attempts 3
flap_detection_enabled 0
contact_groups nagios-operators
notifications_enabled 0
register 0
}
define service {
name arcce-monitoring-service
use arcce-service
normal_check_interval 5
retry_check_interval 5
register 0
}
define service {
name arcce-submission-service
use arcce-service
normal_check_interval 30
retry_check_interval 30
register 0
}
define service {
name arcce-passive-service
use arcce-service
active_checks_enabled 0
passive_checks_enabled 1
check_command check_passive
register 0
}
define service {
use arcce-monitoring-service
host_name localhost
service_description ARCCE Monitoring
check_command check_arcce_monitor
}
# For each ARC CE, we need one active service for submission and a number of
# passive services to collect the results. In the following we associate the
# per-CE services with the "arcce-hosts" host group, which adds them to all
# members of the group.
define service {
use arcce-submission-service
service_description ARCCE Job Submission
hostgroup_name arcce-hosts
check_command check_arcce_submit
}
define service {
use arcce-passive-service
service_description ARCCE Job Termination
hostgroup_name arcce-hosts
}
define service {
use arcce-passive-service
service_description ARCCE Python version
hostgroup_name arcce-hosts
}
define service {
use arcce-passive-service
service_description ARCCE Perl version
hostgroup_name arcce-hosts
}
define service {
use arcce-passive-service
service_description ARCCE GCC version
hostgroup_name arcce-hosts
}
define service {
use arcce-passive-service
service_description ARCCE csh usability
hostgroup_name arcce-hosts
}
# Hosts
# =====
# This provides the monitoring service.
define host {
use nagios-host
host_name localhost
}
# Any host which uses the arcce-host template will get an active submission
# service and all the related passive services.
#define host {
# use arcce-host
# host_name ce-00.example.org
#}