diff --git a/AUTHORS b/AUTHORS
index d05b86f05f..fae7bfb5e5 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -248,6 +248,10 @@ Oleg King
 Ondrej Zajicek
  - madwifi plugin.
 
+Pablo Llopis
+ - Slurm plugin
+ - RestoreAffinityPolicy in turbostat plugin
+
 Patrik Weiskircher
  - Contextswitch plugin.
  - Forkrate counter in the processes plugin.
diff --git a/Makefile.am b/Makefile.am
index 85f8da8a73..3620423f8e 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -1697,6 +1697,14 @@ sigrok_la_LDFLAGS = $(PLUGIN_LDFLAGS)
 sigrok_la_LIBADD = $(LIBSIGROK_LIBS)
 endif
 
+if BUILD_PLUGIN_SLURM
+pkglib_LTLIBRARIES += slurm.la
+slurm_la_SOURCES = src/slurm.c
+slurm_la_CFLAGS = $(AM_CFLAGS) $(BUILD_WITH_LIBSLURM_CFLAGS)
+slurm_la_LDFLAGS = $(PLUGIN_LDFLAGS)
+slurm_la_LIBADD = $(BUILD_WITH_LIBSLURM_LIBS)
+endif
+
 if BUILD_PLUGIN_SMART
 if BUILD_WITH_LIBUDEV
 pkglib_LTLIBRARIES += smart.la
diff --git a/README b/README
index f28d49922f..98890434eb 100644
--- a/README
+++ b/README
@@ -378,6 +378,10 @@ Features
       to have its measurements fed to collectd. This includes multimeters,
       sound level meters, thermometers, and much more.
 
+    - slurm
+      Gathers per-partition node and job state information using libslurm,
+      as well as internal health statistics.
+
     - smart
       Collect SMART statistics, notably load cycle count, temperature
       and bad sectors.
@@ -965,6 +969,10 @@ Prerequisites
     libzip, and optionally (depending on which drivers are enabled) on
     libusb, libftdi and libudev.
 
+  * libslurm (optional)
+    Used by the `slurm` plugin.
+
+
   * libstatgrab (optional)
     Used by various plugins to collect statistics on systems other than
     Linux and/or Solaris.
diff --git a/configure.ac b/configure.ac
index c95422f4e1..443c0bcb5f 100644
--- a/configure.ac
+++ b/configure.ac
@@ -6352,6 +6352,83 @@ AC_DEFUN(
   ]
 )# AC_PLUGIN(name, default, info)
 
+# --with-libslurm {{{
+AC_ARG_WITH([libslurm],
+  [AS_HELP_STRING([--with-libslurm@<:@=PREFIX@:>@], [Path to the libslurm library.])],
+  [
+    if test "x$withval" = "xno"; then
+      with_libslurm="no"
+    else if test "x$withval" = "xyes"; then
+      with_libslurm="use_pkgconfig"
+    else if test -d "$with_libslurm/lib"; then
+      AC_MSG_NOTICE([Not checking for libslurm: Manually configured])
+      with_libslurm_cflags="-I$withval/include"
+      with_libslurm_libs="-L$withval/lib -lslurm"
+      with_libslurm="yes"
+    fi; fi; fi
+  ],
+  [with_libslurm="use_pkgconfig"]
+)
+
+# configure using pkg-config
+if test "x$with_libslurm" = "xuse_pkgconfig"; then
+  AC_MSG_NOTICE([Checking for libslurm using $PKG_CONFIG])
+  $PKG_CONFIG --exists 'slurm' 2>/dev/null
+  if test $? -ne 0; then
+    with_libslurm="no (pkg-config doesn't know libslurm)"
+  fi
+fi
+
+if test "x$with_libslurm" = "xuse_pkgconfig"; then
+  with_libslurm_cflags="`$PKG_CONFIG --cflags 'slurm'`"
+  if test $? -ne 0; then
+    with_libslurm="no ($PKG_CONFIG failed)"
+  fi
+
+  with_libslurm_libs="`$PKG_CONFIG --libs 'slurm'`"
+  if test $? -ne 0; then
+    with_libslurm="no ($PKG_CONFIG failed)"
+  fi
+fi
+
+if test "x$with_libslurm" = "xuse_pkgconfig"; then
+  with_libslurm="yes"
+fi
+
+if test "x$with_libslurm" = "xyes"; then
+  SAVE_CPPFLAGS="$CPPFLAGS"
+  CPPFLAGS="$CPPFLAGS $with_libslurm_cflags"
+
+  AC_CHECK_HEADERS([slurm/slurm.h],
+    [with_libslurm="yes"],
+    [with_libslurm="no (slurm/slurm.h not found)"]
+  )
+
+  CPPFLAGS="$SAVE_CPPFLAGS"
+fi
+
+if test "x$with_libslurm" = "xyes"; then
+  SAVE_LDFLAGS="$LDFLAGS"
+  LDFLAGS="$LDFLAGS $with_libslurm_libs"
+
+  AC_CHECK_LIB([slurm], [slurm_load_jobs],
+    [with_libslurm="yes"],
+    [with_libslurm="no (symbol slurm_load_jobs not found)"]
+  )
+
+  LDFLAGS="$SAVE_LDFLAGS"
+fi
+
+if test "x$with_libslurm" = "xyes"; then
+  BUILD_WITH_LIBSLURM_CFLAGS="$with_libslurm_cflags"
+  BUILD_WITH_LIBSLURM_LIBS="$with_libslurm_libs"
+fi
+
+AC_SUBST([BUILD_WITH_LIBSLURM_CFLAGS])
+AC_SUBST([BUILD_WITH_LIBSLURM_LIBS])
+# }}}
+
+
 m4_divert_once([HELP_ENABLE], [
 collectd features:])
 # FIXME: Remove these calls to `AC_COLLECTD' and then remove that macro.
@@ -6890,6 +6967,7 @@ AC_PLUGIN([rrdtool],             [$with_librrd],            [RRDTool output pl
 AC_PLUGIN([sensors],             [$with_libsensors],        [lm_sensors statistics])
 AC_PLUGIN([serial],              [$plugin_serial],          [serial port traffic])
 AC_PLUGIN([sigrok],              [$with_libsigrok],         [sigrok acquisition sources])
+AC_PLUGIN([slurm],               [$with_libslurm],          [SLURM jobs and nodes status])
 AC_PLUGIN([smart],               [$plugin_smart],           [SMART statistics])
 AC_PLUGIN([snmp],                [$with_libnetsnmp],        [SNMP querying plugin])
 AC_PLUGIN([snmp_agent],          [$with_libnetsnmpagent],   [SNMP agent plugin])
@@ -7184,6 +7262,7 @@ AC_MSG_RESULT([    librrd . . . . . . . $with_librrd])
 AC_MSG_RESULT([    libsensors . . . . . $with_libsensors])
 AC_MSG_RESULT([    libsigrok . . . . .  $with_libsigrok])
 AC_MSG_RESULT([    libssl . . . . . . . $with_libssl])
+AC_MSG_RESULT([    libslurm . . . . . . $with_libslurm])
 AC_MSG_RESULT([    libstatgrab . . . . . $with_libstatgrab])
 AC_MSG_RESULT([    libtokyotyrant . . .  $with_libtokyotyrant])
 AC_MSG_RESULT([    libudev . . . . . . . $with_libudev])
@@ -7316,6 +7395,7 @@ AC_MSG_RESULT([    rrdtool . . . . . . . $enable_rrdtool])
 AC_MSG_RESULT([    sensors . . . . . . . $enable_sensors])
 AC_MSG_RESULT([    serial  . . . . . . . $enable_serial])
 AC_MSG_RESULT([    sigrok  . . . . . . . $enable_sigrok])
+AC_MSG_RESULT([    slurm . . . . . . . . $enable_slurm])
 AC_MSG_RESULT([    smart . . . . . . . . $enable_smart])
 AC_MSG_RESULT([    snmp  . . . . . . . . $enable_snmp])
 AC_MSG_RESULT([    snmp_agent  . . . . . $enable_snmp_agent])
diff --git a/src/collectd.conf.in b/src/collectd.conf.in
index 7f09c5cc2e..ee930e3a2f 100644
--- a/src/collectd.conf.in
+++ b/src/collectd.conf.in
@@ -189,6 +189,7 @@
 #@BUILD_PLUGIN_SENSORS_TRUE@LoadPlugin sensors
 #@BUILD_PLUGIN_SERIAL_TRUE@LoadPlugin serial
 #@BUILD_PLUGIN_SIGROK_TRUE@LoadPlugin sigrok
+#@BUILD_PLUGIN_SLURM_TRUE@LoadPlugin slurm
 #@BUILD_PLUGIN_SMART_TRUE@LoadPlugin smart
 #@BUILD_PLUGIN_SNMP_TRUE@LoadPlugin snmp
 #@BUILD_PLUGIN_SNMP_AGENT_TRUE@LoadPlugin snmp_agent
diff --git a/src/collectd.conf.pod b/src/collectd.conf.pod
index 71931c2bd8..1ae0650fec 100644
--- a/src/collectd.conf.pod
+++ b/src/collectd.conf.pod
@@ -7902,6 +7902,23 @@ measurements are discarded.
 
 =back
 
+=head2 Plugin C<slurm>
+
+This plugin collects per-partition SLURM node and job state information, as
+well as internal health statistics.
+It takes no options. It should run on a node that is capable of running the
+I<sinfo> and I<squeue> commands, i.e. it has a running slurmd and a valid
+slurm.conf.
+Note that this plugin needs the B<Globals> option set to I<true> in order to
+function properly.
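+
+A minimal load block could look like this (a sketch; the plugin itself takes
+no configuration options, so only the B<Globals> flag matters):
+
+  <LoadPlugin slurm>
+    Globals true
+  </LoadPlugin>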
+
 =head2 Plugin C<smart>
 
 The C<smart> plugin collects SMART information from physical
diff --git a/src/slurm.c b/src/slurm.c
new file mode 100644
index 0000000000..72e12a9835
--- /dev/null
+++ b/src/slurm.c
@@ -0,0 +1,610 @@
+/**
+ * collectd - src/slurm.c
+ * Copyright (C) 2018  Pablo Llopis
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; only version 2 of the License is applicable.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors:
+ *   Pablo Llopis
+ **/
+
+#define _DEFAULT_SOURCE
+#define _BSD_SOURCE
+
+#include "collectd.h"
+
+#include "plugin.h"
+#include "utils/common/common.h"
+
+#include <slurm/slurm.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define PLUGIN_NAME "slurm"
+#define PART_NAME_SIZE 128
+
+/* this function declaration is missing in slurm.h */
+extern void slurm_free_stats_response_msg(stats_info_response_msg_t *msg);
+
+enum slurm_node_states {
+  MAINT_NONRESP,
+  MAINT,
+  REBOOT_NONRESP,
+  REBOOT,
+  DRAINING_MAINT,
+  DRAINING_REBOOT,
+  DRAINING_POWERUP,
+  DRAINING_POWERDOWN,
+  DRAINING_NONRESP,
+  DRAINING,
+  DRAINED_MAINT,
+  DRAINED_REBOOT,
+  DRAINED_POWERUP,
+  DRAINED_POWERDOWN,
+  DRAINED_NONRESP,
+  DRAINED,
+  FAILING_NONRESP,
+  FAILING,
+  FAIL_NONRESP,
+  FAIL,
+  CANCEL_REBOOT,
+  POWER_DOWN,
+  POWER_UP,
+  DOWN_MAINT,
+  DOWN_REBOOT,
+  DOWN_POWERUP,
+  DOWN_POWERDOWN,
+  DOWN_NONRESP,
+  DOWN,
+  ALLOCATED_MAINT,
+  ALLOCATED_REBOOT,
+  ALLOCATED_POWERUP,
+  ALLOCATED_POWERDOWN,
+  ALLOCATED_NONRESP,
+  ALLOCATED_COMP,
+  ALLOCATED,
+  COMPLETING_MAINT,
+  COMPLETING_REBOOT,
+  COMPLETING_POWERUP,
+  COMPLETING_POWERDOWN,
+  COMPLETING_NONRESP,
+  COMPLETING,
+  IDLE_MAINT,
+  IDLE_REBOOT,
+  IDLE_POWERUP,
+  IDLE_POWERDOWN,
+  IDLE_NONRESP,
+  PERFCTRS,
+  RESERVED,
+  IDLE,
+  MIXED_MAINT,
+  MIXED_REBOOT,
+  MIXED_POWERUP,
+  MIXED_POWERDOWN,
+  MIXED_NONRESP,
+  MIXED,
+  FUTURE_MAINT,
+  FUTURE_REBOOT,
+  FUTURE_POWERUP,
+  FUTURE_POWERDOWN,
+  FUTURE_NONRESP,
+  FUTURE,
+  RESUME,
+  UNKNOWN_NONRESP,
+  UNKNOWN,
+  UNKNOWN2
+};
+
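+/* Human-readable names for enum slurm_node_states above; the order of this
+ * array must stay in sync with the enum, whose values index into it. */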
"MIXED_REBOOT", + "MIXED_POWERUP", + "MIXED_POWERDOWN", + "MIXED_NONRESP", + "MIXED", + "FUTURE_MAINT", + "FUTURE_REBOOT", + "FUTURE_POWERUP", + "FUTURE_POWERDOWN", + "FUTURE_NONRESP", + "FUTURE", + "RESUME", + "UNKNOWN_NONRESP", + "UNKNOWN", + "?"}; + +/* based on src/common/slurm_protocol_defs.c node_state_string function */ +uint8_t slurm_node_state(uint32_t inx) { + int base = (inx & NODE_STATE_BASE); + bool comp_flag = (inx & NODE_STATE_COMPLETING); + bool drain_flag = (inx & NODE_STATE_DRAIN); + bool fail_flag = (inx & NODE_STATE_FAIL); + bool maint_flag = (inx & NODE_STATE_MAINT); + bool net_flag = (inx & NODE_STATE_NET); + bool reboot_flag = (inx & NODE_STATE_REBOOT); + bool res_flag = (inx & NODE_STATE_RES); + bool resume_flag = (inx & NODE_RESUME); + bool no_resp_flag = (inx & NODE_STATE_NO_RESPOND); + bool power_down_flag = (inx & NODE_STATE_POWER_SAVE); + bool power_up_flag = (inx & NODE_STATE_POWER_UP); + + if (maint_flag) { + if (drain_flag || (base == NODE_STATE_ALLOCATED) || + (base == NODE_STATE_DOWN) || (base == NODE_STATE_MIXED)) + ; + else if (no_resp_flag) + return MAINT_NONRESP; + else + return MAINT; + } + if (reboot_flag) { + if ((base == NODE_STATE_ALLOCATED) || (base == NODE_STATE_MIXED)) + ; + else if (no_resp_flag) + return REBOOT_NONRESP; + else + return REBOOT; + } + if (drain_flag) { + if (comp_flag || (base == NODE_STATE_ALLOCATED) || + (base == NODE_STATE_MIXED)) { + if (maint_flag) + return DRAINING_MAINT; + if (reboot_flag) + return DRAINING_REBOOT; + if (power_up_flag) + return DRAINING_POWERUP; + if (power_down_flag) + return DRAINING_POWERDOWN; + if (no_resp_flag) + return DRAINING_NONRESP; + return DRAINING; + } else { + if (maint_flag) + return DRAINED_MAINT; + if (reboot_flag) + return DRAINED_REBOOT; + if (power_up_flag) + return DRAINED_POWERUP; + if (power_down_flag) + return DRAINED_POWERDOWN; + if (no_resp_flag) + return DRAINED_NONRESP; + return DRAINED; + } + } + if (fail_flag) { + if (comp_flag || (base == NODE_STATE_ALLOCATED)) { + if (no_resp_flag) + return FAILING_NONRESP; + return FAILING; + } else { + if (no_resp_flag) + return FAIL_NONRESP; + return FAIL; + } + } + + if (inx == NODE_STATE_CANCEL_REBOOT) + return CANCEL_REBOOT; + if (inx == NODE_STATE_POWER_SAVE) + return POWER_DOWN; + if (inx == NODE_STATE_POWER_UP) + return POWER_UP; + if (base == NODE_STATE_DOWN) { + if (maint_flag) + return DOWN_MAINT; + if (reboot_flag) + return DOWN_REBOOT; + if (power_up_flag) + return DOWN_POWERUP; + if (power_down_flag) + return DOWN_POWERDOWN; + if (no_resp_flag) + return DOWN_NONRESP; + return DOWN; + } + + if (base == NODE_STATE_ALLOCATED) { + if (maint_flag) + return ALLOCATED_MAINT; + if (reboot_flag) + return ALLOCATED_REBOOT; + if (power_up_flag) + return ALLOCATED_POWERUP; + if (power_down_flag) + return ALLOCATED_POWERDOWN; + if (no_resp_flag) + return ALLOCATED_NONRESP; + if (comp_flag) + return ALLOCATED_COMP; + return ALLOCATED; + } + if (comp_flag) { + if (maint_flag) + return COMPLETING_MAINT; + if (reboot_flag) + return COMPLETING_REBOOT; + if (power_up_flag) + return COMPLETING_POWERUP; + if (power_down_flag) + return COMPLETING_POWERDOWN; + if (no_resp_flag) + return COMPLETING_NONRESP; + return COMPLETING; + } + if (base == NODE_STATE_IDLE) { + if (maint_flag) + return IDLE_MAINT; + if (reboot_flag) + return IDLE_REBOOT; + if (power_up_flag) + return IDLE_POWERUP; + if (power_down_flag) + return IDLE_POWERDOWN; + if (no_resp_flag) + return IDLE_NONRESP; + if (net_flag) + return PERFCTRS; + if (res_flag) + return RESERVED; + 
+/* based on src/common/slurm_protocol_defs.c node_state_string function */
+uint8_t slurm_node_state(uint32_t inx) {
+  int base = (inx & NODE_STATE_BASE);
+  bool comp_flag = (inx & NODE_STATE_COMPLETING);
+  bool drain_flag = (inx & NODE_STATE_DRAIN);
+  bool fail_flag = (inx & NODE_STATE_FAIL);
+  bool maint_flag = (inx & NODE_STATE_MAINT);
+  bool net_flag = (inx & NODE_STATE_NET);
+  bool reboot_flag = (inx & NODE_STATE_REBOOT);
+  bool res_flag = (inx & NODE_STATE_RES);
+  bool resume_flag = (inx & NODE_RESUME);
+  bool no_resp_flag = (inx & NODE_STATE_NO_RESPOND);
+  bool power_down_flag = (inx & NODE_STATE_POWER_SAVE);
+  bool power_up_flag = (inx & NODE_STATE_POWER_UP);
+
+  if (maint_flag) {
+    if (drain_flag || (base == NODE_STATE_ALLOCATED) ||
+        (base == NODE_STATE_DOWN) || (base == NODE_STATE_MIXED))
+      ;
+    else if (no_resp_flag)
+      return MAINT_NONRESP;
+    else
+      return MAINT;
+  }
+  if (reboot_flag) {
+    if ((base == NODE_STATE_ALLOCATED) || (base == NODE_STATE_MIXED))
+      ;
+    else if (no_resp_flag)
+      return REBOOT_NONRESP;
+    else
+      return REBOOT;
+  }
+  if (drain_flag) {
+    if (comp_flag || (base == NODE_STATE_ALLOCATED) ||
+        (base == NODE_STATE_MIXED)) {
+      if (maint_flag)
+        return DRAINING_MAINT;
+      if (reboot_flag)
+        return DRAINING_REBOOT;
+      if (power_up_flag)
+        return DRAINING_POWERUP;
+      if (power_down_flag)
+        return DRAINING_POWERDOWN;
+      if (no_resp_flag)
+        return DRAINING_NONRESP;
+      return DRAINING;
+    } else {
+      if (maint_flag)
+        return DRAINED_MAINT;
+      if (reboot_flag)
+        return DRAINED_REBOOT;
+      if (power_up_flag)
+        return DRAINED_POWERUP;
+      if (power_down_flag)
+        return DRAINED_POWERDOWN;
+      if (no_resp_flag)
+        return DRAINED_NONRESP;
+      return DRAINED;
+    }
+  }
+  if (fail_flag) {
+    if (comp_flag || (base == NODE_STATE_ALLOCATED)) {
+      if (no_resp_flag)
+        return FAILING_NONRESP;
+      return FAILING;
+    } else {
+      if (no_resp_flag)
+        return FAIL_NONRESP;
+      return FAIL;
+    }
+  }
+
+  if (inx == NODE_STATE_CANCEL_REBOOT)
+    return CANCEL_REBOOT;
+  if (inx == NODE_STATE_POWER_SAVE)
+    return POWER_DOWN;
+  if (inx == NODE_STATE_POWER_UP)
+    return POWER_UP;
+  if (base == NODE_STATE_DOWN) {
+    if (maint_flag)
+      return DOWN_MAINT;
+    if (reboot_flag)
+      return DOWN_REBOOT;
+    if (power_up_flag)
+      return DOWN_POWERUP;
+    if (power_down_flag)
+      return DOWN_POWERDOWN;
+    if (no_resp_flag)
+      return DOWN_NONRESP;
+    return DOWN;
+  }
+
+  if (base == NODE_STATE_ALLOCATED) {
+    if (maint_flag)
+      return ALLOCATED_MAINT;
+    if (reboot_flag)
+      return ALLOCATED_REBOOT;
+    if (power_up_flag)
+      return ALLOCATED_POWERUP;
+    if (power_down_flag)
+      return ALLOCATED_POWERDOWN;
+    if (no_resp_flag)
+      return ALLOCATED_NONRESP;
+    if (comp_flag)
+      return ALLOCATED_COMP;
+    return ALLOCATED;
+  }
+  if (comp_flag) {
+    if (maint_flag)
+      return COMPLETING_MAINT;
+    if (reboot_flag)
+      return COMPLETING_REBOOT;
+    if (power_up_flag)
+      return COMPLETING_POWERUP;
+    if (power_down_flag)
+      return COMPLETING_POWERDOWN;
+    if (no_resp_flag)
+      return COMPLETING_NONRESP;
+    return COMPLETING;
+  }
+  if (base == NODE_STATE_IDLE) {
+    if (maint_flag)
+      return IDLE_MAINT;
+    if (reboot_flag)
+      return IDLE_REBOOT;
+    if (power_up_flag)
+      return IDLE_POWERUP;
+    if (power_down_flag)
+      return IDLE_POWERDOWN;
+    if (no_resp_flag)
+      return IDLE_NONRESP;
+    if (net_flag)
+      return PERFCTRS;
+    if (res_flag)
+      return RESERVED;
+    return IDLE;
+  }
+  if (base == NODE_STATE_MIXED) {
+    if (maint_flag)
+      return MIXED_MAINT;
+    if (reboot_flag)
+      return MIXED_REBOOT;
+    if (power_up_flag)
+      return MIXED_POWERUP;
+    if (power_down_flag)
+      return MIXED_POWERDOWN;
+    if (no_resp_flag)
+      return MIXED_NONRESP;
+    return MIXED;
+  }
+  if (base == NODE_STATE_FUTURE) {
+    if (maint_flag)
+      return FUTURE_MAINT;
+    if (reboot_flag)
+      return FUTURE_REBOOT;
+    if (power_up_flag)
+      return FUTURE_POWERUP;
+    if (power_down_flag)
+      return FUTURE_POWERDOWN;
+    if (no_resp_flag)
+      return FUTURE_NONRESP;
+    return FUTURE;
+  }
+  if (resume_flag)
+    return RESUME;
+  if (base == NODE_STATE_UNKNOWN) {
+    if (no_resp_flag)
+      return UNKNOWN_NONRESP;
+    return UNKNOWN;
+  }
+  return UNKNOWN2;
+}
+
+#define NUM_NODE_STATES (sizeof(node_state_names) / sizeof(node_state_names[0]))
+
+typedef struct partition_state_st {
+  char name[PART_NAME_SIZE];
+  uint32_t nodes_states_count[NUM_NODE_STATES];
+  /* counts job states indexed by enum job_states in slurm.h */
+  uint32_t jobs_states_count[JOB_END];
+} partition_state_t;
+
+/* based on enum job_states from slurm.h */
+static const char *job_state_names[] = {
+    "pending",   "running",   "suspended", "complete",  "cancelled", "failed",
+    "timeout",   "node_fail", "preempted", "boot_fail", "deadline",  "oom",
+};
+
+static partition_state_t *alloc_partition_states(uint32_t num_partitions,
+                                                 partition_info_t *partitions) {
+  partition_state_t *partition_states;
+
+  partition_states =
+      (partition_state_t *)calloc(num_partitions, sizeof(partition_state_t));
+  if (!partition_states) {
+    return NULL;
+  }
+
+  for (int i = 0; i < num_partitions; i++)
+    sstrncpy(partition_states[i].name, partitions[i].name, PART_NAME_SIZE);
+
+  return partition_states;
+}
+
+static partition_state_t *find_partition(partition_state_t *partitions,
+                                         uint32_t num_partitions, char *name) {
+  partition_state_t *part = NULL;
+
+  for (int i = 0; i < num_partitions; i++) {
+    if (strncmp(name, partitions[i].name, PART_NAME_SIZE) == 0)
+      part = &partitions[i];
+  }
+
+  return part;
+}
+
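+/* Thin wrappers around plugin_dispatch_values() submitting a single value;
+ * plugin_instance carries the partition or stats-group name, while type and
+ * type_instance select the matching entry added to types.db. */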
+static void slurm_submit_gauge(const char *plugin_instance, const char *type,
+                               const char *type_instance, gauge_t value) {
+  value_list_t vl = VALUE_LIST_INIT;
+
+  vl.values = &(value_t){.gauge = value};
+  vl.values_len = 1;
+  sstrncpy(vl.plugin, PLUGIN_NAME, sizeof(vl.plugin));
+  if (plugin_instance != NULL)
+    sstrncpy(vl.plugin_instance, plugin_instance, sizeof(vl.plugin_instance));
+  sstrncpy(vl.type, type, sizeof(vl.type));
+  if (type_instance != NULL)
+    sstrncpy(vl.type_instance, type_instance, sizeof(vl.type_instance));
+
+  plugin_dispatch_values(&vl);
+}
+
+static void slurm_submit_derive(const char *plugin_instance, const char *type,
+                                const char *type_instance, derive_t value) {
+  value_list_t vl = VALUE_LIST_INIT;
+
+  vl.values = &(value_t){.derive = value};
+  vl.values_len = 1;
+  sstrncpy(vl.plugin, PLUGIN_NAME, sizeof(vl.plugin));
+  if (plugin_instance != NULL)
+    sstrncpy(vl.plugin_instance, plugin_instance, sizeof(vl.plugin_instance));
+  sstrncpy(vl.type, type, sizeof(vl.type));
+  if (type_instance != NULL)
+    sstrncpy(vl.type_instance, type_instance, sizeof(vl.type_instance));
+
+  plugin_dispatch_values(&vl);
+}
+
+static void slurm_submit_partition(partition_state_t *partition) {
+  for (int i = 0; i < JOB_END; i++) {
+    slurm_submit_gauge(partition->name, "slurm_job_state", job_state_names[i],
+                       partition->jobs_states_count[i]);
+  }
+  for (int i = 0; i < NUM_NODE_STATES; i++) {
+    slurm_submit_gauge(partition->name, "slurm_node_state",
+                       node_state_names[i], partition->nodes_states_count[i]);
+  }
+}
+
+static void slurm_submit_stats(stats_info_response_msg_t *stats_resp) {
+  // slurm load stats
+  slurm_submit_gauge("slurm_load_stats", "threads", "server_thread_count",
+                     stats_resp->server_thread_count);
+  slurm_submit_gauge("slurm_load_stats", "threads", "agent_thread_count",
+                     stats_resp->agent_count);
+  slurm_submit_gauge("slurm_load_stats", "queue_length", "agent_queue_size",
+                     stats_resp->agent_queue_size);
+  slurm_submit_gauge("slurm_load_stats", "queue_length", "dbd_agent_queue_size",
+                     stats_resp->dbd_agent_queue_size);
+
+  // slurm scheduler stats
+  slurm_submit_derive("slurm_sched_stats", "slurm_cycles", "schedule_cycles",
+                      stats_resp->schedule_cycle_counter);
+  slurm_submit_gauge("slurm_sched_stats", "slurm_cycle_last",
+                     "schedule_cycle_last", stats_resp->schedule_cycle_last);
+  slurm_submit_derive("slurm_sched_stats", "slurm_cycle_duration",
+                      "schedule_cycle_duration",
+                      stats_resp->schedule_cycle_sum);
+  slurm_submit_derive("slurm_sched_stats", "slurm_cycle_depth",
+                      "schedule_cycle_depth", stats_resp->schedule_cycle_depth);
+  slurm_submit_gauge("slurm_sched_stats", "queue_length",
+                     "schedule_queue_length", stats_resp->schedule_queue_len);
+
+  // slurm job stats
+  slurm_submit_derive("slurm_jobs_stats", "slurm_job_stats", "submitted",
+                      stats_resp->jobs_submitted);
+  slurm_submit_derive("slurm_jobs_stats", "slurm_job_stats", "started",
+                      stats_resp->jobs_started);
+  slurm_submit_derive("slurm_jobs_stats", "slurm_job_stats", "completed",
+                      stats_resp->jobs_completed);
+  slurm_submit_derive("slurm_jobs_stats", "slurm_job_stats", "canceled",
+                      stats_resp->jobs_canceled);
+  slurm_submit_derive("slurm_jobs_stats", "slurm_job_stats", "failed",
+                      stats_resp->jobs_failed);
+
+  // slurm backfill stats
+  slurm_submit_derive("slurm_backfill_stats", "slurm_backfilled_jobs",
+                      "backfilled_jobs", stats_resp->bf_backfilled_jobs);
+  slurm_submit_derive("slurm_backfill_stats", "slurm_backfilled_jobs",
+                      "backfilled_pack_jobs",
+                      stats_resp->bf_backfilled_pack_jobs);
+  slurm_submit_derive("slurm_backfill_stats", "slurm_cycles", "backfill_cycles",
+                      stats_resp->bf_cycle_counter);
+  slurm_submit_gauge("slurm_backfill_stats", "slurm_cycle_last",
+                     "last_backfill_cycle", stats_resp->bf_cycle_last);
+  slurm_submit_derive("slurm_backfill_stats", "slurm_cycle_duration",
+                      "backfill_cycle_duration", stats_resp->bf_cycle_sum);
+  slurm_submit_gauge("slurm_backfill_stats", "slurm_last_cycle_depth",
+                     "backfill_last_cycle_depth", stats_resp->bf_last_depth);
+  slurm_submit_gauge("slurm_backfill_stats", "slurm_last_cycle_depth",
+                     "backfill_last_cycle_depth_try",
+                     stats_resp->bf_last_depth_try);
+  slurm_submit_derive("slurm_backfill_stats", "slurm_cycle_depth",
+                      "backfill_cycle_depth", stats_resp->bf_depth_sum);
+  slurm_submit_derive("slurm_backfill_stats", "slurm_cycle_depth",
+                      "backfill_cycle_depth_try", stats_resp->bf_depth_try_sum);
+  slurm_submit_gauge("slurm_backfill_stats", "queue_length",
+                     "backfill_last_queue_length", stats_resp->bf_queue_len);
+  slurm_submit_derive("slurm_backfill_stats", "slurm_queue_length",
+                      "backfill_queue_length", stats_resp->bf_queue_len_sum);
+}
+
+static int slurm_read(void) {
+  job_info_msg_t *job_buffer_ptr = NULL;
+  job_info_t *job_ptr;
+  partition_info_msg_t *part_buffer_ptr = NULL;
+  partition_info_t *part_ptr;
+  partition_state_t *partition_states;
+  partition_state_t *partition_state;
+  node_info_msg_t *node_buffer_ptr = NULL;
+  node_info_t *node_ptr;
+  stats_info_response_msg_t *stats_resp;
+  stats_info_request_msg_t stats_req;
+
+  if (slurm_load_jobs((time_t)NULL, &job_buffer_ptr, SHOW_ALL)) {
+    ERROR(PLUGIN_NAME ": slurm_load_jobs error");
+    return -1;
+  }
+
+  if (slurm_load_node((time_t)NULL, &node_buffer_ptr, SHOW_ALL)) {
+    slurm_free_job_info_msg(job_buffer_ptr);
+    ERROR(PLUGIN_NAME ": slurm_load_node error");
+    return -1;
+  }
+
+  if (slurm_load_partitions((time_t)NULL, &part_buffer_ptr, 0)) {
+    slurm_free_job_info_msg(job_buffer_ptr);
+    slurm_free_node_info_msg(node_buffer_ptr);
+    ERROR(PLUGIN_NAME ": slurm_load_partitions error");
+    return -1;
+  }
+
+  stats_req.command_id = STAT_COMMAND_GET;
+  if (slurm_get_statistics(&stats_resp, &stats_req)) {
+    slurm_free_job_info_msg(job_buffer_ptr);
+    slurm_free_node_info_msg(node_buffer_ptr);
+    slurm_free_partition_info_msg(part_buffer_ptr);
+    ERROR(PLUGIN_NAME ": slurm_get_statistics error");
+    return -1;
+  }
+
+  /* SLURM APIs provide *non-relational* data about nodes, partitions and
+   * jobs. We allocate a data structure that relates all three, an array of
+   * partition_state_t holding per-partition job and node state counts, and
+   * the following two loops fill it in. */
+  uint32_t num_partitions = part_buffer_ptr->record_count;
+  partition_states =
+      alloc_partition_states(num_partitions, part_buffer_ptr->partition_array);
+  if (!partition_states) {
+    slurm_free_job_info_msg(job_buffer_ptr);
+    slurm_free_node_info_msg(node_buffer_ptr);
+    slurm_free_partition_info_msg(part_buffer_ptr);
+    ERROR(PLUGIN_NAME ": alloc_partition_states failed");
+    return -1;
+  }
+
+  /* fill partition_states array with per-partition job state information */
+  for (int i = 0; i < job_buffer_ptr->record_count; i++) {
+    job_ptr = &job_buffer_ptr->job_array[i];
+    partition_state =
+        find_partition(partition_states, num_partitions, job_ptr->partition);
+    if (!partition_state) {
+      ERROR(PLUGIN_NAME ": slurm_read: cannot find partition %s from jobid %d"
+                        " in partition list returned by slurm_load_partitions",
+            job_ptr->partition, job_ptr->job_id);
+      continue;
+    }
+
+    uint8_t job_state = job_ptr->job_state & JOB_STATE_BASE;
+    partition_state->jobs_states_count[job_state]++;
+  }
+
+  /* fill partition_states array with per-partition node state information */
+  for (int i = 0; i < part_buffer_ptr->record_count; i++) {
+    part_ptr = &part_buffer_ptr->partition_array[i];
+
+    partition_state =
+        find_partition(partition_states, num_partitions, part_ptr->name);
+    if (!partition_state) {
+      ERROR(PLUGIN_NAME ": slurm_read: cannot find partition %s"
+                        " in partition list returned by slurm_load_partitions",
+            part_ptr->name);
+      continue;
+    }
+
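+    /* node_inx holds (start, end) pairs of indexes into the node table,
+     * terminated by -1; walk each pair to count this partition's nodes. */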
+    for (int j = 0; part_ptr->node_inx; j += 2) {
+      if (part_ptr->node_inx[j] == -1)
+        break;
+      for (int k = part_ptr->node_inx[j]; k <= part_ptr->node_inx[j + 1];
+           k++) {
+        node_ptr = &node_buffer_ptr->node_array[k];
+        /* some non-existent nodes (name is NULL) may show up as node_state
+         * FUTURE */
+        uint8_t node_state = slurm_node_state(node_ptr->node_state);
+        partition_state->nodes_states_count[node_state]++;
+      }
+    }
+  }
+
+  for (int i = 0; i < num_partitions; i++)
+    slurm_submit_partition(&partition_states[i]);
+
+  slurm_submit_stats(stats_resp);
+
+  slurm_free_job_info_msg(job_buffer_ptr);
+  slurm_free_node_info_msg(node_buffer_ptr);
+  slurm_free_partition_info_msg(part_buffer_ptr);
+  slurm_free_stats_response_msg(stats_resp);
+  free(partition_states);
+  return 0;
+}
+
+void module_register(void) { plugin_register_read("slurm", slurm_read); }
diff --git a/src/types.db b/src/types.db
index 69f59b0659..efbfb7f453 100644
--- a/src/types.db
+++ b/src/types.db
@@ -231,6 +231,16 @@ serial_octets           rx:DERIVE:0:U, tx:DERIVE:0:U
 signal_noise            value:GAUGE:U:0
 signal_power            value:GAUGE:U:0
 signal_quality          value:GAUGE:0:U
+slurm_job_state         value:GAUGE:0:U
+slurm_node_state        value:GAUGE:0:U
+slurm_backfilled_jobs   value:DERIVE:0:U
+slurm_cycles            value:DERIVE:0:U
+slurm_cycle_last        value:GAUGE:0:U
+slurm_cycle_duration    value:DERIVE:0:U
+slurm_last_cycle_depth  value:GAUGE:0:U
+slurm_cycle_depth       value:DERIVE:0:U
+slurm_job_stats         value:DERIVE:0:U
+slurm_queue_length      value:DERIVE:0:U
 smart_attribute         current:GAUGE:0:255, worst:GAUGE:0:255, threshold:GAUGE:0:255, pretty:GAUGE:0:U
 smart_badsectors        value:GAUGE:0:U
 smart_powercycles       value:GAUGE:0:U