First Invasive check through external K8s Job - dcgmi by cmisale · Pull Request #8 · IBM/autopilot · GitHub
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content

First Invasive check through external K8s Job - dcgmi #8

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Mar 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,5 @@ image-build:

image-push:
@docker push ${IMAGE}:v${TAG}

all: image-build image-push
3 changes: 2 additions & 1 deletion autopilot-daemon/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ COPY gpu-mem/gpucheck.cu .

RUN nvcc -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90 gpucheck.cu -o gpucheck -lcublas --linker-options -lnvidia-ml -O3

FROM golang:1.19 AS gobuild
FROM golang:1.21 AS gobuild

ENV GOOS=linux
ENV GOARCH=amd64
Expand Down Expand Up @@ -98,4 +98,5 @@ COPY gpu-power/power-throttle.sh /home/autopilot/gpu-power/power-throttle.sh
RUN pip install --upgrade pip && pip install kubernetes netifaces aiohttp[speedups]
RUN apt -y update && apt install -y vim && apt -y clean && apt -y autoremove
RUN chmod 755 /usr/local/bin/autopilot && chown -hR autopilot /home/autopilot && chmod -R g=u /home/autopilot
RUN chmod 777 /tmp
CMD ["/usr/local/bin/autopilot"]
46 changes: 41 additions & 5 deletions autopilot-daemon/go.mod
Original file line number Diff line number Diff line change
@@ -1,21 +1,57 @@
module github.com/IBM/autopilot

go 1.19
go 1.21

toolchain go1.21.1

require (
github.com/prometheus/client_golang v1.15.0
k8s.io/klog/v2 v2.100.1
github.com/thanhpk/randstr v1.0.6
k8s.io/api v0.29.2
k8s.io/apimachinery v0.29.2
k8s.io/client-go v0.29.2
k8s.io/klog/v2 v2.110.1
k8s.io/kubectl v0.29.2
)

require (
github.com/beorn7/perks v1.0.1 // indirect
github.com/cespare/xxhash/v2 v2.2.0 // indirect
github.com/go-logr/logr v1.2.0 // indirect
github.com/golang/protobuf v1.5.3 // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/emicklei/go-restful/v3 v3.11.0 // indirect
github.com/go-logr/logr v1.3.0 // indirect
github.com/go-openapi/jsonpointer v0.19.6 // indirect
github.com/go-openapi/jsonreference v0.20.2 // indirect
github.com/go-openapi/swag v0.22.3 // indirect
github.com/gogo/protobuf v1.3.2 // indirect
github.com/google/gnostic-models v0.6.8 // indirect
github.com/google/gofuzz v1.2.0 // indirect
github.com/google/uuid v1.3.0 // indirect
github.com/josharian/intern v1.0.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/mailru/easyjson v0.7.7 // indirect
github.com/matttproud/golang_protobuf_extensions v1.0.4 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/prometheus/client_model v0.3.0 // indirect
github.com/prometheus/common v0.42.0 // indirect
github.com/prometheus/procfs v0.9.0 // indirect
golang.org/x/sys v0.6.0 // indirect
golang.org/x/net v0.19.0 // indirect
golang.org/x/oauth2 v0.10.0 // indirect
golang.org/x/sys v0.15.0 // indirect
golang.org/x/term v0.15.0 // indirect
golang.org/x/text v0.14.0 // indirect
golang.org/x/time v0.3.0 // indirect
google.golang.org/appengine v1.6.7 // indirect
google.golang.org/protobuf v1.31.0 // indirect
gopkg.in/inf.v0 v0.9.1 // indirect
gopkg.in/yaml.v2 v2.4.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
k8s.io/kube-openapi v0.0.0-20231010175941-2dd684a91f00 // indirect
k8s.io/utils v0.0.0-20230726121419-3b25d923346b // indirect
sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect
sigs.k8s.io/structured-merge-diff/v4 v4.4.1 // indirect
sigs.k8s.io/yaml v1.3.0 // indirect
google.golang.org/protobuf v1.33.0 // indirect
)
155 changes: 148 additions & 7 deletions autopilot-daemon/go.sum

Large diffs are not rendered by default.

6 changes: 2 additions & 4 deletions autopilot-daemon/gpu-bw/entrypoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@


def main():

parser = argparse.ArgumentParser()
parser.add_argument('-t', '--threshold', type=str, default='4')
args = parser.parse_args()
Expand All @@ -16,9 +16,7 @@ def main():
output = os.popen('./gpu-bw/gpuLocalBandwidthTest.sh -t ' + args.threshold)
result = output.read()

if "FAIL" in result:
print("[[ PCIEBW ]] FAIL")
elif "ABORT" in result:
if "ABORT" in result or "SKIP" in result:
print("[[ PCIEBW ]] ABORT")
print(result)
exit()
Expand Down
4 changes: 2 additions & 2 deletions autopilot-daemon/gpu-bw/gpuLocalBandwidthTest.sh
Original file line number Diff line number Diff line change
Expand Up @@ -56,12 +56,12 @@ fi

D=$((D-1))
for i in $(seq 0 1 $D) ; do
EXEC+="$($PROG --htod --memory=pinned --device=$i --csv)"
EXEC+="$($PROG --htod --memory=pinned --device=$i --csv 2>&1)"
EXEC+="\n"
done
errors="$(echo ${EXEC} | grep -i '802\|error')"
if [[ -n $errors ]]; then
echo "CRITICAL ERROR WITH GPUs - DEVICE NOT READY"
echo "CRITICAL ERROR WITH GPUs"
echo "ABORT"
echo -e $EXEC
else
Expand Down
63 changes: 48 additions & 15 deletions autopilot-daemon/gpu-dcgm/entrypoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,19 @@
import subprocess
import os
import argparse
import datetime
from kubernetes import client, config
from kubernetes.client.rest import ApiException

def main():
parser = argparse.ArgumentParser()
parser.add_argument('-r', '--run', type=str, default='1')
args = parser.parse_args()
config.load_incluster_config()
v1 = client.CoreV1Api()
nodename = os.getenv("NODE_NAME")

parser = argparse.ArgumentParser()
parser.add_argument('-r', '--run', type=str, default='1')
parser.add_argument('-l', '--label_node', action='store_true')
args = parser.parse_args()
def main():
output = os.popen('bash ./utils/briefings.sh')
result = output.read()
print(result)
Expand All @@ -21,9 +28,9 @@ def main():
print(result)

def try_dcgm(command):
try:
result = subprocess.run(command, check=True, text=True, capture_output=True)
except subprocess.CalledProcessError:
result = subprocess.run(command, text=True, capture_output=True)
return_code = result.returncode # 0 for success
if return_code != 0:
print("[[ DCGM ]] DCGM process terminated with errors. Other processes might be running on GPUs. ABORT")
command = ['nvidia-smi', '--query-gpu=utilization.gpu', '--format=csv']
try:
Expand All @@ -33,12 +40,10 @@ def try_dcgm(command):
exit()
if proc.stdout:
print("[[ DCGM ]] GPUs currently utilized:\n", proc.stdout)
exit()


if result.stderr:
F438 print(result.stderr)
print("[[ DCGM ]] exited with error: " + result.stderr + " ERR")
exit()
else:
dcgm_dict = json.loads(result.stdout)
tests_dict = dcgm_dict['DCGM GPU Diagnostic']['test_categories']
Expand All @@ -47,15 +52,43 @@ def try_dcgm(command):
for category in tests_dict:
for test in category['tests']:
if test['results'][0]['status'] == 'Fail':
print(test['name'], ":", test['results'][0]['status'])
success = False
output+=(test['name']+" ")
print(test['name'], ":", test['results'][0]['status'])
if test['name'] == "GPU Memory":
output+=(test['name'].replace(" ","")+"_")
for entry in test['results']:
output+=("."+entry['gpu_id'])
if success:
print("[[ DCGM ]] SUCCESS")
else:
print("Host ", os.getenv("NODE_NAME"))
print("Host", nodename)
print("[[ DCGM ]] FAIL")
print(output.strip())
if args.label_node:
patch_node(success, output)


def patch_node(success, output):
    """Record the DCGM outcome as the node label 'autopilot/dcgm.level.3'.

    Args:
        success: True when no DCGM test failed; selects the "PASS_" prefix,
            otherwise the "ERR_" prefix is used and `output` is appended.
        output: Summary of failing tests/GPUs appended to the label on failure.
            May be empty.

    The label value is "<PASS|ERR>_<UTC timestamp>[_<output>]". It is patched
    onto the node named by the module-level `nodename` through the module-level
    `v1` client. On an ApiException the error is printed and the process exits.
    """
    now = datetime.datetime.now(datetime.timezone.utc)
    # The timestamp carries an explicit UTC suffix; ':' is avoided because it
    # is not a legal character in a label value.
    timestamp = now.strftime("%Y-%m-%d_%H.%M.%SUTC")
    if success:
        result = "PASS_" + timestamp
    else:
        result = "ERR_" + timestamp + "_" + output

    # Kubernetes label values are limited to 63 characters and must end with
    # an alphanumeric character. Without this, a failure with an empty
    # `output` would yield a value ending in "_" and the patch is rejected.
    result = result[:63].rstrip("-_. ")

    label = {
        "metadata": {
            "labels": {
                "autopilot/dcgm.level.3": result}
        }
    }
    print("label: ", result)
    try:
        v1.patch_node(nodename, label)
    except ApiException as e:
        print("Exception when calling corev1api->patch_node: %s\n" % e)
        exit()

if __name__ == '__main__':
main()
main()
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ spec:
- -c
- |
iperf3 -s -p 6310 -D
/usr/local/bin/autopilot --port {{ .Values.service.port }} --loglevel={{ .Values.loglevel }} --bw {{ .Values.PCIeBW }} --w {{ .Values.repeat }}
/usr/local/bin/autopilot --port {{ .Values.service.port }} --loglevel={{ .Values.loglevel }} --bw {{ .Values.PCIeBW }} --w {{ .Values.repeat }} --intrusive-check-timer {{ .Values.intrusive }}
imagePullPolicy: {{ .Values.image.pullPolicy }}
name: autopilot
env:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,13 @@ rules:
verbs: ["get", "list"]
- apiGroups: [""]
resources: ["pods"]
verbs: ["get","list"]
verbs: ["get", "list"]
- apiGroups: ["batch"]
resources: ["jobs"]
verbs: ["get", "list", "create", "delete"]
- apiGroups: [""]
resources: ["nodes"]
verbs: ["list", "get"]
verbs: ["list", "get", "patch"]
- apiGroups: ["apps"]
resources: ["daemonsets"]
verbs: ["list", "get"]
Expand Down
1 change: 1 addition & 0 deletions autopilot-daemon/helm-charts/autopilot/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ image:
PCIeBW: 4

repeat: 1
intrusive: 4

pullSecrets:
create: true
Expand Down
36 changes: 30 additions & 6 deletions autopilot-daemon/network/ping-entrypoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import asyncio
import subprocess
import time
import netifaces

parser = argparse.ArgumentParser()
parser.add_argument('--job', type=str, default='None', help='Workload node discovery w/ given namespace and label. Ex: \"--job=namespace:label-key=label-value\". Default is set to None.')
Expand All @@ -26,6 +27,7 @@ async def main():
nodelabel = args['nodelabel']
nodemap = {}
allnodes = False
check_local_ifaces()
if 'all' in nodelist and job == 'None' and nodelabel == 'None':
allnodes = True
else:
Expand Down Expand Up @@ -72,8 +74,8 @@ async def main():
try:
iface=entry['interface']
except KeyError:
print("Interface key not found, assigning default.")
iface = "default"
print("Interface key name not found, assigning 'k8s-pod-network'.")
iface = "k8s-pod-network"
ifaces = ifaces | {iface}
node[iface] = {
'ips': entry['ips'],
Expand All @@ -95,9 +97,9 @@ async def main():
except KeyError:
print("Interface", iface, "not found, skipping.")
continue
for ip in ips:
for index, ip in enumerate(ips):
command = ['ping',ip,'-t','45','-c','10']
clients.append((subprocess.Popen(command, start_new_session=True, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE), nodename, ip))
clients.append((subprocess.Popen(command, start_new_session=True, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE), nodename, ip, "net-"+str(index)))
for c in clients:
try:
c[0].wait(50)
Expand All @@ -112,15 +114,37 @@ async def main():
print("FAIL")
else:
if "Unreachable" in stdout or "100% packet loss" in stdout:
print("Node", c[1], c[2], "1")
print("Node", c[1], c[2], c[3], "1")
fail = True
else:
print("Node", c[1], c[2], "0")
print("Node", c[1], c[2], c[3], "0")
if fail:
print("[PING] At least one node unreachable. FAIL")
else:
print("[PING] all nodes reachable. success")

def check_local_ifaces():
    """Cross-check the pod's network-status annotation against its interfaces.

    Looks up this pod (name taken from the POD_NAME environment variable) in
    `namespace_self` through `kubeapi`, parses the
    'k8s.v1.cni.cncf.io/network-status' annotation and collects the 'ips' of
    every entry that carries an 'interface' key. If the annotation reports
    such IPs but the pod has no interface other than 'lo' and 'eth0', an
    ABORT message is printed and the process exits.

    When the annotation is missing the check is skipped after printing a
    notice.
    """
    podname = os.getenv("POD_NAME")
    pod_list = kubeapi.list_namespaced_pod(namespace=namespace_self, field_selector="metadata.name="+podname)
    pod_self = pod_list.items[0]
    try:
        entrylist = json.loads(pod_self.metadata.annotations['k8s.v1.cni.cncf.io/network-status'])
    except (KeyError, TypeError):
        # TypeError covers a pod whose annotations attribute is None.
        print("Key k8s.v1.cni.cncf.io/network-status not found on pod", pod_self.metadata.name, "- Skipping node", pod_self.spec.node_name)
        # Nothing to compare against; returning avoids using an unbound
        # `entrylist` below.
        return

    # Only entries that name an interface are counted.
    ips = [entry['ips'] for entry in entrylist if 'interface' in entry]

    # Filter instead of list.remove(): remove() raises ValueError when 'lo'
    # or 'eth0' is not present in the pod.
    ifaces = [name for name in netifaces.interfaces() if name not in ('lo', 'eth0')]

    if len(ips) > 0 and len(ifaces) == 0:
        print("[PING] IFACES count inconsistent. Pod annotation reports", ips, ", not found in the pod among", netifaces.interfaces(),"ABORT")
        exit()

def get_job_nodes(nodelist):
v1 = client.CoreV1Api()
# get nodes from job is specified
Expand Down
21 changes: 13 additions & 8 deletions autopilot-daemon/pkg/cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,9 @@ func main() {
port := flag.String("port", "3333", "Port for the webhook to listen to. Defaulted to 3333")
bwThreshold := flag.String("bw", "4", "Sets bandwidth threshold for the init container")
logFile := flag.String("logfile", "", "File where to save all the events")
devmode := flag.Bool("dev", false, "Dev mode disables the execution of health checks at pod startup. Default set to True, therefore health checks are executed at pod startup first, and then periodically.")
v := flag.String("loglevel", "2", "Log level")
repeat := flag.Int("w", 24, "Run all tests periodically on each node. Time set in hours. Defaults to 24h")
intrusive := flag.Int("intrusive-check-timer", 4, "Run intrusive checks (e.g., dcgmi level 3) on each node when GPUs are free. Time set in hours. Defaults to 4h. Set to 0 to avoid intrusive checks")

flag.Parse()

Expand Down Expand Up @@ -99,16 +99,21 @@ func main() {
}
}()

if !*devmode {
handlers.TimerRun()
}
// Run the health checks at startup, then start the timer
handlers.PeriodicCheckTimer()

testsTicker := time.NewTicker(time.Duration(*repeat) * time.Hour)
defer testsTicker.Stop()
periodicChecksTicker := time.NewTicker(time.Duration(*repeat) * time.Hour)
defer periodicChecksTicker.Stop()
intrusiveChecksTicker := time.NewTicker(time.Duration(*intrusive) * time.Hour)
defer periodicChecksTicker.Stop()
for {
select {
case <-testsTicker.C:
handlers.TimerRun()
case <-periodicChecksTicker.C:
handlers.PeriodicCheckTimer()
case <-intrusiveChecksTicker.C:
if *intrusive > 0 {
handlers.IntrusiveCheckTimer()
}
}
}

Expand Down
3 changes: 3 additions & 0 deletions autopilot-daemon/pkg/handlers/handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"os"
"strings"

"github.com/IBM/autopilot/pkg/utils"
"k8s.io/klog/v2"
)

Expand Down Expand Up @@ -64,6 +65,8 @@ func SystemStatusHandler() http.Handler {
if hosts == os.Getenv("NODE_NAME") {
klog.Info("Checking system status of host " + hosts + " (localhost)")
w.Write([]byte("Checking system status of host " + hosts + " (localhost) \n\n"))
utils.HealthcheckLock.Lock()
defer utils.HealthcheckLock.Unlock()
out, err := runAllTestsLocal(hosts, checks, dcgmR, jobName, nodelabel, r)
if err != nil {
klog.Error(err.Error())
Expand Down
Loading
0