From 820eeda4ad9731f66dce7b64d1fcfae1602440ad Mon Sep 17 00:00:00 2001
From: Katerina Molchanova <35141662+rokatyy@users.noreply.github.com>
Date: Sun, 2 Mar 2025 08:49:02 +0000
Subject: [PATCH] [Processor] Improve termination stability and change
 readiness FailureThreshold (#3511)

Jira - part of https://iguazio.atlassian.net/browse/NUC-298
---
 docs/tasks/index.rst                  |  3 ++-
 docs/tasks/known-issues.md            | 10 ++++++++++
 pkg/platform/kube/functionres/lazy.go |  2 +-
 pkg/processor/trigger/trigger.go      |  2 +-
 4 files changed, 14 insertions(+), 3 deletions(-)
 create mode 100644 docs/tasks/known-issues.md

diff --git a/docs/tasks/index.rst b/docs/tasks/index.rst
index 5fe4acba8f7..4fcb178bebd 100644
--- a/docs/tasks/index.rst
+++ b/docs/tasks/index.rst
@@ -10,4 +10,5 @@ User guide
    deploying-pre-built-functions
    deploy-functions-from-dockerfile
    exporting-and-importing
-   benchmarking
\ No newline at end of file
+   benchmarking
+   known-issues
\ No newline at end of file
diff --git a/docs/tasks/known-issues.md b/docs/tasks/known-issues.md
new file mode 100644
index 00000000000..65746c6400e
--- /dev/null
+++ b/docs/tasks/known-issues.md
@@ -0,0 +1,10 @@
+## Known issues
+
+### 503 Error Code While Scaling Down (Kubernetes Only)
+
+This is a rare issue that primarily occurs in low-latency setups.
+When scaling down a Nuclio function pod, Kubernetes may return a 503 error due to the delay between sending a `SIGTERM` signal and stopping traffic to the pod.
+
+To mitigate this, we've introduced a 5-second wait time before halting event processing after receiving `SIGTERM`. However, this does not fully eliminate the issue.
+
+**Possible Solution**: Implement a client-side retry mechanism when a 503 response is received with an empty body.
\ No newline at end of file
diff --git a/pkg/platform/kube/functionres/lazy.go b/pkg/platform/kube/functionres/lazy.go
index 680f29d6188..fbd7a6dde0c 100644
--- a/pkg/platform/kube/functionres/lazy.go
+++ b/pkg/platform/kube/functionres/lazy.go
@@ -2376,7 +2376,7 @@ func (lc *lazyClient) populateDeploymentContainer(ctx context.Context,
 		InitialDelaySeconds: 5,
 		TimeoutSeconds:      1,
 		PeriodSeconds:       1,
-		FailureThreshold:    10,
+		FailureThreshold:    3,
 	}
 
 	container.LivenessProbe = &v1.Probe{
diff --git a/pkg/processor/trigger/trigger.go b/pkg/processor/trigger/trigger.go
index 786731808c5..f0e2946e8d3 100644
--- a/pkg/processor/trigger/trigger.go
+++ b/pkg/processor/trigger/trigger.go
@@ -386,7 +386,7 @@ func (at *AbstractTrigger) SignalWorkersToTerminate() error {
 	// DO NOT REMOVE
 	// this sleep is needed to give k8s some time to stop sending traffic to the service
 	// before we shut worker down
-	time.Sleep(1 * time.Second)
+	time.Sleep(5 * time.Second)
 
 	if err := at.WorkerAllocator.SignalTermination(); err != nil {
 		return errors.Wrap(err, "Failed to signal all workers to terminate")
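
The known-issues note added by this patch recommends a client-side retry when a 503 with an empty body is returned during scale-down. The sketch below is not part of the patch; it illustrates one way a Go caller might implement that retry. The function name `invokeWithRetry`, the endpoint URL, the retry count, and the backoff values are illustrative assumptions, not Nuclio APIs.

```go
// Illustrative sketch only -- not part of the patch. Assumes a plain HTTP
// client invoking a Nuclio function endpoint; names and values are examples.
package main

import (
	"bytes"
	"fmt"
	"io"
	"net/http"
	"time"
)

// invokeWithRetry retries the invocation when the endpoint answers with
// 503 and an empty body, which can happen while a function pod is
// terminating during scale-down.
func invokeWithRetry(client *http.Client, url string, payload []byte, maxRetries int) (*http.Response, error) {
	var lastErr error

	for attempt := 0; attempt <= maxRetries; attempt++ {
		resp, err := client.Post(url, "application/json", bytes.NewReader(payload))
		if err != nil {
			lastErr = err
		} else {
			body, _ := io.ReadAll(resp.Body)
			resp.Body.Close()

			if resp.StatusCode == http.StatusServiceUnavailable && len(body) == 0 {
				// the pod is likely shutting down; back off and try again
				lastErr = fmt.Errorf("attempt %d: 503 with empty body", attempt+1)
			} else {
				// any other response is handed back to the caller as-is
				resp.Body = io.NopCloser(bytes.NewReader(body))
				return resp, nil
			}
		}

		// simple linear backoff between attempts
		time.Sleep(time.Duration(attempt+1) * 500 * time.Millisecond)
	}

	return nil, lastErr
}

func main() {
	client := &http.Client{Timeout: 10 * time.Second}

	// "http://my-nuclio-function:8080" is a placeholder service address
	resp, err := invokeWithRetry(client, "http://my-nuclio-function:8080", []byte(`{}`), 3)
	if err != nil {
		fmt.Println("invocation failed:", err)
		return
	}
	defer resp.Body.Close()

	fmt.Println("status:", resp.StatusCode)
}
```

Linear backoff keeps the example short; a production client would more likely use exponential backoff with jitter and cap the total retry time below its own request deadline.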