From 820eeda4ad9731f66dce7b64d1fcfae1602440ad Mon Sep 17 00:00:00 2001
From: Katerina Molchanova <35141662+rokatyy@users.noreply.github.com>
Date: Sun, 2 Mar 2025 08:49:02 +0000
Subject: [PATCH] [Processor] Improve termination stability and change
 readiness FailureThreshold (#3511)

Jira - part of https://iguazio.atlassian.net/browse/NUC-298
---
 docs/tasks/index.rst                  |  3 ++-
 docs/tasks/known-issues.md            | 10 ++++++++++
 pkg/platform/kube/functionres/lazy.go |  2 +-
 pkg/processor/trigger/trigger.go      |  2 +-
 4 files changed, 14 insertions(+), 3 deletions(-)
 create mode 100644 docs/tasks/known-issues.md

diff --git a/docs/tasks/index.rst b/docs/tasks/index.rst
index 5fe4acba8f7..4fcb178bebd 100644
--- a/docs/tasks/index.rst
+++ b/docs/tasks/index.rst
@@ -10,4 +10,5 @@ User guide
    deploying-pre-built-functions
    deploy-functions-from-dockerfile
    exporting-and-importing
-   benchmarking
\ No newline at end of file
+   benchmarking
+   known-issues
\ No newline at end of file
diff --git a/docs/tasks/known-issues.md b/docs/tasks/known-issues.md
new file mode 100644
index 00000000000..65746c6400e
--- /dev/null
+++ b/docs/tasks/known-issues.md
@@ -0,0 +1,10 @@
+## Known issues
+
+### 503 Error Code While Scaling Down (Kubernetes Only)
+
+This is a rare issue that primarily occurs in low-latency setups.
+When scaling down a Nuclio function pod, Kubernetes may return a 503 error due to the delay between sending a `SIGTERM` signal and stopping traffic to the pod.
+
+To mitigate this, we've introduced a 5-second wait time before halting event processing after receiving `SIGTERM`. However, this does not fully eliminate the issue.
+
+**Possible Solution**: Implement a client-side retry mechanism when a 503 response is received with an empty body.
\ No newline at end of file
diff --git a/pkg/platform/kube/functionres/lazy.go b/pkg/platform/kube/functionres/lazy.go
index 680f29d6188..fbd7a6dde0c 100644
--- a/pkg/platform/kube/functionres/lazy.go
+++ b/pkg/platform/kube/functionres/lazy.go
@@ -2376,7 +2376,7 @@ func (lc *lazyClient) populateDeploymentContainer(ctx context.Context,
 		InitialDelaySeconds: 5,
 		TimeoutSeconds:      1,
 		PeriodSeconds:       1,
-		FailureThreshold:    10,
+		FailureThreshold:    3,
 	}
 
 	container.LivenessProbe = &v1.Probe{
diff --git a/pkg/processor/trigger/trigger.go b/pkg/processor/trigger/trigger.go
index 786731808c5..f0e2946e8d3 100644
--- a/pkg/processor/trigger/trigger.go
+++ b/pkg/processor/trigger/trigger.go
@@ -386,7 +386,7 @@ func (at *AbstractTrigger) SignalWorkersToTerminate() error {
 	// DO NOT REMOVE
 	// this sleep is needed to give k8s some time to stop sending traffic to the service
 	// before we shut worker down
-	time.Sleep(1 * time.Second)
+	time.Sleep(5 * time.Second)
 
 	if err := at.WorkerAllocator.SignalTermination(); err != nil {
 		return errors.Wrap(err, "Failed to signal all workers to terminate")
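
The known-issues note added by this patch recommends a client-side retry when a 503 with an empty body is returned during scale-down. The sketch below is not part of the patch; it illustrates one way a Go caller might implement that retry. The function name `invokeWithRetry`, the endpoint URL, the retry count, and the backoff values are illustrative assumptions, not Nuclio APIs.

```go
// Illustrative sketch only -- not part of the patch. Assumes a plain HTTP
// client invoking a Nuclio function endpoint; names and values are examples.
package main

import (
	"bytes"
	"fmt"
	"io"
	"net/http"
	"time"
)

// invokeWithRetry retries the invocation when the endpoint answers with
// 503 and an empty body, which can happen while a function pod is
// terminating during scale-down.
func invokeWithRetry(client *http.Client, url string, payload []byte, maxRetries int) (*http.Response, error) {
	var lastErr error

	for attempt := 0; attempt <= maxRetries; attempt++ {
		resp, err := client.Post(url, "application/json", bytes.NewReader(payload))
		if err != nil {
			lastErr = err
		} else {
			body, _ := io.ReadAll(resp.Body)
			resp.Body.Close()

			if resp.StatusCode == http.StatusServiceUnavailable && len(body) == 0 {
				// the pod is likely shutting down; back off and try again
				lastErr = fmt.Errorf("attempt %d: 503 with empty body", attempt+1)
			} else {
				// any other response is handed back to the caller as-is
				resp.Body = io.NopCloser(bytes.NewReader(body))
				return resp, nil
			}
		}

		// simple linear backoff between attempts
		time.Sleep(time.Duration(attempt+1) * 500 * time.Millisecond)
	}

	return nil, lastErr
}

func main() {
	client := &http.Client{Timeout: 10 * time.Second}

	// "http://my-nuclio-function:8080" is a placeholder service address
	resp, err := invokeWithRetry(client, "http://my-nuclio-function:8080", []byte(`{}`), 3)
	if err != nil {
		fmt.Println("invocation failed:", err)
		return
	}
	defer resp.Body.Close()

	fmt.Println("status:", resp.StatusCode)
}
```

Linear backoff keeps the example short; a production client would more likely use exponential backoff with jitter and cap the total retry time below its own request deadline.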