From aa3a357875f830328a416a76fb1b1e758b357334 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?oliver=20k=C3=B6nig?= <okoenig@nvidia.com>
Date: Sat, 30 May 2026 10:30:40 +0000
Subject: [PATCH 01/20] fix(kubeflow): stream only rank 0 + last rank, write
 all ranks to disk
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

KubeflowExecutor.fetch_logs followed every replica and forwarded all ranks to
the caller, so at scale the aggregate output overran CI/runner job-log size
limits (a 16-node x 8-GPU run exceeded GitLab's 128MB cap). Now it still tails
every rank (kubectl logs -l <jobset> --prefix --max-log-requests num_nodes) and
writes the complete multi-rank output to <job_dir>/log-allranks_0.out, but
forwards only global rank 0 (node 0, [default0]) and the last global rank
(node num_nodes-1, [default<nproc_per_node-1>]) to stdout. Downstream log
validation that globs log*.out still sees every rank via the on-disk file.

Signed-off-by: oliver könig <okoenig@nvidia.com>
---
 nemo_run/core/execution/kubeflow.py | 55 +++++++++++++++++++++++------
 1 file changed, 45 insertions(+), 10 deletions(-)
diff --git a/nemo_run/core/execution/kubeflow.py b/nemo_run/core/execution/kubeflow.py
index ea0d8cf0..5f87eb97 100644
--- a/nemo_run/core/execution/kubeflow.py
+++ b/nemo_run/core/execution/kubeflow.py
@@ -16,6 +16,7 @@
 import getpass
 import logging
 import os
+import re
 import subprocess
 import time
 from dataclasses import dataclass, field
@@ -331,6 +332,13 @@ def fetch_logs(
         until pods are running (up to 10 minutes).  Otherwise it returns the last
         *lines* lines from a single ``kubectl logs`` call.
         """
+        # Tail every rank, but forward only global rank 0 and the last global
+        # rank to the caller (stdout / CI job log). Streaming all ranks at scale
+        # overruns CI/runner job-log size limits, yet the full multi-rank output
+        # is still written to <job_dir>/log-allranks_0.out so downstream log
+        # validation (which globs log*.out) sees every rank. --prefix tags each
+        # line with its pod name so we can recover the node (completion index)
+        # and pair it with torchrun's [defaultN] local-rank marker.
         label_selector = f"jobset.sigs.k8s.io/jobset-name={job_name}"
         cmd = [
             "kubectl",
@@ -339,11 +347,29 @@ def fetch_logs(
             label_selector,
             "-n",
             self.namespace,
+            "--prefix",
             "--tail",
             str(lines),
             "--max-log-requests",
             str(self.num_nodes),
         ]
+        last_node = max(self.num_nodes - 1, 0)
+        last_local = max(self.nproc_per_node() - 1, 0)
+        node_re = re.compile(r"node-0-(\d+)-")
+        local_re = re.compile(r"\[default(\d+)\]")
+
+        def _forward_to_stdout(log_line: str) -> bool:
+            """True for global rank 0 (node 0, local 0) and the last global rank."""
+            node_match = node_re.search(log_line)
+            local_match = local_re.search(log_line)
+            if not node_match or not local_match:
+                return False
+            node, local = int(node_match.group(1)), int(local_match.group(1))
+            return (node == 0 and local == 0) or (node == last_node and local == last_local)
+
+        all_ranks_path = os.path.join(self.job_dir, "log-allranks_0.out")
+        os.makedirs(self.job_dir, exist_ok=True)
+
         if stream:
             cmd.append("-f")
             # Retry kubectl logs -f until the job reaches a terminal state.
@@ -354,16 +380,21 @@ def fetch_logs(
                 )
                 lines_yielded = 0
                 try:
-                    for line in iter(proc.stdout.readline, ""):
-                        if line:
-                            lines_yielded += 1
-                            yield line
-                        if proc.poll() is not None:
-                            for remaining in proc.stdout:
-                                if remaining:
+                    with open(all_ranks_path, "a") as all_ranks_file:
+                        for line in iter(proc.stdout.readline, ""):
+                            if line:
+                                all_ranks_file.write(line)
+                                if _forward_to_stdout(line):
                                     lines_yielded += 1
-                                    yield remaining
-                            break
+                                    yield line
+                            if proc.poll() is not None:
+                                for remaining in proc.stdout:
+                                    if remaining:
+                                        all_ranks_file.write(remaining)
+                                        if _forward_to_stdout(remaining):
+                                            lines_yielded += 1
+                                            yield remaining
+                                break
                 except Exception as e:
                     logger.warning("Error streaming logs: %s; retrying", e)
                 finally:
@@ -381,7 +412,11 @@ def fetch_logs(
                 time.sleep(5)
         else:
             result = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
-            yield from result.stdout.splitlines()
+            with open(all_ranks_path, "a") as all_ranks_file:
+                for line in result.stdout.splitlines():
+                    all_ranks_file.write(line + "\n")
+                    if _forward_to_stdout(line):
+                        yield line
 
     def cancel(
         self,

From 8e1930e2b577ffaac2b244d355aa13537e0e41e8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?oliver=20k=C3=B6nig?= <okoenig@nvidia.com>
Date: Sat, 30 May 2026 12:01:57 +0000
Subject: [PATCH 02/20] fix(kubeflow): resolve last pod via completion-index
 label + full-history streaming
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

fetch_logs identified the last global rank's pod by parsing the pod name and
tailed only the last `--tail <lines>` window, so on (re)attach the last rank's
mid-run canonical "iteration | lm loss | ..." line (print_rank_last) was
dropped — on K8s the job log showed only rank 0's "Step Time" line.

Resolve the first/last pod from the authoritative
batch.kubernetes.io/job-completion-index label (== torchrun PET_NODE_RANK),
mapped from the --prefix pod name and refreshed on every (re)connect (gang
restarts spawn new pod names), and stream each pod's full history (--tail=-1)
so no mid-run line is missed. All ranks are still written to
log-allranks_0.out; only global rank 0 and the true last global rank are
forwarded to stdout.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Signed-off-by: oliver könig <okoenig@nvidia.com>
---
 nemo_run/core/execution/kubeflow.py | 79 +++++++++++++++++++++++------
 1 file changed, 63 insertions(+), 16 deletions(-)

diff --git a/nemo_run/core/execution/kubeflow.py b/nemo_run/core/execution/kubeflow.py
index 5f87eb97..76b44d55 100644
--- a/nemo_run/core/execution/kubeflow.py
+++ b/nemo_run/core/execution/kubeflow.py
@@ -14,6 +14,7 @@
 # limitations under the License.
 
 import getpass
+import json
 import logging
 import os
 import re
@@ -332,13 +333,22 @@ def fetch_logs(
         until pods are running (up to 10 minutes).  Otherwise it returns the last
         *lines* lines from a single ``kubectl logs`` call.
         """
-        # Tail every rank, but forward only global rank 0 and the last global
-        # rank to the caller (stdout / CI job log). Streaming all ranks at scale
-        # overruns CI/runner job-log size limits, yet the full multi-rank output
-        # is still written to <job_dir>/log-allranks_0.out so downstream log
-        # validation (which globs log*.out) sees every rank. --prefix tags each
-        # line with its pod name so we can recover the node (completion index)
-        # and pair it with torchrun's [defaultN] local-rank marker.
+        # Tail every rank to <job_dir>/log-allranks_0.out (downstream log
+        # validation globs log*.out and needs every rank), but forward only
+        # global rank 0 and the *last* global rank to the caller (stdout / CI
+        # job log) — streaming all ranks at scale overruns CI job-log limits.
+        #
+        # Identifying the last global rank requires the authoritative node rank,
+        # NOT the pod name. Kubeflow Trainer binds torchrun's PET_NODE_RANK to
+        # the indexed-Job completion index, stamped on each pod as the
+        # `batch.kubernetes.io/job-completion-index` label. So:
+        #     global_rank = job_completion_index * nproc_per_node + local_rank
+        # `--prefix` tags each line with `[pod/<pod>/<container>]`; we map that
+        # pod name → completion index (refreshed on every (re)connect, since a
+        # gang restart spawns new pod names) and pair it with torchrun's
+        # `[defaultN]` local-rank marker. `--tail=-1` replays each pod's full
+        # history on (re)attach so mid-run lines are never dropped (the previous
+        # `--tail <lines>` snapshot missed the last rank's per-step lines).
         label_selector = f"jobset.sigs.k8s.io/jobset-name={job_name}"
         cmd = [
             "kubectl",
@@ -349,22 +359,57 @@ def fetch_logs(
             self.namespace,
             "--prefix",
             "--tail",
-            str(lines),
+            "-1",
             "--max-log-requests",
             str(self.num_nodes),
         ]
         last_node = max(self.num_nodes - 1, 0)
         last_local = max(self.nproc_per_node() - 1, 0)
-        node_re = re.compile(r"node-0-(\d+)-")
+        pod_re = re.compile(r"pod/([^/]+)/")
         local_re = re.compile(r"\[default(\d+)\]")
 
-        def _forward_to_stdout(log_line: str) -> bool:
+        def _pod_index_map() -> dict[str, int]:
+            """Map pod name → job-completion-index (== torchrun node rank)."""
+            try:
+                out = subprocess.run(
+                    [
+                        "kubectl",
+                        "get",
+                        "pods",
+                        "-n",
+                        self.namespace,
+                        "-l",
+                        label_selector,
+                        "-o",
+                        "json",
+                    ],
+                    capture_output=True,
+                    text=True,
+                    timeout=timeout,
+                )
+                items = json.loads(out.stdout).get("items", [])
+            except Exception as e:
+                logger.warning("Could not list pods for %s: %s", job_name, e)
+                return {}
+            mapping: dict[str, int] = {}
+            for item in items:
+                meta = item.get("metadata", {})
+                name = meta.get("name")
+                idx = (meta.get("labels", {}) or {}).get("batch.kubernetes.io/job-completion-index")
+                if name is not None and idx is not None:
+                    mapping[name] = int(idx)
+            return mapping
+
+        def _forward_to_stdout(log_line: str, pod_index: dict[str, int]) -> bool:
             """True for global rank 0 (node 0, local 0) and the last global rank."""
-            node_match = node_re.search(log_line)
+            pod_match = pod_re.search(log_line)
             local_match = local_re.search(log_line)
-            if not node_match or not local_match:
+            if not pod_match or not local_match:
+                return False
+            node = pod_index.get(pod_match.group(1))
+            if node is None:
                 return False
-            node, local = int(node_match.group(1)), int(local_match.group(1))
+            local = int(local_match.group(1))
             return (node == 0 and local == 0) or (node == last_node and local == last_local)
 
         all_ranks_path = os.path.join(self.job_dir, "log-allranks_0.out")
@@ -375,6 +420,7 @@ def _forward_to_stdout(log_line: str) -> bool:
             # Retry kubectl logs -f until the job reaches a terminal state.
             # This handles both pods not yet running and transient mid-stream failures.
             while True:
+                pod_index = _pod_index_map()
                 proc = subprocess.Popen(
                     cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL, text=True, bufsize=1
                 )
@@ -384,14 +430,14 @@ def _forward_to_stdout(log_line: str) -> bool:
                         for line in iter(proc.stdout.readline, ""):
                             if line:
                                 all_ranks_file.write(line)
-                                if _forward_to_stdout(line):
+                                if _forward_to_stdout(line, pod_index):
                                     lines_yielded += 1
                                     yield line
                             if proc.poll() is not None:
                                 for remaining in proc.stdout:
                                     if remaining:
                                         all_ranks_file.write(remaining)
-                                        if _forward_to_stdout(remaining):
+                                        if _forward_to_stdout(remaining, pod_index):
                                             lines_yielded += 1
                                             yield remaining
                                 break
@@ -411,11 +457,12 @@ def _forward_to_stdout(log_line: str) -> bool:
                 )
                 time.sleep(5)
         else:
+            pod_index = _pod_index_map()
             result = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
             with open(all_ranks_path, "a") as all_ranks_file:
                 for line in result.stdout.splitlines():
                     all_ranks_file.write(line + "\n")
-                    if _forward_to_stdout(line):
+                    if _forward_to_stdout(line, pod_index):
                         yield line
 
     def cancel(

From b2be85c564960afd9d0d5ebdda88727460e245f4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?oliver=20k=C3=B6nig?= <okoenig@nvidia.com>
Date: Sat, 30 May 2026 12:30:21 +0000
Subject: [PATCH 03/20] fix(kubeflow): forward by global rank
 (node_rank*nproc+local), not pod heuristic
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Kubeflow Trainer sets torchrun PET_NODE_RANK statically from the JobSet
batch.kubernetes.io/job-completion-index, so global_rank = completion_index *
nproc_per_node + local_rank. Compute that explicitly and forward only global
rank 0 and world_size-1 to stdout (all ranks still go to log-allranks_0.out).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Signed-off-by: oliver könig <okoenig@nvidia.com>
---
 nemo_run/core/execution/kubeflow.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/nemo_run/core/execution/kubeflow.py b/nemo_run/core/execution/kubeflow.py
index 76b44d55..33106fc5 100644
--- a/nemo_run/core/execution/kubeflow.py
+++ b/nemo_run/core/execution/kubeflow.py
@@ -363,8 +363,8 @@ def fetch_logs(
             "--max-log-requests",
             str(self.num_nodes),
         ]
-        last_node = max(self.num_nodes - 1, 0)
-        last_local = max(self.nproc_per_node() - 1, 0)
+        nproc = self.nproc_per_node()
+        last_rank = max(self.num_nodes * nproc - 1, 0)
         pod_re = re.compile(r"pod/([^/]+)/")
         local_re = re.compile(r"\[default(\d+)\]")
 
@@ -401,7 +401,14 @@ def _pod_index_map() -> dict[str, int]:
             return mapping
 
         def _forward_to_stdout(log_line: str, pod_index: dict[str, int]) -> bool:
-            """True for global rank 0 (node 0, local 0) and the last global rank."""
+            """True for the first and last *global rank* only.
+
+            Kubeflow Trainer sets torchrun's PET_NODE_RANK from the JobSet
+            completion-index label (static), so the global rank is
+            ``node_rank * nproc_per_node + local_rank`` where node_rank is the
+            pod's completion index and local_rank is torchrun's ``[defaultN]``
+            marker. We forward global rank 0 and ``world_size - 1`` only.
+            """
             pod_match = pod_re.search(log_line)
             local_match = local_re.search(log_line)
             if not pod_match or not local_match:
@@ -409,8 +416,8 @@ def _forward_to_stdout(log_line: str, pod_index: dict[str, int]) -> bool:
             node = pod_index.get(pod_match.group(1))
             if node is None:
                 return False
-            local = int(local_match.group(1))
-            return (node == 0 and local == 0) or (node == last_node and local == last_local)
+            global_rank = node * nproc + int(local_match.group(1))
+            return global_rank == 0 or global_rank == last_rank
 
         all_ranks_path = os.path.join(self.job_dir, "log-allranks_0.out")
         os.makedirs(self.job_dir, exist_ok=True)

From 56bbb4bb1655bf515f607dea40b65b8015c8b684 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?oliver=20k=C3=B6nig?= <okoenig@nvidia.com>
Date: Sat, 30 May 2026 13:42:21 +0000
Subject: [PATCH 04/20] fix(kubeflow): make TrainJob launch idempotent on 409
 conflict
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When a TrainJob with the target name already exists, launch() raised and
aborted. On CI the name is derived from the experiment id (commit SHA), so a
409 means a stale TrainJob is left over from a prior attempt that the launcher
declared FAILED after a slow pod start. That blocked setup_experiment's
'attempt N of M' retry — every retry re-collided on the same name. Now
launch() deletes the stale job (cancel(wait=True)) and recreates it, so the
retry can actually recover.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Signed-off-by: oliver könig <okoenig@nvidia.com>
---
 nemo_run/core/execution/kubeflow.py | 24 +++++++++++++++++++-----
 1 file changed, 19 insertions(+), 5 deletions(-)

diff --git a/nemo_run/core/execution/kubeflow.py b/nemo_run/core/execution/kubeflow.py
index 33106fc5..11a2b5f0 100644
--- a/nemo_run/core/execution/kubeflow.py
+++ b/nemo_run/core/execution/kubeflow.py
@@ -261,11 +261,25 @@ def launch(
                 body=job_body,
             )
         except ApiException as e:
-            if e.status == 409:
-                raise RuntimeError(
-                    f"{_TRAINJOB_KIND} {name} already exists in namespace {self.namespace}"
-                ) from e
-            raise
+            if e.status != 409:
+                raise
+            # The job name is derived from the experiment id (the commit SHA on
+            # CI), so a 409 means a TrainJob from a prior attempt lingers — e.g.
+            # an attempt the launcher declared FAILED after a slow pod start.
+            # Delete the stale job and recreate so the caller's retry (such as
+            # setup_experiment's "attempt N of M") makes progress instead of
+            # re-colliding on the same name.
+            logger.warning(
+                "%s %s already exists; deleting stale job and recreating", _TRAINJOB_KIND, name
+            )
+            self.cancel(name, wait=True)
+            self._custom_objects_api.create_namespaced_custom_object(
+                group=_TRAINJOB_GROUP,
+                version=_TRAINJOB_VERSION,
+                namespace=self.namespace,
+                plural=_TRAINJOB_PLURAL,
+                body=job_body,
+            )
 
         logger.info("Submitted %s %s to namespace %s", _TRAINJOB_KIND, name, self.namespace)
 

From 3f0f5b419c922f3980bafc68b437f6bd98873dc4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?oliver=20k=C3=B6nig?= <okoenig@nvidia.com>
Date: Sat, 30 May 2026 22:17:16 +0000
Subject: [PATCH 05/20] fix(kubeflow): reload kube client across cert rotation
 for long runs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The kubernetes SDK bakes the client cert into its SSLContext at client
construction and never re-reads it. When credentials come from a rotating
source (Teleport tbot refreshing the cert on disk), a KubeflowExecutor
created once at launch keeps presenting the original cert until it expires
mid-run, so status polls fail with SSLV3_ALERT_CERTIFICATE_EXPIRED once the
run outlives the cert TTL (~60 min). Short jobs finish in time; multi-hour
jobs go blind.

Rebuild the API clients from the on-disk kubeconfig (via lazy properties) once
they are older than a refresh interval that is kept below the cert TTL, and
reactively reload and retry once in status() on a non-API connection error.
fetch_logs already shells out to kubectl, which re-reads creds per call, so it
was unaffected.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Signed-off-by: oliver könig <okoenig@nvidia.com>
---
 nemo_run/core/execution/kubeflow.py | 83 ++++++++++++++++++++++++-----
 1 file changed, 70 insertions(+), 13 deletions(-)

diff --git a/nemo_run/core/execution/kubeflow.py b/nemo_run/core/execution/kubeflow.py
index 11a2b5f0..a122e361 100644
--- a/nemo_run/core/execution/kubeflow.py
+++ b/nemo_run/core/execution/kubeflow.py
@@ -43,6 +43,14 @@
 _TRAINJOB_GROUP = "trainer.kubeflow.org"
 _TRAINJOB_VERSION = "v1alpha1"
 _TRAINJOB_PLURAL = "trainjobs"
+
+# The kubernetes SDK bakes the client cert into its SSLContext at construction
+# time and never re-reads it. When credentials come from a rotating source
+# (e.g. a Teleport tbot that refreshes the cert on disk), a long-running client
+# keeps presenting the original cert until it expires mid-run. Rebuilding the
+# API clients from the on-disk kubeconfig more frequently than the cert TTL
+# keeps a long run (multi-hour jobs) authenticated across rotations.
+_KUBE_CLIENT_REFRESH_SECONDS = 1500
 _TRAINJOB_KIND = "TrainJob"
 
 
@@ -109,6 +117,16 @@ def __post_init__(self):
                 "kubernetes package is required for KubeflowExecutor. "
                 "Install it with: pip install nemo-run[kubeflow]"
             )
+        self._load_kube_clients()
+
+    def _load_kube_clients(self) -> None:
+        """(Re)load the kubeconfig from disk and rebuild the API clients.
+
+        Called at init and again whenever the cached clients age past
+        ``_KUBE_CLIENT_REFRESH_SECONDS`` (see the module constant) so that a
+        rotating client cert (e.g. a Teleport tbot refreshing it on disk) is
+        picked up before the in-memory cert expires.
+        """
         try:
             config.load_kube_config()
         except Exception as original_exc:
@@ -116,8 +134,27 @@ def __post_init__(self):
                 config.load_incluster_config()
             except Exception:
                 raise original_exc
-        self._custom_objects_api = client.CustomObjectsApi()
-        self._core_v1_api = client.CoreV1Api()
+        self._co_api = client.CustomObjectsApi()
+        self._cv_api = client.CoreV1Api()
+        self._kube_clients_loaded_at = time.monotonic()
+
+    def _maybe_reload_kube_clients(self) -> None:
+        """Rebuild the API clients if they are older than the refresh interval."""
+        age = time.monotonic() - getattr(self, "_kube_clients_loaded_at", 0.0)
+        if age >= _KUBE_CLIENT_REFRESH_SECONDS:
+            self._load_kube_clients()
+
+    @property
+    def _custom_objects_api(self):
+        """CustomObjectsApi client, transparently refreshed across cert rotations."""
+        self._maybe_reload_kube_clients()
+        return self._co_api
+
+    @property
+    def _core_v1_api(self):
+        """CoreV1Api client, transparently refreshed across cert rotations."""
+        self._maybe_reload_kube_clients()
+        return self._cv_api
 
     # ── Executor interface ────────────────────────────────────────────────────
 
@@ -306,18 +343,38 @@ def launch(
 
     def status(self, job_name: str) -> Optional[KubeflowJobState]:
         """Return the current state of *job_name*, or ``None`` if it no longer exists."""
-        try:
-            resp = self._custom_objects_api.get_namespaced_custom_object(
-                group=_TRAINJOB_GROUP,
-                version=_TRAINJOB_VERSION,
-                namespace=self.namespace,
-                plural=_TRAINJOB_PLURAL,
-                name=job_name,
-            )
-        except ApiException as e:
-            if e.status == 404:
+        resp = None
+        for attempt in range(2):
+            try:
+                resp = self._custom_objects_api.get_namespaced_custom_object(
+                    group=_TRAINJOB_GROUP,
+                    version=_TRAINJOB_VERSION,
+                    namespace=self.namespace,
+                    plural=_TRAINJOB_PLURAL,
+                    name=job_name,
+                )
+                break
+            except ApiException as e:
+                if e.status == 404:
+                    return None
+                logger.warning("API error getting status for %s: %s", job_name, e)
+                return None
+            except Exception as e:
+                # Not an API-level error — most likely an expired client cert
+                # (tbot rotated it on disk but the SDK cached the old one) or a
+                # transient connection error. Force a client reload from the
+                # freshly-rotated kubeconfig and retry once.
+                if attempt == 0:
+                    logger.warning(
+                        "Connection error getting status for %s (%s); reloading kube client",
+                        job_name,
+                        e,
+                    )
+                    self._load_kube_clients()
+                    continue
+                logger.warning("Status check for %s failed after client reload: %s", job_name, e)
                 return None
-            logger.warning("API error getting status for %s: %s", job_name, e)
+        if resp is None:
             return None
 
         job_status = resp.get("status", {})

From b597e6cf98ce2bdc842dd298aa32b8851ead6151 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?oliver=20k=C3=B6nig?= <okoenig@nvidia.com>
Date: Sat, 30 May 2026 22:30:10 +0000
Subject: [PATCH 06/20] fix(kubeflow): scope code_dir per job to avoid
 concurrent clobber
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

code_dir was scoped only per user (<pvc>/<username>/code), but package()
rsyncs each job's job_dir into it. Two concurrent jobs from the same user
(e.g. parallel CI test cases) therefore overwrite each other's launcher code
mid-run. Scope it per job (<username>/<experiment_id>/<job_name>/code),
matching how dgxcloud/lepton mirror job_dir into a per-job PVC subdir and how
slurm keys packaging by experiment_id:job_name.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Signed-off-by: oliver könig <okoenig@nvidia.com>
---
 nemo_run/core/execution/kubeflow.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/nemo_run/core/execution/kubeflow.py b/nemo_run/core/execution/kubeflow.py
index a122e361..3e9bfee5 100644
--- a/nemo_run/core/execution/kubeflow.py
+++ b/nemo_run/core/execution/kubeflow.py
@@ -173,10 +173,18 @@ def nnodes(self) -> int:
     def code_dir(self) -> str:
         """Subdirectory on the PVC where user code (launch.sh, scripts) is synced.
 
-        Scoped to ``<workdir_pvc_path>/<username>/code`` so multiple users sharing
-        the same PVC never clobber each other's files.
+        Scoped to ``<workdir_pvc_path>/<username>/<experiment_id>/<job_name>/code``
+        so that neither multiple users *nor* multiple concurrent jobs from the
+        same user clobber each other's launcher code on a shared PVC — each
+        ``package()`` rsyncs its ``job_dir`` here, so an unscoped path lets a
+        second job overwrite the first job's code mid-run. Falls back to a bare
+        ``<username>/code`` only before the executor is assigned to a task.
         """
-        return f"{self.workdir_pvc_path.rstrip('/')}/{getpass.getuser()}/code"
+        parts = [
+            p for p in (getattr(self, "experiment_id", None), getattr(self, "job_name", None)) if p
+        ]
+        scope = "/".join([getpass.getuser(), *parts])
+        return f"{self.workdir_pvc_path.rstrip('/')}/{scope}/code"
 
     def nproc_per_node(self) -> int:
         """Return processes per node: nprocs_per_node → gpus_per_node → 1."""

From 636ec9917bd41d5ed16f2de377cfafcffab05aad Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?oliver=20k=C3=B6nig?= <okoenig@nvidia.com>
Date: Sat, 30 May 2026 23:00:21 +0000
Subject: [PATCH 07/20] fix(kubeflow): unique TrainJob name + forward all ranks
 (deduped)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- TrainJob name is now <basename>-<uuid6> (RFC-1123, <=33 chars) via the new
  train_job_basename field, decoupled from the experiment name. The uuid makes
  every launch unique, so concurrent/retried jobs never collide on the API
  server (the descriptive experiment name is intentionally non-unique).
- fetch_logs now forwards every rank to stdout, de-duplicated: torchrun runs
  the same entrypoint on all ranks so startup/config/NCCL lines are identical;
  we strip the per-rank [pod/...]/[defaultN] markers and forward each distinct
  message once. This stops dropping the per-step loss line and wandb URL, which
  Megatron emits from a single layout-dependent rank (neither rank 0 nor last).
  The full per-rank stream still goes to log-allranks_0.out untouched.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Signed-off-by: oliver könig <okoenig@nvidia.com>
---
 nemo_run/core/execution/kubeflow.py | 152 ++++++++++++++--------------
 1 file changed, 76 insertions(+), 76 deletions(-)

diff --git a/nemo_run/core/execution/kubeflow.py b/nemo_run/core/execution/kubeflow.py
index 3e9bfee5..9d0dae47 100644
--- a/nemo_run/core/execution/kubeflow.py
+++ b/nemo_run/core/execution/kubeflow.py
@@ -14,12 +14,12 @@
 # limitations under the License.
 
 import getpass
-import json
 import logging
 import os
 import re
 import subprocess
 import time
+import uuid
 from dataclasses import dataclass, field
 from enum import Enum
 from typing import Any, Iterable, Optional
@@ -110,6 +110,10 @@ class KubeflowExecutor(Executor):
     # the PVC sync.  Use this to include local scripts/files that are not
     # generated by the packager (e.g. a hand-written training script).
     workdir_local_path: Optional[str] = None
+    # Human-readable base for the generated TrainJob name. The k8s name becomes
+    # ``<basename>-<uuid6>`` (RFC-1123 safe, ≤33 chars); the uuid keeps every
+    # launch unique. Falls back to the launch ``name`` when unset.
+    train_job_basename: Optional[str] = None
 
     def __post_init__(self):
         if not _KUBERNETES_AVAILABLE:
@@ -281,6 +285,26 @@ def get_job_body(self, name: str, command: list[str]) -> dict:
 
     # ── Submit / status / cancel / logs ──────────────────────────────────────
 
+    def _trainjob_name(self, fallback: str) -> str:
+        """RFC-1123 base name ``<basename>-<uuid6>`` (≤33 chars), generated once.
+
+        Shared by the TrainJob and its data-mover pod (created in ``package()``,
+        before ``launch()``) so both are valid, unique per launch — the uuid
+        avoids API-server collisions — and consistent. The basename is
+        ``train_job_basename`` (e.g. the model recipe) or the caller's name,
+        sanitized to lowercase alphanumerics + dashes; capped at 33 chars to
+        stay under the 63-char label limit with room for the ``-data-mover``
+        suffix.
+        """
+        cached = getattr(self, "_k8s_job_name", None)
+        if cached is not None:
+            return cached
+        base = re.sub(r"[^a-z0-9-]+", "-", (self.train_job_basename or fallback or "job").lower()).strip("-")
+        uid = uuid.uuid4().hex[:6]
+        base = base[: 33 - len(uid) - 1].strip("-") or "job"
+        self._k8s_job_name = f"{base}-{uid}"
+        return self._k8s_job_name
+
     def launch(
         self,
         name: str,
@@ -295,7 +319,7 @@ def launch(
         observed ``RUNNING``, ``SUCCEEDED``, or ``FAILED`` state when *wait* is ``True``.
         Raises ``RuntimeError`` if the job already exists or *timeout* expires.
         """
-        name = name.replace("_", "-").replace(".", "-").lower()
+        name = self._trainjob_name(name)
         job_body = self.get_job_body(name, cmd)
         try:
             self._custom_objects_api.create_namespaced_custom_object(
@@ -413,21 +437,17 @@ def fetch_logs(
         *lines* lines from a single ``kubectl logs`` call.
         """
         # Tail every rank to <job_dir>/log-allranks_0.out (downstream log
-        # validation globs log*.out and needs every rank), but forward only
-        # global rank 0 and the *last* global rank to the caller (stdout / CI
-        # job log) — streaming all ranks at scale overruns CI job-log limits.
-        #
-        # Identifying the last global rank requires the authoritative node rank,
-        # NOT the pod name. Kubeflow Trainer binds torchrun's PET_NODE_RANK to
-        # the indexed-Job completion index, stamped on each pod as the
-        # `batch.kubernetes.io/job-completion-index` label. So:
-        #     global_rank = job_completion_index * nproc_per_node + local_rank
-        # `--prefix` tags each line with `[pod/<pod>/<container>]`; we map that
-        # pod name → completion index (refreshed on every (re)connect, since a
-        # gang restart spawns new pod names) and pair it with torchrun's
-        # `[defaultN]` local-rank marker. `--tail=-1` replays each pod's full
-        # history on (re)attach so mid-run lines are never dropped (the previous
-        # `--tail <lines>` snapshot missed the last rank's per-step lines).
+        # validation globs log*.out and needs every rank). Forward all ranks to
+        # the caller (stdout / CI job log) too, but de-duplicated: torchrun runs
+        # the same entrypoint on every rank, so the bulk of the volume (startup,
+        # config dump, NCCL init) is byte-identical across ranks. We forward each
+        # distinct message once — which is also why the rank-specific loss line
+        # (emitted by a single, parallelism-layout-dependent rank that is usually
+        # neither rank 0 nor the last rank) and genuine per-rank errors are no
+        # longer dropped. `--prefix` tags each line with `[pod/<pod>/<container>]`
+        # and torchrun adds `[defaultN]`; both are stripped to form the dedup key.
+        # `--tail=-1` replays each pod's full history on (re)attach so mid-run
+        # lines are never dropped.
         label_selector = f"jobset.sigs.k8s.io/jobset-name={job_name}"
         cmd = [
             "kubectl",
@@ -442,61 +462,43 @@ def fetch_logs(
             "--max-log-requests",
             str(self.num_nodes),
         ]
-        nproc = self.nproc_per_node()
-        last_rank = max(self.num_nodes * nproc - 1, 0)
-        pod_re = re.compile(r"pod/([^/]+)/")
-        local_re = re.compile(r"\[default(\d+)\]")
-
-        def _pod_index_map() -> dict[str, int]:
-            """Map pod name → job-completion-index (== torchrun node rank)."""
-            try:
-                out = subprocess.run(
-                    [
-                        "kubectl",
-                        "get",
-                        "pods",
-                        "-n",
-                        self.namespace,
-                        "-l",
-                        label_selector,
-                        "-o",
-                        "json",
-                    ],
-                    capture_output=True,
-                    text=True,
-                    timeout=timeout,
-                )
-                items = json.loads(out.stdout).get("items", [])
-            except Exception as e:
-                logger.warning("Could not list pods for %s: %s", job_name, e)
-                return {}
-            mapping: dict[str, int] = {}
-            for item in items:
-                meta = item.get("metadata", {})
-                name = meta.get("name")
-                idx = (meta.get("labels", {}) or {}).get("batch.kubernetes.io/job-completion-index")
-                if name is not None and idx is not None:
-                    mapping[name] = int(idx)
-            return mapping
-
-        def _forward_to_stdout(log_line: str, pod_index: dict[str, int]) -> bool:
-            """True for the first and last *global rank* only.
-
-            Kubeflow Trainer sets torchrun's PET_NODE_RANK from the JobSet
-            completion-index label (static), so the global rank is
-            ``node_rank * nproc_per_node + local_rank`` where node_rank is the
-            pod's completion index and local_rank is torchrun's ``[defaultN]``
-            marker. We forward global rank 0 and ``world_size - 1`` only.
-            """
-            pod_match = pod_re.search(log_line)
-            local_match = local_re.search(log_line)
-            if not pod_match or not local_match:
+        # Collapse the near-simultaneous cross-rank burst with a *sliding time
+        # window* (cf. ClusterShell `clush -b`, which gathers identical output
+        # across nodes into one line). torchrun runs the same entrypoint on
+        # every rank, so startup/config/NCCL lines arrive as a burst of
+        # byte-identical copies; we strip the per-rank `[pod/<pod>/<container>]`
+        # and `[defaultN]` markers to form a dedup key and suppress a key only
+        # if an identical line was already forwarded within `dedup_window_s`.
+        # Unlike a global set this is bounded in both memory and time: a line
+        # that legitimately recurs later (e.g. a periodic "saving checkpoint")
+        # is forwarded again once the window passes, and a continuously
+        # repeating line is rate-limited to once per window rather than
+        # suppressed for the whole run. Lines whose body differs across ranks
+        # (per-step loss, `[rankN]` errors) keep distinct keys and are never
+        # collapsed. The full per-rank stream still goes to log-allranks_0.out.
+        rank_marker_re = re.compile(r"\[pod/[^\]]+\]\s*|\[default\d+\]:?\s*")
+        dedup_window_s = 60.0
+        last_forwarded: dict[str, float] = {}
+
+        def _should_forward(log_line: str) -> bool:
+            key = rank_marker_re.sub("", log_line).strip()
+            if not key:
+                # Blank / prefix-only line: no content, so don't forward it to
+                # the CI log (every rank emits these; they're pure noise). The
+                # full per-rank stream still captures them in log-allranks_0.out.
                 return False
-            node = pod_index.get(pod_match.group(1))
-            if node is None:
+            now = time.monotonic()
+            prev = last_forwarded.get(key)
+            if prev is not None and now - prev < dedup_window_s:
                 return False
-            global_rank = node * nproc + int(local_match.group(1))
-            return global_rank == 0 or global_rank == last_rank
+            last_forwarded[key] = now
+            # Bound memory: once the map is large, drop keys older than the
+            # window (they can no longer suppress anything).
+            if len(last_forwarded) > 20000:
+                stale = now - dedup_window_s
+                for k in [k for k, t in last_forwarded.items() if t < stale]:
+                    del last_forwarded[k]
+            return True
 
         all_ranks_path = os.path.join(self.job_dir, "log-allranks_0.out")
         os.makedirs(self.job_dir, exist_ok=True)
@@ -506,7 +508,6 @@ def _forward_to_stdout(log_line: str, pod_index: dict[str, int]) -> bool:
             # Retry kubectl logs -f until the job reaches a terminal state.
             # This handles both pods not yet running and transient mid-stream failures.
             while True:
-                pod_index = _pod_index_map()
                 proc = subprocess.Popen(
                     cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL, text=True, bufsize=1
                 )
@@ -516,14 +517,14 @@ def _forward_to_stdout(log_line: str, pod_index: dict[str, int]) -> bool:
                         for line in iter(proc.stdout.readline, ""):
                             if line:
                                 all_ranks_file.write(line)
-                                if _forward_to_stdout(line, pod_index):
+                                if _should_forward(line):
                                     lines_yielded += 1
                                     yield line
                             if proc.poll() is not None:
                                 for remaining in proc.stdout:
                                     if remaining:
                                         all_ranks_file.write(remaining)
-                                        if _forward_to_stdout(remaining, pod_index):
+                                        if _should_forward(remaining):
                                             lines_yielded += 1
                                             yield remaining
                                 break
@@ -543,12 +544,11 @@ def _forward_to_stdout(log_line: str, pod_index: dict[str, int]) -> bool:
                 )
                 time.sleep(5)
         else:
-            pod_index = _pod_index_map()
             result = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
             with open(all_ranks_path, "a") as all_ranks_file:
                 for line in result.stdout.splitlines():
                     all_ranks_file.write(line + "\n")
-                    if _forward_to_stdout(line, pod_index):
+                    if _should_forward(line):
                         yield line
 
     def cancel(
@@ -614,7 +614,7 @@ def cancel(
     # ── Workdir sync helpers ──────────────────────────────────────────────────
 
     def _data_mover_pod_name(self, job_name: str) -> str:
-        return f"{job_name}-data-mover"
+        return f"{self._trainjob_name(job_name)}-data-mover"
 
     def _start_data_mover_pod(self, pod_name: str, timeout: int = 120) -> None:
         """Spin up a throw-away Alpine pod that mounts workdir_pvc and blocks until Running.

From 7af08d28131cb52bb30b82fd16287c23aa291d1f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?oliver=20k=C3=B6nig?= <okoenig@nvidia.com>
Date: Sun, 31 May 2026 00:09:18 +0000
Subject: [PATCH 08/20] fix(kubeflow): stream logs once, not per replica
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

torchx calls scheduler.log_iter(app_id, role_name, k=...) once per replica
(k = 0..num_nodes-1). The Kubeflow log_iter ignored k and re-ran
fetch_logs — which tails the entire jobset via the jobset-name selector — for
every replica, producing N independent tail streams (each with its own dedup
state) and N-fold-duplicating every console line (prefixed <role>/<k>). At 16
nodes that's 16x the log volume, which also overruns the CI job-log limit on
long runs. Stream only for k == 0; that single tail already covers all ranks
(and writes log-allranks_0.out once).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Signed-off-by: oliver könig <okoenig@nvidia.com>
---
 nemo_run/run/torchx_backend/schedulers/kubeflow.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/nemo_run/run/torchx_backend/schedulers/kubeflow.py b/nemo_run/run/torchx_backend/schedulers/kubeflow.py
index be6e9db8..67d3ccf1 100644
--- a/nemo_run/run/torchx_backend/schedulers/kubeflow.py
+++ b/nemo_run/run/torchx_backend/schedulers/kubeflow.py
@@ -188,6 +188,14 @@ def log_iter(
         if not executor:
             return []
 
+        # fetch_logs tails ALL pods of the jobset in a single call (it powers the
+        # log-allranks_0.out capture and the cross-rank dedup). torchx invokes
+        # log_iter once per replica (k = 0..num_nodes-1); streaming on every k
+        # would re-tail the whole jobset N times — each tail with its own dedup
+        # state — and N×-duplicate every console line. Stream only for k == 0.
+        if k != 0:
+            return []
+
         logs = executor.fetch_logs(job_name=job_name, stream=should_tail)
         if isinstance(logs, str):
             if len(logs) == 0:

From 22168ed13ebb9c8d674b3327f3aae6d9580c07ef Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?oliver=20k=C3=B6nig?= <okoenig@nvidia.com>
Date: Sun, 31 May 2026 00:22:08 +0000
Subject: [PATCH 09/20] fix(kubeflow): forward rank 0 + last rank to stdout
 (not all-ranks dedup)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Revert the all-ranks sliding-window dedup back to forwarding only global rank 0
(setup/config) and the last global rank (print_rank_last per-step loss), like a
SLURM job log. The last rank is resolved at stream time from each pod's
batch.kubernetes.io/job-completion-index label (== torchrun --node-rank
$PET_NODE_RANK), so global_rank = completion_index * nproc_per_node +
local_rank is deterministic without any topology enforcement. The full per-rank
stream is still captured in log-allranks_0.out. Combined with the per-replica
log_iter guard, this stops the N-fold duplication and yields a clean two-rank
console.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Signed-off-by: oliver könig <okoenig@nvidia.com>
---
 nemo_run/core/execution/kubeflow.py | 91 ++++++++++++++++++-----------
 1 file changed, 56 insertions(+), 35 deletions(-)

diff --git a/nemo_run/core/execution/kubeflow.py b/nemo_run/core/execution/kubeflow.py
index 9d0dae47..1e5e25f8 100644
--- a/nemo_run/core/execution/kubeflow.py
+++ b/nemo_run/core/execution/kubeflow.py
@@ -14,6 +14,7 @@
 # limitations under the License.
 
 import getpass
+import json
 import logging
 import os
 import re
@@ -437,17 +438,20 @@ def fetch_logs(
         *lines* lines from a single ``kubectl logs`` call.
         """
         # Tail every rank to <job_dir>/log-allranks_0.out (downstream log
-        # validation globs log*.out and needs every rank). Forward all ranks to
-        # the caller (stdout / CI job log) too, but de-duplicated: torchrun runs
-        # the same entrypoint on every rank, so the bulk of the volume (startup,
-        # config dump, NCCL init) is byte-identical across ranks. We forward each
-        # distinct message once — which is also why the rank-specific loss line
-        # (emitted by a single, parallelism-layout-dependent rank that is usually
-        # neither rank 0 nor the last rank) and genuine per-rank errors are no
-        # longer dropped. `--prefix` tags each line with `[pod/<pod>/<container>]`
-        # and torchrun adds `[defaultN]`; both are stripped to form the dedup key.
-        # `--tail=-1` replays each pod's full history on (re)attach so mid-run
-        # lines are never dropped.
+        # validation globs log*.out and needs every rank), but forward only
+        # global rank 0 and the *last* global rank to the caller (stdout / CI
+        # job log) — rank 0 carries setup/config, the last rank carries
+        # Megatron's print_rank_last per-step loss/throughput.
+        #
+        # The last global rank is resolved at stream time, not the pod name:
+        # the trainer runs `torchrun --node-rank $PET_NODE_RANK`, and the runtime
+        # sets PET_NODE_RANK = the pod's `batch.kubernetes.io/job-completion-index`
+        # label, so global_rank = completion_index * nproc_per_node + local_rank
+        # is deterministic. `--prefix` tags each line `[pod/<pod>/<container>]`; we
+        # map that pod name → completion index (re-read on every (re)connect, since
+        # a gang restart spawns new pod names) and pair it with torchrun's
+        # `[defaultN]` local-rank marker. `--tail=-1` replays each pod's full
+        # history on (re)attach so mid-run lines are never dropped.
         label_selector = f"jobset.sigs.k8s.io/jobset-name={job_name}"
         cmd = [
             "kubectl",
@@ -476,29 +480,44 @@ def fetch_logs(
         # suppressed for the whole run. Lines whose body differs across ranks
         # (per-step loss, `[rankN]` errors) keep distinct keys and are never
         # collapsed. The full per-rank stream still goes to log-allranks_0.out.
-        rank_marker_re = re.compile(r"\[pod/[^\]]+\]\s*|\[default\d+\]:?\s*")
-        dedup_window_s = 60.0
-        last_forwarded: dict[str, float] = {}
-
-        def _should_forward(log_line: str) -> bool:
-            key = rank_marker_re.sub("", log_line).strip()
-            if not key:
-                # Blank / prefix-only line: no content, so don't forward it to
-                # the CI log (every rank emits these; they're pure noise). The
-                # full per-rank stream still captures them in log-allranks_0.out.
+        nproc = self.nproc_per_node()
+        last_rank = max(self.num_nodes * nproc - 1, 0)
+        pod_re = re.compile(r"pod/([^/]+)/")
+        local_re = re.compile(r"\[default(\d+)\]")
+
+        def _pod_index_map() -> dict[str, int]:
+            """Map pod name → job-completion-index (== torchrun node rank)."""
+            try:
+                out = subprocess.run(
+                    ["kubectl", "get", "pods", "-n", self.namespace, "-l", label_selector, "-o", "json"],
+                    capture_output=True,
+                    text=True,
+                    timeout=timeout,
+                )
+                items = json.loads(out.stdout).get("items", [])
+            except Exception as e:
+                logger.warning("Could not list pods for %s: %s", job_name, e)
+                return {}
+            mapping: dict[str, int] = {}
+            for item in items:
+                meta = item.get("metadata", {})
+                name = meta.get("name")
+                idx = (meta.get("labels", {}) or {}).get("batch.kubernetes.io/job-completion-index")
+                if name is not None and idx is not None:
+                    mapping[name] = int(idx)
+            return mapping
+
+        def _forward_to_stdout(log_line: str, pod_index: dict[str, int]) -> bool:
+            """True for global rank 0 and the last global rank only."""
+            pod_match = pod_re.search(log_line)
+            local_match = local_re.search(log_line)
+            if not pod_match or not local_match:
                 return False
-            now = time.monotonic()
-            prev = last_forwarded.get(key)
-            if prev is not None and now - prev < dedup_window_s:
+            node = pod_index.get(pod_match.group(1))
+            if node is None:
                 return False
-            last_forwarded[key] = now
-            # Bound memory: once the map is large, drop keys older than the
-            # window (they can no longer suppress anything).
-            if len(last_forwarded) > 20000:
-                stale = now - dedup_window_s
-                for k in [k for k, t in last_forwarded.items() if t < stale]:
-                    del last_forwarded[k]
-            return True
+            global_rank = node * nproc + int(local_match.group(1))
+            return global_rank == 0 or global_rank == last_rank
 
         all_ranks_path = os.path.join(self.job_dir, "log-allranks_0.out")
         os.makedirs(self.job_dir, exist_ok=True)
@@ -508,6 +527,7 @@ def _should_forward(log_line: str) -> bool:
             # Retry kubectl logs -f until the job reaches a terminal state.
             # This handles both pods not yet running and transient mid-stream failures.
             while True:
+                pod_index = _pod_index_map()
                 proc = subprocess.Popen(
                     cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL, text=True, bufsize=1
                 )
@@ -517,14 +537,14 @@ def _should_forward(log_line: str) -> bool:
                         for line in iter(proc.stdout.readline, ""):
                             if line:
                                 all_ranks_file.write(line)
-                                if _should_forward(line):
+                                if _forward_to_stdout(line, pod_index):
                                     lines_yielded += 1
                                     yield line
                             if proc.poll() is not None:
                                 for remaining in proc.stdout:
                                     if remaining:
                                         all_ranks_file.write(remaining)
-                                        if _should_forward(remaining):
+                                        if _forward_to_stdout(remaining, pod_index):
                                             lines_yielded += 1
                                             yield remaining
                                 break
@@ -544,11 +564,12 @@ def _should_forward(log_line: str) -> bool:
                 )
                 time.sleep(5)
         else:
+            pod_index = _pod_index_map()
             result = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
             with open(all_ranks_path, "a") as all_ranks_file:
                 for line in result.stdout.splitlines():
                     all_ranks_file.write(line + "\n")
-                    if _should_forward(line):
+                    if _forward_to_stdout(line, pod_index):
                         yield line
 
     def cancel(

From c0d800d034a295171332056eb32aaeb9ec48e25f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?oliver=20k=C3=B6nig?= <okoenig@nvidia.com>
Date: Sun, 31 May 2026 00:30:38 +0000
Subject: [PATCH 10/20] fix(kubeflow): forward rank-0 + the actual loss-rank
 slot to stdout
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The c10d rendezvous assigns torch ranks by join order, not by JobSet
completion-index, so torch's world_size-1 (print_rank_last's loss line) does
NOT land on the highest completion-index. Verified on a live 16-node job:
the loss prints on completion-index 9 (= num_nodes//2 + 1), local rank
nproc-1 — not index 15. Forward exactly (index 0, local 0) and
(index num_nodes//2 + 1, local nproc-1) so the console shows rank 0 setup +
the per-step loss/throughput. Full per-rank capture remains in
log-allranks_0.out. A deterministic completion-index->rank mapping
(topology/static rank ordering) would let us compute this rather than match
the observed slot.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Signed-off-by: oliver könig <okoenig@nvidia.com>
---
 nemo_run/core/execution/kubeflow.py | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/nemo_run/core/execution/kubeflow.py b/nemo_run/core/execution/kubeflow.py
index 1e5e25f8..68dcb094 100644
--- a/nemo_run/core/execution/kubeflow.py
+++ b/nemo_run/core/execution/kubeflow.py
@@ -481,7 +481,6 @@ def fetch_logs(
         # (per-step loss, `[rankN]` errors) keep distinct keys and are never
         # collapsed. The full per-rank stream still goes to log-allranks_0.out.
         nproc = self.nproc_per_node()
-        last_rank = max(self.num_nodes * nproc - 1, 0)
         pod_re = re.compile(r"pod/([^/]+)/")
         local_re = re.compile(r"\[default(\d+)\]")
 
@@ -507,8 +506,20 @@ def _pod_index_map() -> dict[str, int]:
                     mapping[name] = int(idx)
             return mapping
 
+        # Two ranks worth surfacing to the CI console: rank 0 (setup/config) at
+        # completion-index 0 / local 0, and the rank that emits print_rank_last's
+        # per-step loss/throughput. The c10d rendezvous does NOT map completion
+        # index to torch rank identically (it assigns by join order), so torch's
+        # world_size-1 does not land on the highest completion-index. Empirically
+        # on this JobSet it lands on completion-index `num_nodes//2 + 1`, local
+        # rank `nproc-1` (e.g. 16 nodes → index 9; default7 on 8-GPU, default3 on
+        # 4-GPU). Match those two slots directly. (A deterministic completion
+        # index→rank mapping — e.g. topology-aware/static rank ordering — would
+        # let us compute this instead of relying on the observed slot.)
+        last_node = self.num_nodes // 2 + 1
+
         def _forward_to_stdout(log_line: str, pod_index: dict[str, int]) -> bool:
-            """True for global rank 0 and the last global rank only."""
+            """True only for (index 0, local 0) and (index num_nodes//2+1, local nproc-1)."""
             pod_match = pod_re.search(log_line)
             local_match = local_re.search(log_line)
             if not pod_match or not local_match:
@@ -516,8 +527,8 @@ def _forward_to_stdout(log_line: str, pod_index: dict[str, int]) -> bool:
             node = pod_index.get(pod_match.group(1))
             if node is None:
                 return False
-            global_rank = node * nproc + int(local_match.group(1))
-            return global_rank == 0 or global_rank == last_rank
+            local = int(local_match.group(1))
+            return (node == 0 and local == 0) or (node == last_node and local == nproc - 1)
 
         all_ranks_path = os.path.join(self.job_dir, "log-allranks_0.out")
         os.makedirs(self.job_dir, exist_ok=True)

From e8a64f5951002215b1f264df119c745792821b63 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?oliver=20k=C3=B6nig?= <okoenig@nvidia.com>
Date: Sun, 31 May 2026 09:44:48 +0000
Subject: [PATCH 11/20] fix(kubeflow): robust log streaming across
 pod/container restarts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

fetch_logs ran a single 'kubectl logs -l -f --max-log-requests <num_nodes>'.
That follow only attaches to pods present at start, never re-attaches to a
container that restarts, and --max-log-requests == pod count has no headroom —
so a gang/NCCL-init restart that transiently doubled the matching-pod count
errored the whole command ('maximum allowed concurrency') and silently dropped
pods. Observed: a 16-node job streamed only node-0-0; the loss rank (node-0-9)
never appeared even though it was emitting per-step loss.

- --max-log-requests = max(num_nodes*2, 8): headroom for restart-transient pods.
- Periodically re-attach (threading.Timer terminates the follow every 120s) so
  pods that (re)started after the initial attach are picked up.
- Reconnects resume with --since-time, set to the max RFC3339 stamp seen so far
  (stamps are obtained via --timestamps), so re-attaching never replays
  already-emitted history; only the first attach uses --tail=-1. The kubectl
  timestamp is stripped from each line.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Signed-off-by: oliver könig <okoenig@nvidia.com>
---
 nemo_run/core/execution/kubeflow.py | 106 +++++++++++++++-------------
 1 file changed, 55 insertions(+), 51 deletions(-)

diff --git a/nemo_run/core/execution/kubeflow.py b/nemo_run/core/execution/kubeflow.py
index 68dcb094..745a9ba9 100644
--- a/nemo_run/core/execution/kubeflow.py
+++ b/nemo_run/core/execution/kubeflow.py
@@ -19,6 +19,7 @@
 import os
 import re
 import subprocess
+import threading
 import time
 import uuid
 from dataclasses import dataclass, field
@@ -446,14 +447,22 @@ def fetch_logs(
         # The last global rank is resolved at stream time, not the pod name:
         # the trainer runs `torchrun --node-rank $PET_NODE_RANK`, and the runtime
         # sets PET_NODE_RANK = the pod's `batch.kubernetes.io/job-completion-index`
-        # label, so global_rank = completion_index * nproc_per_node + local_rank
-        # is deterministic. `--prefix` tags each line `[pod/<pod>/<container>]`; we
-        # map that pod name → completion index (re-read on every (re)connect, since
-        # a gang restart spawns new pod names) and pair it with torchrun's
-        # `[defaultN]` local-rank marker. `--tail=-1` replays each pod's full
-        # history on (re)attach so mid-run lines are never dropped.
+        # label. `--prefix` tags each line `[pod/<pod>/<container>]`; we map that
+        # pod name → completion index and pair it with torchrun's `[defaultN]`
+        # local-rank marker (see _forward_to_stdout).
+        #
+        # Robust streaming: `kubectl logs -l -f` only follows the pods present
+        # when it attaches and never re-attaches to a container that restarts, so
+        # a single long-lived follow silently drops pods after a gang/NCCL-init
+        # restart. We therefore (a) give --max-log-requests headroom so a restart
+        # that transiently doubles the matching-pod count can't error the whole
+        # command ("maximum allowed concurrency"), and (b) periodically re-attach.
+        # To avoid re-emitting the full history on every re-attach, reconnects
+        # resume via `--since-time` (with `--timestamps`); only the first attach
+        # uses `--tail=-1` to capture pre-existing history.
         label_selector = f"jobset.sigs.k8s.io/jobset-name={job_name}"
-        cmd = [
+        max_log_requests = max(self.num_nodes * 2, 8)
+        base_cmd = [
             "kubectl",
             "logs",
             "-l",
@@ -461,25 +470,21 @@ def fetch_logs(
             "-n",
             self.namespace,
             "--prefix",
-            "--tail",
-            "-1",
             "--max-log-requests",
-            str(self.num_nodes),
+            str(max_log_requests),
         ]
-        # Collapse the near-simultaneous cross-rank burst with a *sliding time
-        # window* (cf. ClusterShell `clush -b`, which gathers identical output
-        # across nodes into one line). torchrun runs the same entrypoint on
-        # every rank, so startup/config/NCCL lines arrive as a burst of
-        # byte-identical copies; we strip the per-rank `[pod/<pod>/<container>]`
-        # and `[defaultN]` markers to form a dedup key and suppress a key only
-        # if an identical line was already forwarded within `dedup_window_s`.
-        # Unlike a global set this is bounded in both memory and time: a line
-        # that legitimately recurs later (e.g. a periodic "saving checkpoint")
-        # is forwarded again once the window passes, and a continuously
-        # repeating line is rate-limited to once per window rather than
-        # suppressed for the whole run. Lines whose body differs across ranks
-        # (per-step loss, `[rankN]` errors) keep distinct keys and are never
-        # collapsed. The full per-rank stream still goes to log-allranks_0.out.
+        # `--prefix --timestamps` lines look like:
+        #   [pod/<pod>/<container>] <RFC3339> [defaultN]: <message>
+        # Track the max RFC3339 stamp to resume via --since-time, and strip it so
+        # downstream sees the original `[pod/...] [defaultN]: <message>`.
+        ts_re = re.compile(r"^(\[pod/[^\]]+\])\s+(\d{4}-\d\d-\d\dT[\d:.]+Z)\s")
+
+        def _split_ts(line: str) -> tuple[Optional[str], str]:
+            m = ts_re.match(line)
+            if not m:
+                return None, line
+            return m.group(2), m.group(1) + " " + line[m.end() :]
+
         nproc = self.nproc_per_node()
         pod_re = re.compile(r"pod/([^/]+)/")
         local_re = re.compile(r"\[default(\d+)\]")
@@ -534,49 +539,48 @@ def _forward_to_stdout(log_line: str, pod_index: dict[str, int]) -> bool:
         os.makedirs(self.job_dir, exist_ok=True)
 
         if stream:
-            cmd.append("-f")
-            # Retry kubectl logs -f until the job reaches a terminal state.
-            # This handles both pods not yet running and transient mid-stream failures.
+            reattach_interval_s = 120.0
+            since_time: Optional[str] = None
             while True:
                 pod_index = _pod_index_map()
+                attempt_cmd = base_cmd + ["--timestamps", "-f"]
+                # First attach replays history (--tail=-1); reconnects resume from
+                # the last seen timestamp so re-attaching never re-emits old lines.
+                attempt_cmd += ["--tail", "-1"] if since_time is None else ["--since-time", since_time]
                 proc = subprocess.Popen(
-                    cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL, text=True, bufsize=1
+                    attempt_cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL, text=True, bufsize=1
                 )
-                lines_yielded = 0
+                # Force a periodic re-attach (terminate → reconnect) so pods that
+                # (re)started after this attach are picked up; --since-time keeps
+                # the reconnect from replaying history.
+                reattach_timer = threading.Timer(reattach_interval_s, proc.terminate)
+                reattach_timer.start()
                 try:
                     with open(all_ranks_path, "a") as all_ranks_file:
-                        for line in iter(proc.stdout.readline, ""):
-                            if line:
-                                all_ranks_file.write(line)
-                                if _forward_to_stdout(line, pod_index):
-                                    lines_yielded += 1
-                                    yield line
-                            if proc.poll() is not None:
-                                for remaining in proc.stdout:
-                                    if remaining:
-                                        all_ranks_file.write(remaining)
-                                        if _forward_to_stdout(remaining, pod_index):
-                                            lines_yielded += 1
-                                            yield remaining
-                                break
+                        for raw in iter(proc.stdout.readline, ""):
+                            if not raw:
+                                continue
+                            ts, line = _split_ts(raw)
+                            if ts is not None and (since_time is None or ts > since_time):
+                                since_time = ts
+                            all_ranks_file.write(line)
+                            if _forward_to_stdout(line, pod_index):
+                                yield line
                 except Exception as e:
                     logger.warning("Error streaming logs: %s; retrying", e)
                 finally:
+                    reattach_timer.cancel()
                     proc.terminate()
                     proc.wait(timeout=2)
                 state = self.status(job_name)
                 if state in (KubeflowJobState.SUCCEEDED, KubeflowJobState.FAILED):
                     break  # job reached a terminal state, stop streaming
-                logger.warning(
-                    "kubectl logs exited (rc=%d, lines=%d, state=%s); retrying",
-                    proc.returncode,
-                    lines_yielded,
-                    state,
-                )
-                time.sleep(5)
+                time.sleep(2)
         else:
             pod_index = _pod_index_map()
-            result = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
+            result = subprocess.run(
+                base_cmd + ["--tail", "-1"], capture_output=True, text=True, timeout=timeout
+            )
             with open(all_ranks_path, "a") as all_ranks_file:
                 for line in result.stdout.splitlines():
                     all_ranks_file.write(line + "\n")

From ec27ed58d5a150630ff8f59ffe075cf609af7353 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?oliver=20k=C3=B6nig?= <okoenig@nvidia.com>
Date: Sun, 31 May 2026 09:53:39 +0000
Subject: [PATCH 12/20] fix(kubeflow): resolve rank-0/last pods from worker
 GROUP_RANK, not a heuristic
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The console forwarded rank 0 and the loss-reporting rank (world_size-1) by
completion-index, using an empirical 'num_nodes//2+1' slot for world_size-1.
That's fragile: the c10d rendezvous assigns torch ranks by join order, not by
JobSet completion-index, so the loss-reporting rank lands on an unpredictable
pod (observed: completion-index 9 was actually GROUP_RANK 15, which holds
RANK 63 = world_size-1).

Read the ground truth instead: torchrun exports GROUP_RANK into every worker's
/proc/<pid>/environ, so reading that file via 'kubectl exec <pod> -- sh -c ...'
tells us exactly which pod holds GROUP_RANK 0 (RANK 0, local 0) and which holds
GROUP_RANK num_nodes-1 (RANK world_size-1, local nproc-1). Resolve the
pod->GROUP_RANK map once the workers exist, cache it, and re-resolve when the
rank-0/last pod is no longer covered (a gang restart reshuffles ranks). Until
the workers come up (empty map), fall back to the completion-index-0 pod so
early setup output still streams.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Signed-off-by: oliver könig <okoenig@nvidia.com>
---
 nemo_run/core/execution/kubeflow.py | 71 ++++++++++++++++++++++-------
 1 file changed, 55 insertions(+), 16 deletions(-)

diff --git a/nemo_run/core/execution/kubeflow.py b/nemo_run/core/execution/kubeflow.py
index 745a9ba9..3e287f72 100644
--- a/nemo_run/core/execution/kubeflow.py
+++ b/nemo_run/core/execution/kubeflow.py
@@ -511,29 +511,66 @@ def _pod_index_map() -> dict[str, int]:
                     mapping[name] = int(idx)
             return mapping
 
-        # Two ranks worth surfacing to the CI console: rank 0 (setup/config) at
-        # completion-index 0 / local 0, and the rank that emits print_rank_last's
-        # per-step loss/throughput. The c10d rendezvous does NOT map completion
-        # index to torch rank identically (it assigns by join order), so torch's
-        # world_size-1 does not land on the highest completion-index. Empirically
-        # on this JobSet it lands on completion-index `num_nodes//2 + 1`, local
-        # rank `nproc-1` (e.g. 16 nodes → index 9; default7 on 8-GPU, default3 on
-        # 4-GPU). Match those two slots directly. (A deterministic completion
-        # index→rank mapping — e.g. topology-aware/static rank ordering — would
-        # let us compute this instead of relying on the observed slot.)
-        last_node = self.num_nodes // 2 + 1
+        # Forward only RANK 0 (setup/config) and RANK world_size-1 (Megatron's
+        # print_rank_last per-step loss/throughput). The c10d rendezvous assigns
+        # torch ranks by join order, NOT by JobSet completion-index, so we read
+        # the ground truth: torchrun exports GROUP_RANK (the node rank) into every
+        # worker's /proc/<pid>/environ. The pod whose worker has GROUP_RANK 0
+        # holds RANK 0 (local 0); the pod with GROUP_RANK num_nodes-1 holds
+        # RANK world_size-1 (local nproc-1). The map is resolved once the workers
+        # exist (post-rendezvous) and cached; it is re-resolved when the rank-0 or
+        # last pod is no longer covered (a gang restart reshuffles ranks). Before
+        # the workers come up (empty map) we fall back to the completion-index-0
+        # pod so early setup output still streams.
+        last_group_rank = self.num_nodes - 1
+        group_rank_map: dict[str, int] = {}
+
+        def _read_group_rank(pod: str) -> Optional[int]:
+            """Read a torchrun worker's GROUP_RANK from /proc/<pid>/environ in *pod*."""
+            script = (
+                "for e in /proc/[0-9]*/environ; do "
+                "g=$(tr '\\0' '\\n' < \"$e\" 2>/dev/null | grep -m1 '^GROUP_RANK='); "
+                "[ -n \"$g\" ] && { echo \"$g\"; break; }; done"
+            )
+            try:
+                out = subprocess.run(
+                    ["kubectl", "exec", pod, "-n", self.namespace, "-c", "node", "--", "sh", "-c", script],
+                    capture_output=True,
+                    text=True,
+                    timeout=min(timeout, 30),
+                )
+            except Exception:
+                return None
+            m = re.search(r"GROUP_RANK=(\d+)", out.stdout)
+            return int(m.group(1)) if m else None
+
+        def _ensure_group_ranks(current_pods: set[str]) -> None:
+            """Resolve pod → GROUP_RANK via worker environ if rank 0 / last not yet covered."""
+            covered = {group_rank_map[p] for p in current_pods if p in group_rank_map}
+            if 0 in covered and last_group_rank in covered:
+                return
+            group_rank_map.clear()
+            for pod in current_pods:
+                g = _read_group_rank(pod)
+                if g is not None:
+                    group_rank_map[pod] = g
 
         def _forward_to_stdout(log_line: str, pod_index: dict[str, int]) -> bool:
-            """True only for (index 0, local 0) and (index num_nodes//2+1, local nproc-1)."""
+            """True only for RANK 0 and RANK world_size-1.
+
+            Uses the resolved GROUP_RANK map; before the workers come up (empty
+            map) falls back to the completion-index-0 pod for early setup output.
+            """
             pod_match = pod_re.search(log_line)
             local_match = local_re.search(log_line)
             if not pod_match or not local_match:
                 return False
-            node = pod_index.get(pod_match.group(1))
-            if node is None:
-                return False
+            pod = pod_match.group(1)
             local = int(local_match.group(1))
-            return (node == 0 and local == 0) or (node == last_node and local == nproc - 1)
+            gr = group_rank_map.get(pod)
+            if gr is not None:
+                return (gr == 0 and local == 0) or (gr == last_group_rank and local == nproc - 1)
+            return pod_index.get(pod) == 0 and local == 0
 
         all_ranks_path = os.path.join(self.job_dir, "log-allranks_0.out")
         os.makedirs(self.job_dir, exist_ok=True)
@@ -543,6 +580,7 @@ def _forward_to_stdout(log_line: str, pod_index: dict[str, int]) -> bool:
             since_time: Optional[str] = None
             while True:
                 pod_index = _pod_index_map()
+                _ensure_group_ranks(set(pod_index))
                 attempt_cmd = base_cmd + ["--timestamps", "-f"]
                 # First attach replays history (--tail=-1); reconnects resume from
                 # the last seen timestamp so re-attaching never re-emits old lines.
@@ -578,6 +616,7 @@ def _forward_to_stdout(log_line: str, pod_index: dict[str, int]) -> bool:
                 time.sleep(2)
         else:
             pod_index = _pod_index_map()
+            _ensure_group_ranks(set(pod_index))
             result = subprocess.run(
                 base_cmd + ["--tail", "-1"], capture_output=True, text=True, timeout=timeout
             )

From 213ba39e41b273620f8b9988d603c529caa154ec Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?oliver=20k=C3=B6nig?= <okoenig@nvidia.com>
Date: Sun, 31 May 2026 13:20:14 +0000
Subject: [PATCH 13/20] fix(kubeflow): emit forwarded log lines in timestamp
 order
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

'kubectl logs -l ... -f' multiplexes every pod into one stream in ARRIVAL
order, not timestamp order. Because the console forwards two pods (rank 0 and
the last rank), their lines could interleave in the wrong order — e.g. two
rank-0 'Step Time' lines bunching up before the last rank's 'iteration N'
line, or a step time landing under the next iteration.

Add a small reorder buffer on the forwarded (yielded) subset only: each line
already carries the kubelet --timestamps value (parsed to epoch via the new
_ts_epoch), so hold lines until they are older than reorder_hold_s (2s) and
emit sorted by timestamp. The window comfortably absorbs cross-node clock skew
+ flush jitter while keeping the console near-live. The buffer is drained in
order after each proc ends (re-attach) — outside finally, since yielding during
generator close is unsafe. The full all-ranks debug file is untouched.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Signed-off-by: oliver könig <okoenig@nvidia.com>
---
 nemo_run/core/execution/kubeflow.py | 40 ++++++++++++++++++++++++++++-
 1 file changed, 39 insertions(+), 1 deletion(-)

diff --git a/nemo_run/core/execution/kubeflow.py b/nemo_run/core/execution/kubeflow.py
index 3e287f72..d8cf975d 100644
--- a/nemo_run/core/execution/kubeflow.py
+++ b/nemo_run/core/execution/kubeflow.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import calendar
 import getpass
 import json
 import logging
@@ -485,6 +486,17 @@ def _split_ts(line: str) -> tuple[Optional[str], str]:
                 return None, line
             return m.group(2), m.group(1) + " " + line[m.end() :]
 
+        epoch_re = re.compile(r"^(\d{4})-(\d\d)-(\d\d)T(\d\d):(\d\d):(\d\d)(?:\.(\d+))?Z?$")
+
+        def _ts_epoch(ts: str) -> Optional[float]:
+            """RFC3339 UTC stamp (kubectl --timestamps, ns precision) → epoch seconds."""
+            m = epoch_re.match(ts)
+            if not m:
+                return None
+            y, mo, d, h, mi, s = (int(m.group(i)) for i in range(1, 7))
+            frac = float("0." + m.group(7)) if m.group(7) else 0.0
+            return calendar.timegm((y, mo, d, h, mi, s, 0, 0, 0)) + frac
+
         nproc = self.nproc_per_node()
         pod_re = re.compile(r"pod/([^/]+)/")
         local_re = re.compile(r"\[default(\d+)\]")
@@ -577,6 +589,13 @@ def _forward_to_stdout(log_line: str, pod_index: dict[str, int]) -> bool:
 
         if stream:
             reattach_interval_s = 120.0
+            # `kubectl logs -l ... -f` multiplexes pods in ARRIVAL order, so the two
+            # forwarded streams (rank 0 and the last rank, on different pods) can
+            # interleave out of timestamp order. Hold each forwarded line in a small
+            # buffer and emit sorted by the kubelet --timestamps value once it is
+            # older than REORDER_HOLD_S — long enough to absorb cross-node clock
+            # skew + flush jitter, short enough to keep the console near-live.
+            reorder_hold_s = 2.0
             since_time: Optional[str] = None
             while True:
                 pod_index = _pod_index_map()
@@ -593,6 +612,7 @@ def _forward_to_stdout(log_line: str, pod_index: dict[str, int]) -> bool:
                 # the reconnect from replaying history.
                 reattach_timer = threading.Timer(reattach_interval_s, proc.terminate)
                 reattach_timer.start()
+                reorder_buf: list[tuple[float, str]] = []
                 try:
                     with open(all_ranks_path, "a") as all_ranks_file:
                         for raw in iter(proc.stdout.readline, ""):
@@ -602,14 +622,32 @@ def _forward_to_stdout(log_line: str, pod_index: dict[str, int]) -> bool:
                             if ts is not None and (since_time is None or ts > since_time):
                                 since_time = ts
                             all_ranks_file.write(line)
-                            if _forward_to_stdout(line, pod_index):
+                            if not _forward_to_stdout(line, pod_index):
+                                continue
+                            ep = _ts_epoch(ts) if ts else None
+                            if ep is None:
                                 yield line
+                                continue
+                            reorder_buf.append((ep, line))
+                            reorder_buf.sort(key=lambda x: x[0])
+                            cutoff = ep - reorder_hold_s
+                            ready = 0
+                            while ready < len(reorder_buf) and reorder_buf[ready][0] <= cutoff:
+                                ready += 1
+                            for _, ready_line in reorder_buf[:ready]:
+                                yield ready_line
+                            del reorder_buf[:ready]
                 except Exception as e:
                     logger.warning("Error streaming logs: %s; retrying", e)
                 finally:
                     reattach_timer.cancel()
                     proc.terminate()
                     proc.wait(timeout=2)
+                # Flush the rest in timestamp order before re-attaching (yielding in
+                # finally is unsafe on generator close, so drain here).
+                reorder_buf.sort(key=lambda x: x[0])
+                for _, ready_line in reorder_buf:
+                    yield ready_line
                 state = self.status(job_name)
                 if state in (KubeflowJobState.SUCCEEDED, KubeflowJobState.FAILED):
                     break  # job reached a terminal state, stop streaming

From 981e6f9e28001807132beb724ae2227fbb2da1b2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?oliver=20k=C3=B6nig?= <okoenig@nvidia.com>
Date: Sun, 31 May 2026 15:16:40 +0000
Subject: [PATCH 14/20] feat(kubeflow): support pod-template annotations/labels
 (podTemplateOverrides metadata)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The executor's existing 'annotations' land on the TrainJob object. GKE multi-network
attach (networking.gke.io/interfaces, for GPUDirect-RDMA/gIB) is read off the trainer
POD, not the TrainJob — add pod_annotations (and pod_labels) that flow into
podTemplateOverrides[].metadata, which the Kubeflow Trainer v2 CRD supports.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Signed-off-by: oliver könig <okoenig@nvidia.com>
---
 nemo_run/core/execution/kubeflow.py | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/nemo_run/core/execution/kubeflow.py b/nemo_run/core/execution/kubeflow.py
index d8cf975d..cd033c1b 100644
--- a/nemo_run/core/execution/kubeflow.py
+++ b/nemo_run/core/execution/kubeflow.py
@@ -93,6 +93,11 @@ class KubeflowExecutor(Executor):
     volumes: list[dict[str, Any]] = field(default_factory=list)
     labels: dict[str, Any] = field(default_factory=dict)
     annotations: dict[str, Any] = field(default_factory=dict)
+    # pod_annotations land on the trainer POD template (podTemplateOverrides[].metadata),
+    # not the TrainJob object — needed for e.g. GKE multi-network attach
+    # (networking.gke.io/interfaces) which is read off the pod, not the TrainJob.
+    pod_annotations: dict[str, Any] = field(default_factory=dict)
+    pod_labels: dict[str, Any] = field(default_factory=dict)
     tolerations: list[dict[str, Any]] = field(default_factory=list)
     affinity: dict[str, Any] = field(default_factory=dict)
     # env_list accepts full env var dicts (e.g. valueFrom/secretKeyRef).
@@ -267,10 +272,18 @@ def get_job_body(self, name: str, command: list[str]) -> dict:
             "runtimeRef": {"name": self.runtime_ref},
             "trainer": trainer,
         }
-        if pod_spec_override:
-            spec["podTemplateOverrides"] = [
-                {"targetJobs": [{"name": "node"}], "spec": pod_spec_override}
-            ]
+        if pod_spec_override or self.pod_annotations or self.pod_labels:
+            override_entry: dict[str, Any] = {"targetJobs": [{"name": "node"}]}
+            if pod_spec_override:
+                override_entry["spec"] = pod_spec_override
+            pod_meta: dict[str, Any] = {}
+            if self.pod_labels:
+                pod_meta["labels"] = self.pod_labels
+            if self.pod_annotations:
+                pod_meta["annotations"] = self.pod_annotations
+            if pod_meta:
+                override_entry["metadata"] = pod_meta
+            spec["podTemplateOverrides"] = [override_entry]
         spec.update(self.spec_kwargs)
 
         metadata: dict[str, Any] = {"name": name, "namespace": self.namespace}

From c23cecf7d5de6298307de4e1c52f0b71d02a1974 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?oliver=20k=C3=B6nig?= <okoenig@nvidia.com>
Date: Sun, 31 May 2026 22:12:30 +0000
Subject: [PATCH 15/20] fix(kubeflow): resolve rank-0 and last rank before
 forwarding logs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

On first attach the GROUP_RANK pod map is empty until the torchrun workers
finish rendezvous, so _forward_to_stdout fell back to rank-0-only and the
last rank's early per-step loss/throughput lines (replayed via --tail=-1)
were written to log-allranks but never forwarded to stdout — the CI log
silently dropped the beginning of the run until a re-attach ~120s later,
by which point --since-time skips the replayed history.

Poll on the first attach until both rank 0 and the last rank resolve before
forwarding, capped at 600s (then fall back). The wait is gated on a
non-empty pod list, so it is a no-op when pods can't be listed (no kubectl
/ unit tests) and engages only for real runs.

Signed-off-by: oliver könig <okoenig@nvidia.com>
---
 nemo_run/core/execution/kubeflow.py | 30 +++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/nemo_run/core/execution/kubeflow.py b/nemo_run/core/execution/kubeflow.py
index cd033c1b..04a32b65 100644
--- a/nemo_run/core/execution/kubeflow.py
+++ b/nemo_run/core/execution/kubeflow.py
@@ -609,10 +609,40 @@ def _forward_to_stdout(log_line: str, pod_index: dict[str, int]) -> bool:
             # older than REORDER_HOLD_S — long enough to absorb cross-node clock
             # skew + flush jitter, short enough to keep the console near-live.
             reorder_hold_s = 2.0
+            # First attach: resolve BOTH rank 0 and the last rank before forwarding
+            # any line. GROUP_RANK is only readable once the torchrun workers have
+            # rendezvoused, so the map is empty at first and _forward_to_stdout would
+            # fall back to rank-0-only — the last rank's early per-step loss lines
+            # (replayed via --tail=-1) would land in log-allranks but never reach
+            # stdout, silently dropping the beginning of the run from the CI log.
+            # Poll until both are resolved, capped so a run that never exposes
+            # GROUP_RANK still streams (with the completion-index fallback).
+            rank_resolve_timeout_s = 600.0
+            rank_resolve_poll_s = 5.0
             since_time: Optional[str] = None
             while True:
                 pod_index = _pod_index_map()
                 _ensure_group_ranks(set(pod_index))
+                if since_time is None:
+                    # First attach: wait until BOTH rank 0 and the last rank are
+                    # resolved before forwarding, so the last rank's early per-step
+                    # lines (replayed via --tail=-1) reach stdout instead of only
+                    # log-allranks. Only wait while pods are actually listable; an
+                    # empty list (no kubectl / unit tests) skips the wait and streams
+                    # with the existing completion-index fallback.
+                    resolve_deadline = time.time() + rank_resolve_timeout_s
+                    while pod_index and not {0, last_group_rank} <= set(group_rank_map.values()):
+                        if time.time() >= resolve_deadline:
+                            logger.warning(
+                                "rank 0 / last rank (%d) not both resolved within %.0fs; "
+                                "forwarding with completion-index fallback",
+                                last_group_rank,
+                                rank_resolve_timeout_s,
+                            )
+                            break
+                        time.sleep(rank_resolve_poll_s)
+                        pod_index = _pod_index_map()
+                        _ensure_group_ranks(set(pod_index))
                 attempt_cmd = base_cmd + ["--timestamps", "-f"]
                 # First attach replays history (--tail=-1); reconnects resume from
                 # the last seen timestamp so re-attaching never re-emits old lines.

From 2b344dc553cc98e3244b04415c334cb213c259c8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?oliver=20k=C3=B6nig?= <okoenig@nvidia.com>
Date: Mon, 1 Jun 2026 06:59:36 +0000
Subject: [PATCH 16/20] fix(kubeflow): wait for rank-0/last to resolve, never
 fall back to completion-index
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The first-attach barrier capped the wait at 600s and then forwarded with the
completion-index heuristic, which streams the wrong rank. A job can legitimately
sit Pending (starved for nodes) far longer than 600s, so it would time out and
mis-forward. Drop the timeout/fallback: keep polling while the job is alive and
stop only when it reaches a terminal state. --tail=-1 on first attach replays
history, so waiting loses nothing.

Signed-off-by: oliver könig <okoenig@nvidia.com>
---
 nemo_run/core/execution/kubeflow.py | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/nemo_run/core/execution/kubeflow.py b/nemo_run/core/execution/kubeflow.py
index 04a32b65..a6e28a36 100644
--- a/nemo_run/core/execution/kubeflow.py
+++ b/nemo_run/core/execution/kubeflow.py
@@ -617,7 +617,6 @@ def _forward_to_stdout(log_line: str, pod_index: dict[str, int]) -> bool:
             # stdout, silently dropping the beginning of the run from the CI log.
             # Poll until both are resolved, capped so a run that never exposes
             # GROUP_RANK still streams (with the completion-index fallback).
-            rank_resolve_timeout_s = 600.0
             rank_resolve_poll_s = 5.0
             since_time: Optional[str] = None
             while True:
@@ -627,18 +626,17 @@ def _forward_to_stdout(log_line: str, pod_index: dict[str, int]) -> bool:
                     # First attach: wait until BOTH rank 0 and the last rank are
                     # resolved before forwarding, so the last rank's early per-step
                     # lines (replayed via --tail=-1) reach stdout instead of only
-                    # log-allranks. Only wait while pods are actually listable; an
-                    # empty list (no kubectl / unit tests) skips the wait and streams
-                    # with the existing completion-index fallback.
-                    resolve_deadline = time.time() + rank_resolve_timeout_s
+                    # log-allranks. Never fall back to the completion-index heuristic
+                    # — it forwards the wrong rank. The job may sit Pending (waiting
+                    # for nodes) or be mid-rendezvous, so keep waiting while it is
+                    # alive; --tail=-1 on first attach replays history, so nothing is
+                    # lost by waiting. Stop only if the job reaches a terminal state
+                    # (or pods aren't listable at all — e.g. no kubectl / unit tests).
                     while pod_index and not {0, last_group_rank} <= set(group_rank_map.values()):
-                        if time.time() >= resolve_deadline:
-                            logger.warning(
-                                "rank 0 / last rank (%d) not both resolved within %.0fs; "
-                                "forwarding with completion-index fallback",
-                                last_group_rank,
-                                rank_resolve_timeout_s,
-                            )
+                        if self.status(job_name) in (
+                            KubeflowJobState.SUCCEEDED,
+                            KubeflowJobState.FAILED,
+                        ):
                             break
                         time.sleep(rank_resolve_poll_s)
                         pod_index = _pod_index_map()

From 2870f128d0802bfbb1fdfa50276031263d7077c3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?oliver=20k=C3=B6nig?= <okoenig@nvidia.com>
Date: Mon, 1 Jun 2026 10:15:57 +0000
Subject: [PATCH 17/20] style(kubeflow): ruff-format kubeflow.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: oliver könig <okoenig@nvidia.com>
---
 nemo_run/core/execution/kubeflow.py | 42 ++++++++++++++++++++++++-----
 1 file changed, 36 insertions(+), 6 deletions(-)

diff --git a/nemo_run/core/execution/kubeflow.py b/nemo_run/core/execution/kubeflow.py
index a6e28a36..8599e612 100644
--- a/nemo_run/core/execution/kubeflow.py
+++ b/nemo_run/core/execution/kubeflow.py
@@ -315,7 +315,9 @@ def _trainjob_name(self, fallback: str) -> str:
         cached = getattr(self, "_k8s_job_name", None)
         if cached is not None:
             return cached
-        base = re.sub(r"[^a-z0-9-]+", "-", (self.train_job_basename or fallback or "job").lower()).strip("-")
+        base = re.sub(
+            r"[^a-z0-9-]+", "-", (self.train_job_basename or fallback or "job").lower()
+        ).strip("-")
         uid = uuid.uuid4().hex[:6]
         base = base[: 33 - len(uid) - 1].strip("-") or "job"
         self._k8s_job_name = f"{base}-{uid}"
@@ -518,7 +520,17 @@ def _pod_index_map() -> dict[str, int]:
             """Map pod name → job-completion-index (== torchrun node rank)."""
             try:
                 out = subprocess.run(
-                    ["kubectl", "get", "pods", "-n", self.namespace, "-l", label_selector, "-o", "json"],
+                    [
+                        "kubectl",
+                        "get",
+                        "pods",
+                        "-n",
+                        self.namespace,
+                        "-l",
+                        label_selector,
+                        "-o",
+                        "json",
+                    ],
                     capture_output=True,
                     text=True,
                     timeout=timeout,
@@ -555,11 +567,23 @@ def _read_group_rank(pod: str) -> Optional[int]:
             script = (
                 "for e in /proc/[0-9]*/environ; do "
                 "g=$(tr '\\0' '\\n' < \"$e\" 2>/dev/null | grep -m1 '^GROUP_RANK='); "
-                "[ -n \"$g\" ] && { echo \"$g\"; break; }; done"
+                '[ -n "$g" ] && { echo "$g"; break; }; done'
             )
             try:
                 out = subprocess.run(
-                    ["kubectl", "exec", pod, "-n", self.namespace, "-c", "node", "--", "sh", "-c", script],
+                    [
+                        "kubectl",
+                        "exec",
+                        pod,
+                        "-n",
+                        self.namespace,
+                        "-c",
+                        "node",
+                        "--",
+                        "sh",
+                        "-c",
+                        script,
+                    ],
                     capture_output=True,
                     text=True,
                     timeout=min(timeout, 30),
@@ -644,9 +668,15 @@ def _forward_to_stdout(log_line: str, pod_index: dict[str, int]) -> bool:
                 attempt_cmd = base_cmd + ["--timestamps", "-f"]
                 # First attach replays history (--tail=-1); reconnects resume from
                 # the last seen timestamp so re-attaching never re-emits old lines.
-                attempt_cmd += ["--tail", "-1"] if since_time is None else ["--since-time", since_time]
+                attempt_cmd += (
+                    ["--tail", "-1"] if since_time is None else ["--since-time", since_time]
+                )
                 proc = subprocess.Popen(
-                    attempt_cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL, text=True, bufsize=1
+                    attempt_cmd,
+                    stdout=subprocess.PIPE,
+                    stderr=subprocess.DEVNULL,
+                    text=True,
+                    bufsize=1,
                 )
                 # Force a periodic re-attach (terminate → reconnect) so pods that
                 # (re)started after this attach are picked up; --since-time keeps

From b6c3d8fb4b127d1fd608152fca69faab4cbe1713 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?oliver=20k=C3=B6nig?= <okoenig@nvidia.com>
Date: Mon, 1 Jun 2026 10:31:53 +0000
Subject: [PATCH 18/20] test(kubeflow): update stale tests for uuid names,
 idempotent 409, rank-0/last log forwarding
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The base kubeflow rewrite changed behavior, but the tests still asserted the old
behavior: TrainJob names are now <base>-<uuid6>; a 409 cancels the stale job and
recreates it (idempotent) rather than raising; and fetch_logs writes every rank
to <job_dir>/log-allranks_0.out while forwarding only rank-0 + the last rank to
stdout. Set job_dir, patch status/time.sleep to avoid the retry-loop hang, and
assert the all-ranks file + uuid-suffixed names.

Signed-off-by: oliver könig <okoenig@nvidia.com>
---
 test/core/execution/test_kubeflow.py | 82 ++++++++++++++++++----------
 1 file changed, 52 insertions(+), 30 deletions(-)

diff --git a/test/core/execution/test_kubeflow.py b/test/core/execution/test_kubeflow.py
index eb406fa9..4f037a8b 100644
--- a/test/core/execution/test_kubeflow.py
+++ b/test/core/execution/test_kubeflow.py
@@ -243,7 +243,8 @@ def test_launch_success(self, executor, mock_k8s_clients):
         mock_custom.create_namespaced_custom_object.return_value = {}
 
         job_name, state = executor.launch("test-job", ["/bin/bash", "-c", "echo hi"])
-        assert job_name == "test-job"
+        # TrainJob names are <base>-<uuid6> (RFC-1123 safe, unique per launch)
+        assert job_name.startswith("test-job-") and len(job_name) == len("test-job-") + 6
         assert state == KubeflowJobState.CREATED
         mock_custom.create_namespaced_custom_object.assert_called_once()
 
@@ -272,12 +273,18 @@ def test_launch_wait_timeout(self, executor, mock_k8s_clients):
             with pytest.raises(RuntimeError, match="did not reach RUNNING"):
                 executor.launch("test-job", ["echo"], wait=True, timeout=-1)
 
-    def test_launch_conflict(self, executor, mock_k8s_clients):
+    def test_launch_conflict_recreates(self, executor, mock_k8s_clients):
         mock_custom, _ = mock_k8s_clients
-        mock_custom.create_namespaced_custom_object.side_effect = ApiException(status=409)
+        # A 409 means a stale TrainJob from a prior attempt lingers; launch cancels
+        # it and recreates so the caller's retry makes progress (idempotent launch).
+        mock_custom.create_namespaced_custom_object.side_effect = [ApiException(status=409), {}]
 
-        with pytest.raises(RuntimeError, match="already exists"):
-            executor.launch("test-job", ["/bin/bash", "-c", "echo hi"])
+        with patch.object(executor, "cancel") as mock_cancel:
+            _, state = executor.launch("test-job", ["/bin/bash", "-c", "echo hi"])
+
+        mock_cancel.assert_called_once()
+        assert mock_custom.create_namespaced_custom_object.call_count == 2
+        assert state == KubeflowJobState.CREATED
 
     def test_status_running(self, executor, mock_k8s_clients):
         mock_custom, _ = mock_k8s_clients
@@ -346,34 +353,38 @@ def test_cancel_with_wait_timeout(self, executor, mock_k8s_clients):
 
     # ── Logs ─────────────────────────────────────────────────────────────────────
 
-    def test_fetch_logs_no_follow(self, executor, mock_k8s_clients):
+    def test_fetch_logs_no_follow(self, executor, mock_k8s_clients, tmp_path):
+        executor.job_dir = str(tmp_path)
         with patch("subprocess.run") as mock_run:
             mock_run.return_value = MagicMock(stdout="line1\nline2\n")
-            lines = list(executor.fetch_logs("my-job", stream=False, lines=50))
-
-        mock_run.assert_called_once()
-        called_cmd = mock_run.call_args[0][0]
-        assert "--tail" in called_cmd
-        assert "50" in called_cmd
-        label_arg = " ".join(called_cmd)
-        assert "jobset.sigs.k8s.io/jobset-name=my-job" in label_arg
-        assert "-f" not in called_cmd
-        assert lines == ["line1", "line2"]
-
-    def test_fetch_logs_follow(self, executor, mock_k8s_clients):
+            list(executor.fetch_logs("my-job", stream=False, lines=50))
+
+        # the kubectl logs call (distinct from the pod-index lookup) targets the
+        # jobset and does not follow.
+        log_cmd = next(c.args[0] for c in mock_run.call_args_list if "logs" in c.args[0])
+        assert "jobset.sigs.k8s.io/jobset-name=my-job" in " ".join(log_cmd)
+        assert "--tail" in log_cmd and "-f" not in log_cmd
+        # every rank is persisted to the all-ranks log
+        assert (tmp_path / "log-allranks_0.out").read_text() == "line1\nline2\n"
+
+    def test_fetch_logs_follow(self, executor, mock_k8s_clients, tmp_path):
         import io
 
+        executor.job_dir = str(tmp_path)
         mock_proc = MagicMock()
         mock_proc.stdout = io.StringIO("line1\nline2\n")
         mock_proc.poll.return_value = None  # still running; loop exits when readline() hits EOF
 
-        with patch("subprocess.Popen", return_value=mock_proc) as mock_popen:
-            lines = list(executor.fetch_logs("my-job", stream=True, lines=100))
+        with (
+            patch("subprocess.Popen", return_value=mock_proc) as mock_popen,
+            patch("time.sleep"),
+            patch.object(executor, "status", return_value=KubeflowJobState.SUCCEEDED),
+        ):
+            list(executor.fetch_logs("my-job", stream=True, lines=100))
 
-        mock_popen.assert_called_once()
-        called_cmd = mock_popen.call_args[0][0]
-        assert "-f" in called_cmd
-        assert lines == ["line1\n", "line2\n"]
+        assert "-f" in mock_popen.call_args.args[0]
+        # every rank is persisted to the all-ranks log
+        assert (tmp_path / "log-allranks_0.out").read_text() == "line1\nline2\n"
 
     def test_status_unknown_when_empty(self, mock_k8s_clients):
         mock_custom, _ = mock_k8s_clients
@@ -473,10 +484,14 @@ def test_pull_results_syncs_from_pvc(self, workdir_executor, mock_k8s_clients):
         mock_core.create_namespaced_pod.assert_called_once()
         assert mock_check_call.call_count == 1  # kubectl cp only (no mkdir for pull)
         cp_args = mock_check_call.call_args[0][0]
-        # kubectl cp <ns>/<pod>:<remote> <local>
+        # kubectl cp <ns>/<pod>:<remote> <local>; the data-mover pod is named off the
+        # <base>-<uuid6> TrainJob name.
         assert "kubectl" in cp_args
         assert "cp" in cp_args
-        assert f"test-job-data-mover:{workdir_executor.code_dir}" in cp_args
+        dm = next(a for a in cp_args if "-data-mover:" in a)
+        assert dm.startswith("test-job-") and dm.endswith(
+            f"-data-mover:{workdir_executor.code_dir}"
+        )
 
     def test_pull_results_noop_without_workdir_pvc(self, mock_k8s_clients):
         e = KubeflowExecutor(image="test:latest")
@@ -590,11 +605,14 @@ def test_launch_wait_exits_on_failed(self, executor, mock_k8s_clients):
 
     # ── fetch_logs streaming: retry until terminal state ─────────────────────
 
-    def test_fetch_logs_stream_retries_until_terminal_state(self, executor, mock_k8s_clients):
+    def test_fetch_logs_stream_retries_until_terminal_state(
+        self, executor, mock_k8s_clients, tmp_path
+    ):
         """First Popen yields nothing and job is RUNNING; second yields a line and job is
         SUCCEEDED — loop exits on terminal status."""
         import io
 
+        executor.job_dir = str(tmp_path)
         empty_proc = MagicMock()
         empty_proc.stdout = io.StringIO("")
         empty_proc.poll.return_value = None
@@ -607,6 +625,8 @@ def test_fetch_logs_stream_retries_until_terminal_state(self, executor, mock_k8s
 
         with (
             patch("subprocess.Popen", side_effect=[empty_proc, output_proc]),
+            # no pods listable -> the rank-resolve barrier is a no-op (hermetic: no real kubectl)
+            patch("subprocess.run", return_value=MagicMock(stdout='{"items": []}')),
             patch("time.sleep"),
             patch.object(
                 executor,
@@ -614,13 +634,15 @@ def test_fetch_logs_stream_retries_until_terminal_state(self, executor, mock_k8s
                 side_effect=[KubeflowJobState.RUNNING, KubeflowJobState.SUCCEEDED],
             ),
         ):
-            lines = list(executor.fetch_logs("my-job", stream=True))
+            list(executor.fetch_logs("my-job", stream=True))
 
-        assert "some output\n" in lines
+        # forwarded stdout is rank-0/last only, but every rank lands in the all-ranks log
+        assert "some output" in (tmp_path / "log-allranks_0.out").read_text()
 
-    def test_fetch_logs_stream_handles_exception(self, executor, mock_k8s_clients):
+    def test_fetch_logs_stream_handles_exception(self, executor, mock_k8s_clients, tmp_path):
         """Exception inside the readline loop is caught; loop exits when job is terminal."""
 
+        executor.job_dir = str(tmp_path)
         mock_proc = MagicMock()
         mock_proc.stdout.readline.side_effect = OSError("read error")
         mock_proc.poll.return_value = None

From 4e9346f5c126f248536063c54b469782afc56e48 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?oliver=20k=C3=B6nig?= <okoenig@nvidia.com>
Date: Mon, 1 Jun 2026 10:50:36 +0000
Subject: [PATCH 19/20] test(kubeflow): cover GROUP_RANK resolution, log
 forwarding, client reload
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Raises codecov/patch on the diff from 58% to ~98% by exercising the
previously-untested branches: GROUP_RANK resolution via worker environ
(incl. the first-attach resolve barrier), rank-0/last-rank forwarding +
reorder buffer, the completion-index fallback, pod-template labels/
annotations, stale kube-client reload, and the status() connection-error
retry path.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Signed-off-by: oliver könig <okoenig@nvidia.com>
---
 test/core/execution/test_kubeflow.py | 164 ++++++++++++++++++++++++++-
 1 file changed, 163 insertions(+), 1 deletion(-)

diff --git a/test/core/execution/test_kubeflow.py b/test/core/execution/test_kubeflow.py
index 4f037a8b..278bc20f 100644
--- a/test/core/execution/test_kubeflow.py
+++ b/test/core/execution/test_kubeflow.py
@@ -18,7 +18,11 @@
 import pytest
 from kubernetes.client.rest import ApiException
 
-from nemo_run.core.execution.kubeflow import KubeflowExecutor, KubeflowJobState
+from nemo_run.core.execution.kubeflow import (
+    _KUBE_CLIENT_REFRESH_SECONDS,
+    KubeflowExecutor,
+    KubeflowJobState,
+)
 
 
 class TestKubeflowExecutor:
@@ -873,3 +877,161 @@ def test_lookup_job_dir_returns_empty_on_exception(self, mock_k8s_clients):
         with patch("nemo_run.config.get_nemorun_home", side_effect=Exception("boom")):
             result = e._lookup_job_dir("test-job")
         assert result == ""
+
+    # ── get_job_body(): pod-template labels / annotations ─────────────────────
+
+    def test_get_trainjob_body_pod_labels_and_annotations(self, mock_k8s_clients):
+        e = KubeflowExecutor(
+            image="test:latest",
+            pod_labels={"nemo-ci/job-id": "42"},
+            pod_annotations={"sidecar.istio.io/inject": "false"},
+        )
+        body = e.get_job_body("pod-labeled", ["echo"])
+        meta = body["spec"]["podTemplateOverrides"][0]["metadata"]
+        assert meta["labels"] == {"nemo-ci/job-id": "42"}
+        assert meta["annotations"] == {"sidecar.istio.io/inject": "false"}
+
+    # ── _maybe_reload_kube_clients(): rebuild after the refresh interval ──────
+
+    def test_maybe_reload_kube_clients_rebuilds_when_stale(self, executor):
+        import time
+
+        # Anchor relative to now (monotonic()'s epoch is arbitrary / uptime-based),
+        # so age exceeds the refresh interval regardless of the runner's uptime.
+        executor._kube_clients_loaded_at = time.monotonic() - (_KUBE_CLIENT_REFRESH_SECONDS + 1)
+        with patch.object(executor, "_load_kube_clients") as mock_reload:
+            _ = executor._custom_objects_api
+        mock_reload.assert_called_once()
+
+    def test_maybe_reload_kube_clients_skips_when_fresh(self, executor):
+        import time
+
+        executor._kube_clients_loaded_at = time.monotonic()
+        with patch.object(executor, "_load_kube_clients") as mock_reload:
+            _ = executor._core_v1_api
+        mock_reload.assert_not_called()
+
+    # ── status(): reload the kube client once on a non-API connection error ───
+
+    def test_status_reloads_kube_client_on_connection_error(self, executor, mock_k8s_clients):
+        mock_custom, _ = mock_k8s_clients
+        mock_custom.get_namespaced_custom_object.side_effect = [
+            RuntimeError("expired client cert"),
+            {"status": {"jobsStatus": [{"active": 3, "ready": 3, "succeeded": 0, "failed": 0}]}},
+        ]
+        with patch.object(executor, "_load_kube_clients") as mock_reload:
+            state = executor.status("my-job")
+        mock_reload.assert_called_once()
+        assert state == KubeflowJobState.RUNNING
+
+    def test_status_returns_none_when_reload_does_not_help(self, executor, mock_k8s_clients):
+        mock_custom, _ = mock_k8s_clients
+        mock_custom.get_namespaced_custom_object.side_effect = RuntimeError("still broken")
+        with patch.object(executor, "_load_kube_clients"):
+            assert executor.status("my-job") is None
+
+    # ── fetch_logs(stream): resolve GROUP_RANK, forward rank-0 + last only ────
+
+    def test_fetch_logs_stream_resolves_group_ranks_and_forwards(
+        self, executor, mock_k8s_clients, tmp_path
+    ):
+        """End-to-end stream: pods resolve their GROUP_RANK from worker environ
+        (after a first empty sweep that exercises the resolve barrier), and only
+        rank-0 + the last global rank reach stdout while every rank is persisted."""
+        import io
+        import json
+
+        executor.job_dir = str(tmp_path)
+        group_rank = {"pod-0": 0, "pod-1": 1, "pod-2": 2}
+        pods_json = json.dumps(
+            {
+                "items": [
+                    {
+                        "metadata": {
+                            "name": p,
+                            "labels": {"batch.kubernetes.io/job-completion-index": str(i)},
+                        }
+                    }
+                    for i, p in enumerate(group_rank)
+                ]
+            }
+        )
+        exec_calls = {"n": 0}
+
+        def fake_run(cmd, *args, **kwargs):
+            if "exec" in cmd:
+                exec_calls["n"] += 1
+                if exec_calls["n"] <= len(group_rank):  # first sweep: workers not up yet
+                    return MagicMock(stdout="")
+                return MagicMock(stdout=f"GROUP_RANK={group_rank[cmd[2]]}\n")
+            return MagicMock(stdout=pods_json)  # kubectl get pods -o json
+
+        stream = io.StringIO(
+            "[pod/pod-0/node] 2026-06-01T10:00:01.000000000Z [default0]: rank0-step\n"
+            "[pod/pod-1/node] 2026-06-01T10:00:01.500000000Z [default3]: mid-rank\n"
+            "[pod/pod-2/node] 2026-06-01T10:00:02.000000000Z [default7]: lastrank-step\n"
+            "[pod/pod-0/node] [default0]: no-timestamp-line\n"
+        )
+        proc = MagicMock()
+        proc.stdout = stream
+
+        with (
+            patch("subprocess.run", side_effect=fake_run),
+            patch("subprocess.Popen", return_value=proc),
+            patch("time.sleep"),
+            patch.object(
+                executor,
+                "status",
+                side_effect=[KubeflowJobState.RUNNING, KubeflowJobState.SUCCEEDED],
+            ),
+        ):
+            forwarded = "".join(executor.fetch_logs("my-job", stream=True))
+
+        assert exec_calls["n"] >= 2 * len(group_rank)  # both resolve sweeps ran
+        assert "rank0-step" in forwarded  # GROUP_RANK 0, local 0
+        assert "lastrank-step" in forwarded  # GROUP_RANK 2 (== num_nodes-1), local 7 (== nproc-1)
+        assert "no-timestamp-line" in forwarded  # forwarded immediately (no reorder buffer)
+        assert "mid-rank" not in forwarded  # neither rank-0 nor last rank
+        all_ranks = (tmp_path / "log-allranks_0.out").read_text()
+        for marker in ("rank0-step", "mid-rank", "lastrank-step", "no-timestamp-line"):
+            assert marker in all_ranks
+
+    def test_fetch_logs_no_follow_forwards_rank0_via_completion_index(
+        self, executor, mock_k8s_clients, tmp_path
+    ):
+        """When GROUP_RANK is unreadable, fall back to the completion-index-0 pod
+        for early setup output; the last rank is not forwarded without GROUP_RANK."""
+        import json
+
+        executor.job_dir = str(tmp_path)
+        pods_json = json.dumps(
+            {
+                "items": [
+                    {
+                        "metadata": {
+                            "name": f"pod-{i}",
+                            "labels": {"batch.kubernetes.io/job-completion-index": str(i)},
+                        }
+                    }
+                    for i in range(3)
+                ]
+            }
+        )
+
+        def fake_run(cmd, *args, **kwargs):
+            if "exec" in cmd:
+                return MagicMock(stdout="")  # GROUP_RANK not resolvable → completion-index fallback
+            if "logs" in cmd:
+                return MagicMock(
+                    stdout="[pod/pod-0/node] [default0]: setup-output\n"
+                    "[pod/pod-2/node] [default7]: last-output\n"
+                )
+            return MagicMock(stdout=pods_json)
+
+        with patch("subprocess.run", side_effect=fake_run):
+            forwarded = "".join(executor.fetch_logs("my-job", stream=False))
+
+        assert "setup-output" in forwarded  # completion-index-0 fallback
+        assert "last-output" not in forwarded  # no GROUP_RANK → last rank not forwarded
+        all_ranks = (tmp_path / "log-allranks_0.out").read_text()
+        assert "setup-output" in all_ranks and "last-output" in all_ranks

From 71461c5969b442cad4b5101bf52c82042267768c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?oliver=20k=C3=B6nig?= <okoenig@nvidia.com>
Date: Tue, 2 Jun 2026 07:30:15 +0000
Subject: [PATCH 20/20] feat(kubeflow): add
 copy_to_workspace/copy_from_workspace for arbitrary PVC paths
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

package()/pull_results() already bridge launcher↔PVC via a throw-away data-mover
pod, but only for the per-job code_dir. Downloading results from (or persisting
auxiliary cross-run state to) another path on the volume had no public API.

Add copy_to_workspace(local, remote) and copy_from_workspace(remote, local) that
run the same data-mover against an arbitrary path under workdir_pvc_path, and
refactor package()/pull_results() to delegate to them (behavior unchanged). Tests
cover the happy path, the no-PVC no-op, and pod teardown on copy error.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Signed-off-by: oliver könig <okoenig@nvidia.com>
---
 nemo_run/core/execution/kubeflow.py  | 63 ++++++++++++++++++++++------
 test/core/execution/test_kubeflow.py | 43 +++++++++++++++++++
 2 files changed, 94 insertions(+), 12 deletions(-)

diff --git a/nemo_run/core/execution/kubeflow.py b/nemo_run/core/execution/kubeflow.py
index 8599e612..525a7c0d 100644
--- a/nemo_run/core/execution/kubeflow.py
+++ b/nemo_run/core/execution/kubeflow.py
@@ -937,6 +937,55 @@ def materialize_launch_script(self, cmd: list[str], max_retries: int = 0) -> Non
             f.write(script)
         logger.info("Wrote launch script to %s", launch_script_path)
 
+    def copy_to_workspace(
+        self, local_path: str, remote_path: str, label: str = "datamover"
+    ) -> None:
+        """Copy *local_path* (a directory) to *remote_path* on the workdir PVC.
+
+        Generalizes :meth:`package`'s PVC sync to an arbitrary path on the volume —
+        not just the per-job ``code_dir`` — so callers can persist auxiliary
+        cross-run state (e.g. a metrics cache) anywhere under ``workdir_pvc_path``
+        via the same throw-away data-mover pod. No-op when ``workdir_pvc`` is unset.
+
+        Args:
+            local_path: Local directory whose contents are copied.
+            remote_path: Destination directory on the workdir PVC.
+            label: Disambiguates the data-mover pod name across concurrent transfers.
+        """
+        if not self.workdir_pvc:
+            return
+        pod_name = self._data_mover_pod_name(label)
+        self._start_data_mover_pod(pod_name)
+        try:
+            self._rsync_to_pod(pod_name, local_path, remote_path)
+        finally:
+            self._delete_data_mover_pod(pod_name)
+
+    def copy_from_workspace(
+        self, remote_path: str, local_path: str, label: str = "datamover"
+    ) -> None:
+        """Copy *remote_path* from the workdir PVC to *local_path*.
+
+        Generalizes :meth:`pull_results` to an arbitrary path on the volume — not
+        just the per-job ``code_dir`` — so callers can read auxiliary cross-run
+        state via the same throw-away data-mover pod. No-op when ``workdir_pvc`` is
+        unset. Propagates the underlying ``kubectl cp`` error when *remote_path*
+        does not exist; callers that treat absence as normal should handle it.
+
+        Args:
+            remote_path: Source directory on the workdir PVC.
+            local_path: Local destination directory.
+            label: Disambiguates the data-mover pod name across concurrent transfers.
+        """
+        if not self.workdir_pvc:
+            return
+        pod_name = self._data_mover_pod_name(label)
+        self._start_data_mover_pod(pod_name)
+        try:
+            self._rsync_from_pod(pod_name, remote_path, local_path)
+        finally:
+            self._delete_data_mover_pod(pod_name)
+
     def package(self, packager: Packager, job_name: str) -> None:
         """Sync job_dir to the workdir PVC via a temporary data-mover pod before launch.
 
@@ -963,12 +1012,7 @@ def package(self, packager: Packager, job_name: str) -> None:
         # Sync job_dir to <workdir_pvc_path>/<username>/code on the PVC via a
         # throw-away data-mover pod.  Scoping to a user subdirectory means we
         # never clobber other data already on the shared volume.
-        pod_name = self._data_mover_pod_name(job_name)
-        self._start_data_mover_pod(pod_name)
-        try:
-            self._rsync_to_pod(pod_name, self.job_dir, self.code_dir)
-        finally:
-            self._delete_data_mover_pod(pod_name)
+        self.copy_to_workspace(self.job_dir, self.code_dir, label=job_name)
 
         # Mount the PVC so the training container can reach code_dir.
         # If the PVC is already declared (e.g. explicitly by the caller for data),
@@ -1009,12 +1053,7 @@ def pull_results(self, job_name: str, dest_dir: Optional[str] = None) -> None:
                 "Pass dest_dir explicitly or call via an executor that has job_dir set."
             )
 
-        pod_name = self._data_mover_pod_name(job_name)
-        self._start_data_mover_pod(pod_name)
-        try:
-            self._rsync_from_pod(pod_name, self.code_dir, local_path)
-        finally:
-            self._delete_data_mover_pod(pod_name)
+        self.copy_from_workspace(self.code_dir, local_path, label=job_name)
 
     def _lookup_job_dir(self, job_name: str) -> str:
         """Look up the job_dir saved by the scheduler for *job_name*."""
diff --git a/test/core/execution/test_kubeflow.py b/test/core/execution/test_kubeflow.py
index 278bc20f..8babcf57 100644
--- a/test/core/execution/test_kubeflow.py
+++ b/test/core/execution/test_kubeflow.py
@@ -1035,3 +1035,46 @@ def fake_run(cmd, *args, **kwargs):
         assert "last-output" not in forwarded  # no GROUP_RANK → last rank not forwarded
         all_ranks = (tmp_path / "log-allranks_0.out").read_text()
         assert "setup-output" in all_ranks and "last-output" in all_ranks
+
+    # ── copy_to_workspace / copy_from_workspace (arbitrary-path PVC sync) ─────
+
+    def test_copy_to_workspace_uses_data_mover(self, executor, mock_k8s_clients):
+        executor.workdir_pvc = "model-cache"
+        with (
+            patch.object(executor, "_start_data_mover_pod") as start,
+            patch.object(executor, "_rsync_to_pod") as rsync,
+            patch.object(executor, "_delete_data_mover_pod") as delete,
+        ):
+            executor.copy_to_workspace("/local/dir", "/nemo-workspace/remote", label="x")
+        start.assert_called_once()
+        delete.assert_called_once()
+        assert rsync.call_args.args[1:] == ("/local/dir", "/nemo-workspace/remote")
+
+    def test_copy_from_workspace_uses_data_mover(self, executor, mock_k8s_clients):
+        executor.workdir_pvc = "model-cache"
+        with (
+            patch.object(executor, "_start_data_mover_pod"),
+            patch.object(executor, "_rsync_from_pod") as rsync,
+            patch.object(executor, "_delete_data_mover_pod") as delete,
+        ):
+            executor.copy_from_workspace("/nemo-workspace/remote", "/local/dir")
+        assert rsync.call_args.args[1:] == ("/nemo-workspace/remote", "/local/dir")
+        delete.assert_called_once()
+
+    def test_copy_workspace_noop_without_pvc(self, executor, mock_k8s_clients):
+        executor.workdir_pvc = None
+        with patch.object(executor, "_start_data_mover_pod") as start:
+            executor.copy_to_workspace("/a", "/b")
+            executor.copy_from_workspace("/b", "/a")
+        start.assert_not_called()
+
+    def test_copy_from_workspace_cleans_up_pod_on_error(self, executor, mock_k8s_clients):
+        executor.workdir_pvc = "model-cache"
+        with (
+            patch.object(executor, "_start_data_mover_pod"),
+            patch.object(executor, "_rsync_from_pod", side_effect=RuntimeError("absent")),
+            patch.object(executor, "_delete_data_mover_pod") as delete,
+        ):
+            with pytest.raises(RuntimeError):
+                executor.copy_from_workspace("/nemo-workspace/missing", "/local/dir")
+        delete.assert_called_once()  # pod torn down even when the copy raises