From aa3a357875f830328a416a76fb1b1e758b357334 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sat, 30 May 2026 10:30:40 +0000 Subject: [PATCH 01/20] fix(kubeflow): stream only rank 0 + last rank, write all ranks to disk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit KubeflowExecutor.fetch_logs followed every replica and forwarded all ranks to the caller, so at scale the aggregate output overran CI/runner job-log size limits (a 16-node x 8-GPU run exceeded GitLab's 128MB cap). Now it still tails every rank (kubectl logs -l SELECTOR --prefix --max-log-requests num_nodes) and writes the complete multi-rank output to {job_dir}/log-allranks_0.out, but forwards only global rank 0 (node 0, [default0]) and the last global rank (node num_nodes-1, [default nproc_per_node-1]) to stdout. Downstream log validation that globs log*.out still sees every rank via the on-disk file. Signed-off-by: oliver könig --- nemo_run/core/execution/kubeflow.py | 55 +++++++++++++++++++++++------ 1 file changed, 45 insertions(+), 10 deletions(-) diff --git a/nemo_run/core/execution/kubeflow.py b/nemo_run/core/execution/kubeflow.py index ea0d8cf0..5f87eb97 100644 --- a/nemo_run/core/execution/kubeflow.py +++ b/nemo_run/core/execution/kubeflow.py @@ -16,6 +16,7 @@ import getpass import logging import os +import re import subprocess import time from dataclasses import dataclass, field @@ -331,6 +332,13 @@ def fetch_logs( until pods are running (up to 10 minutes). Otherwise it returns the last *lines* lines from a single ``kubectl logs`` call. """ + # Tail every rank, but forward only global rank 0 and the last global + # rank to the caller (stdout / CI job log). Streaming all ranks at scale + # overruns CI/runner job-log size limits, yet the full multi-rank output + # is still written to /log-allranks_0.out so downstream log + # validation (which globs log*.out) sees every rank. 
--prefix tags each + # line with its pod name so we can recover the node (completion index) + # and pair it with torchrun's [defaultN] local-rank marker. label_selector = f"jobset.sigs.k8s.io/jobset-name={job_name}" cmd = [ "kubectl", @@ -339,11 +347,29 @@ def fetch_logs( label_selector, "-n", self.namespace, + "--prefix", "--tail", str(lines), "--max-log-requests", str(self.num_nodes), ] + last_node = max(self.num_nodes - 1, 0) + last_local = max(self.nproc_per_node() - 1, 0) + node_re = re.compile(r"node-0-(\d+)-") + local_re = re.compile(r"\[default(\d+)\]") + + def _forward_to_stdout(log_line: str) -> bool: + """True for global rank 0 (node 0, local 0) and the last global rank.""" + node_match = node_re.search(log_line) + local_match = local_re.search(log_line) + if not node_match or not local_match: + return False + node, local = int(node_match.group(1)), int(local_match.group(1)) + return (node == 0 and local == 0) or (node == last_node and local == last_local) + + all_ranks_path = os.path.join(self.job_dir, "log-allranks_0.out") + os.makedirs(self.job_dir, exist_ok=True) + if stream: cmd.append("-f") # Retry kubectl logs -f until the job reaches a terminal state. 
@@ -354,16 +380,21 @@ def fetch_logs( ) lines_yielded = 0 try: - for line in iter(proc.stdout.readline, ""): - if line: - lines_yielded += 1 - yield line - if proc.poll() is not None: - for remaining in proc.stdout: - if remaining: + with open(all_ranks_path, "a") as all_ranks_file: + for line in iter(proc.stdout.readline, ""): + if line: + all_ranks_file.write(line) + if _forward_to_stdout(line): lines_yielded += 1 - yield remaining - break + yield line + if proc.poll() is not None: + for remaining in proc.stdout: + if remaining: + all_ranks_file.write(remaining) + if _forward_to_stdout(remaining): + lines_yielded += 1 + yield remaining + break except Exception as e: logger.warning("Error streaming logs: %s; retrying", e) finally: @@ -381,7 +412,11 @@ def fetch_logs( time.sleep(5) else: result = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout) - yield from result.stdout.splitlines() + with open(all_ranks_path, "a") as all_ranks_file: + for line in result.stdout.splitlines(): + all_ranks_file.write(line + "\n") + if _forward_to_stdout(line): + yield line def cancel( self, From 8e1930e2b577ffaac2b244d355aa13537e0e41e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sat, 30 May 2026 12:01:57 +0000 Subject: [PATCH 02/20] fix(kubeflow): resolve last pod via completion-index label + full-history streaming MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fetch_logs identified the last global rank's pod by parsing the pod name and tailed only the last `--tail ` window, so on (re)attach the last rank's mid-run canonical "iteration | lm loss | ..." line (print_rank_last) was dropped — on K8s the job log showed only rank 0's "Step Time" line. 
Resolve the first/last pod from the authoritative batch.kubernetes.io/job-completion-index label (== torchrun PET_NODE_RANK), mapped from the --prefix pod name and refreshed on every (re)connect (gang restarts spawn new pod names), and stream each pod's full history (--tail=-1) so no mid-run line is missed. All ranks are still written to log-allranks_0.out; only global rank 0 and the true last global rank are forwarded to stdout. Co-Authored-By: Claude Opus 4.8 (1M context) Signed-off-by: oliver könig --- nemo_run/core/execution/kubeflow.py | 79 +++++++++++++++++++++++------ 1 file changed, 63 insertions(+), 16 deletions(-) diff --git a/nemo_run/core/execution/kubeflow.py b/nemo_run/core/execution/kubeflow.py index 5f87eb97..76b44d55 100644 --- a/nemo_run/core/execution/kubeflow.py +++ b/nemo_run/core/execution/kubeflow.py @@ -14,6 +14,7 @@ # limitations under the License. import getpass +import json import logging import os import re @@ -332,13 +333,22 @@ def fetch_logs( until pods are running (up to 10 minutes). Otherwise it returns the last *lines* lines from a single ``kubectl logs`` call. """ - # Tail every rank, but forward only global rank 0 and the last global - # rank to the caller (stdout / CI job log). Streaming all ranks at scale - # overruns CI/runner job-log size limits, yet the full multi-rank output - # is still written to /log-allranks_0.out so downstream log - # validation (which globs log*.out) sees every rank. --prefix tags each - # line with its pod name so we can recover the node (completion index) - # and pair it with torchrun's [defaultN] local-rank marker. + # Tail every rank to /log-allranks_0.out (downstream log + # validation globs log*.out and needs every rank), but forward only + # global rank 0 and the *last* global rank to the caller (stdout / CI + # job log) — streaming all ranks at scale overruns CI job-log limits. + # + # Identifying the last global rank requires the authoritative node rank, + # NOT the pod name. 
Kubeflow Trainer binds torchrun's PET_NODE_RANK to + # the indexed-Job completion index, stamped on each pod as the + # `batch.kubernetes.io/job-completion-index` label. So: + # global_rank = job_completion_index * nproc_per_node + local_rank + # `--prefix` tags each line with `[pod//]`; we map that + # pod name → completion index (refreshed on every (re)connect, since a + # gang restart spawns new pod names) and pair it with torchrun's + # `[defaultN]` local-rank marker. `--tail=-1` replays each pod's full + # history on (re)attach so mid-run lines are never dropped (the previous + # `--tail ` snapshot missed the last rank's per-step lines). label_selector = f"jobset.sigs.k8s.io/jobset-name={job_name}" cmd = [ "kubectl", @@ -349,22 +359,57 @@ def fetch_logs( self.namespace, "--prefix", "--tail", - str(lines), + "-1", "--max-log-requests", str(self.num_nodes), ] last_node = max(self.num_nodes - 1, 0) last_local = max(self.nproc_per_node() - 1, 0) - node_re = re.compile(r"node-0-(\d+)-") + pod_re = re.compile(r"pod/([^/]+)/") local_re = re.compile(r"\[default(\d+)\]") - def _forward_to_stdout(log_line: str) -> bool: + def _pod_index_map() -> dict[str, int]: + """Map pod name → job-completion-index (== torchrun node rank).""" + try: + out = subprocess.run( + [ + "kubectl", + "get", + "pods", + "-n", + self.namespace, + "-l", + label_selector, + "-o", + "json", + ], + capture_output=True, + text=True, + timeout=timeout, + ) + items = json.loads(out.stdout).get("items", []) + except Exception as e: + logger.warning("Could not list pods for %s: %s", job_name, e) + return {} + mapping: dict[str, int] = {} + for item in items: + meta = item.get("metadata", {}) + name = meta.get("name") + idx = (meta.get("labels", {}) or {}).get("batch.kubernetes.io/job-completion-index") + if name is not None and idx is not None: + mapping[name] = int(idx) + return mapping + + def _forward_to_stdout(log_line: str, pod_index: dict[str, int]) -> bool: """True for global rank 0 (node 0, 
local 0) and the last global rank.""" - node_match = node_re.search(log_line) + pod_match = pod_re.search(log_line) local_match = local_re.search(log_line) - if not node_match or not local_match: + if not pod_match or not local_match: + return False + node = pod_index.get(pod_match.group(1)) + if node is None: return False - node, local = int(node_match.group(1)), int(local_match.group(1)) + local = int(local_match.group(1)) return (node == 0 and local == 0) or (node == last_node and local == last_local) all_ranks_path = os.path.join(self.job_dir, "log-allranks_0.out") @@ -375,6 +420,7 @@ def _forward_to_stdout(log_line: str) -> bool: # Retry kubectl logs -f until the job reaches a terminal state. # This handles both pods not yet running and transient mid-stream failures. while True: + pod_index = _pod_index_map() proc = subprocess.Popen( cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL, text=True, bufsize=1 ) @@ -384,14 +430,14 @@ def _forward_to_stdout(log_line: str) -> bool: for line in iter(proc.stdout.readline, ""): if line: all_ranks_file.write(line) - if _forward_to_stdout(line): + if _forward_to_stdout(line, pod_index): lines_yielded += 1 yield line if proc.poll() is not None: for remaining in proc.stdout: if remaining: all_ranks_file.write(remaining) - if _forward_to_stdout(remaining): + if _forward_to_stdout(remaining, pod_index): lines_yielded += 1 yield remaining break @@ -411,11 +457,12 @@ def _forward_to_stdout(log_line: str) -> bool: ) time.sleep(5) else: + pod_index = _pod_index_map() result = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout) with open(all_ranks_path, "a") as all_ranks_file: for line in result.stdout.splitlines(): all_ranks_file.write(line + "\n") - if _forward_to_stdout(line): + if _forward_to_stdout(line, pod_index): yield line def cancel( From b2be85c564960afd9d0d5ebdda88727460e245f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sat, 30 May 2026 12:30:21 +0000 Subject: [PATCH 03/20] 
fix(kubeflow): forward by global rank (node_rank*nproc+local), not pod heuristic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Kubeflow Trainer sets torchrun PET_NODE_RANK statically from the JobSet batch.kubernetes.io/job-completion-index, so global_rank = completion_index * nproc_per_node + local_rank. Compute that explicitly and forward only global rank 0 and world_size-1 to stdout (all ranks still go to log-allranks_0.out). Co-Authored-By: Claude Opus 4.8 (1M context) Signed-off-by: oliver könig --- nemo_run/core/execution/kubeflow.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/nemo_run/core/execution/kubeflow.py b/nemo_run/core/execution/kubeflow.py index 76b44d55..33106fc5 100644 --- a/nemo_run/core/execution/kubeflow.py +++ b/nemo_run/core/execution/kubeflow.py @@ -363,8 +363,8 @@ def fetch_logs( "--max-log-requests", str(self.num_nodes), ] - last_node = max(self.num_nodes - 1, 0) - last_local = max(self.nproc_per_node() - 1, 0) + nproc = self.nproc_per_node() + last_rank = max(self.num_nodes * nproc - 1, 0) pod_re = re.compile(r"pod/([^/]+)/") local_re = re.compile(r"\[default(\d+)\]") @@ -401,7 +401,14 @@ def _pod_index_map() -> dict[str, int]: return mapping def _forward_to_stdout(log_line: str, pod_index: dict[str, int]) -> bool: - """True for global rank 0 (node 0, local 0) and the last global rank.""" + """True for the first and last *global rank* only. + + Kubeflow Trainer sets torchrun's PET_NODE_RANK from the JobSet + completion-index label (static), so the global rank is + ``node_rank * nproc_per_node + local_rank`` where node_rank is the + pod's completion index and local_rank is torchrun's ``[defaultN]`` + marker. We forward global rank 0 and ``world_size - 1`` only. 
+ """ pod_match = pod_re.search(log_line) local_match = local_re.search(log_line) if not pod_match or not local_match: @@ -409,8 +416,8 @@ def _forward_to_stdout(log_line: str, pod_index: dict[str, int]) -> bool: node = pod_index.get(pod_match.group(1)) if node is None: return False - local = int(local_match.group(1)) - return (node == 0 and local == 0) or (node == last_node and local == last_local) + global_rank = node * nproc + int(local_match.group(1)) + return global_rank == 0 or global_rank == last_rank all_ranks_path = os.path.join(self.job_dir, "log-allranks_0.out") os.makedirs(self.job_dir, exist_ok=True) From 56bbb4bb1655bf515f607dea40b65b8015c8b684 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sat, 30 May 2026 13:42:21 +0000 Subject: [PATCH 04/20] fix(kubeflow): make TrainJob launch idempotent on 409 conflict MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a TrainJob with the target name already exists, launch() raised and aborted. On CI the name is derived from the experiment id (commit SHA), so a 409 is a stale leftover from a prior attempt the launcher declared FAILED after a slow pod start. That blocked setup_experiment's 'attempt N of M' retry — every retry re-collided. Now launch() deletes the stale job (cancel(wait=True)) and recreates, so the retry can actually recover. 
Co-Authored-By: Claude Opus 4.8 (1M context) Signed-off-by: oliver könig --- nemo_run/core/execution/kubeflow.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/nemo_run/core/execution/kubeflow.py b/nemo_run/core/execution/kubeflow.py index 33106fc5..11a2b5f0 100644 --- a/nemo_run/core/execution/kubeflow.py +++ b/nemo_run/core/execution/kubeflow.py @@ -261,11 +261,25 @@ def launch( body=job_body, ) except ApiException as e: - if e.status == 409: - raise RuntimeError( - f"{_TRAINJOB_KIND} {name} already exists in namespace {self.namespace}" - ) from e - raise + if e.status != 409: + raise + # The job name is derived from the experiment id (the commit SHA on + # CI), so a 409 means a TrainJob from a prior attempt lingers — e.g. + # an attempt the launcher declared FAILED after a slow pod start. + # Delete the stale job and recreate so the caller's retry (such as + # setup_experiment's "attempt N of M") makes progress instead of + # re-colliding on the same name. + logger.warning( + "%s %s already exists; deleting stale job and recreating", _TRAINJOB_KIND, name + ) + self.cancel(name, wait=True) + self._custom_objects_api.create_namespaced_custom_object( + group=_TRAINJOB_GROUP, + version=_TRAINJOB_VERSION, + namespace=self.namespace, + plural=_TRAINJOB_PLURAL, + body=job_body, + ) logger.info("Submitted %s %s to namespace %s", _TRAINJOB_KIND, name, self.namespace) From 3f0f5b419c922f3980bafc68b437f6bd98873dc4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sat, 30 May 2026 22:17:16 +0000 Subject: [PATCH 05/20] fix(kubeflow): reload kube client across cert rotation for long runs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The kubernetes SDK bakes the client cert into its SSLContext at client construction and never re-reads it. 
When credentials come from a rotating source (Teleport tbot refreshing the cert on disk), a KubeflowExecutor created once at launch keeps presenting the original cert until it expires mid-run, so status polls fail with SSLV3_ALERT_CERTIFICATE_EXPIRED once the run outlives the cert TTL (~60 min). Short jobs finish in time; multi-hour jobs go blind. Rebuild the API clients from the on-disk kubeconfig past a refresh interval (below the cert TTL) via lazy properties, and reactively reload+retry once in status() on a non-API connection error. fetch_logs already shells out to kubectl, which re-reads creds per call, so it was unaffected. Co-Authored-By: Claude Opus 4.8 (1M context) Signed-off-by: oliver könig --- nemo_run/core/execution/kubeflow.py | 83 ++++++++++++++++++++++++----- 1 file changed, 70 insertions(+), 13 deletions(-) diff --git a/nemo_run/core/execution/kubeflow.py b/nemo_run/core/execution/kubeflow.py index 11a2b5f0..a122e361 100644 --- a/nemo_run/core/execution/kubeflow.py +++ b/nemo_run/core/execution/kubeflow.py @@ -43,6 +43,14 @@ _TRAINJOB_GROUP = "trainer.kubeflow.org" _TRAINJOB_VERSION = "v1alpha1" _TRAINJOB_PLURAL = "trainjobs" + +# The kubernetes SDK bakes the client cert into its SSLContext at construction +# time and never re-reads it. When credentials come from a rotating source +# (e.g. a Teleport tbot that refreshes the cert on disk), a long-running client +# keeps presenting the original cert until it expires mid-run. Rebuilding the +# API clients from the on-disk kubeconfig more frequently than the cert TTL +# keeps a long run (multi-hour jobs) authenticated across rotations. +_KUBE_CLIENT_REFRESH_SECONDS = 1500 _TRAINJOB_KIND = "TrainJob" @@ -109,6 +117,16 @@ def __post_init__(self): "kubernetes package is required for KubeflowExecutor. " "Install it with: pip install nemo-run[kubeflow]" ) + self._load_kube_clients() + + def _load_kube_clients(self) -> None: + """(Re)load the kubeconfig from disk and rebuild the API clients. 
+ + Called at init and again whenever the cached clients age past + ``_KUBE_CLIENT_REFRESH_SECONDS`` (see the module constant) so that a + rotating client cert (e.g. a Teleport tbot refreshing it on disk) is + picked up before the in-memory cert expires. + """ try: config.load_kube_config() except Exception as original_exc: @@ -116,8 +134,27 @@ def __post_init__(self): config.load_incluster_config() except Exception: raise original_exc - self._custom_objects_api = client.CustomObjectsApi() - self._core_v1_api = client.CoreV1Api() + self._co_api = client.CustomObjectsApi() + self._cv_api = client.CoreV1Api() + self._kube_clients_loaded_at = time.monotonic() + + def _maybe_reload_kube_clients(self) -> None: + """Rebuild the API clients if they are older than the refresh interval.""" + age = time.monotonic() - getattr(self, "_kube_clients_loaded_at", 0.0) + if age >= _KUBE_CLIENT_REFRESH_SECONDS: + self._load_kube_clients() + + @property + def _custom_objects_api(self): + """CustomObjectsApi client, transparently refreshed across cert rotations.""" + self._maybe_reload_kube_clients() + return self._co_api + + @property + def _core_v1_api(self): + """CoreV1Api client, transparently refreshed across cert rotations.""" + self._maybe_reload_kube_clients() + return self._cv_api # ── Executor interface ──────────────────────────────────────────────────── @@ -306,18 +343,38 @@ def launch( def status(self, job_name: str) -> Optional[KubeflowJobState]: """Return the current state of *job_name*, or ``None`` if it no longer exists.""" - try: - resp = self._custom_objects_api.get_namespaced_custom_object( - group=_TRAINJOB_GROUP, - version=_TRAINJOB_VERSION, - namespace=self.namespace, - plural=_TRAINJOB_PLURAL, - name=job_name, - ) - except ApiException as e: - if e.status == 404: + resp = None + for attempt in range(2): + try: + resp = self._custom_objects_api.get_namespaced_custom_object( + group=_TRAINJOB_GROUP, + version=_TRAINJOB_VERSION, + namespace=self.namespace, + 
plural=_TRAINJOB_PLURAL, + name=job_name, + ) + break + except ApiException as e: + if e.status == 404: + return None + logger.warning("API error getting status for %s: %s", job_name, e) + return None + except Exception as e: + # Not an API-level error — most likely an expired client cert + # (tbot rotated it on disk but the SDK cached the old one) or a + # transient connection error. Force a client reload from the + # freshly-rotated kubeconfig and retry once. + if attempt == 0: + logger.warning( + "Connection error getting status for %s (%s); reloading kube client", + job_name, + e, + ) + self._load_kube_clients() + continue + logger.warning("Status check for %s failed after client reload: %s", job_name, e) return None - logger.warning("API error getting status for %s: %s", job_name, e) + if resp is None: return None job_status = resp.get("status", {}) From b597e6cf98ce2bdc842dd298aa32b8851ead6151 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sat, 30 May 2026 22:30:10 +0000 Subject: [PATCH 06/20] fix(kubeflow): scope code_dir per job to avoid concurrent clobber MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit code_dir was scoped only per user ({workdir_pvc_path}/{user}/code), but package() rsyncs each job's job_dir into it. Two concurrent jobs from the same user (e.g. parallel CI test cases) therefore overwrite each other's launcher code mid-run. Scope it per job ({workdir_pvc_path}/{user}/{experiment_id}/{job_name}/code), matching how dgxcloud/lepton mirror job_dir into a per-job PVC subdir and how slurm keys packaging by experiment_id:job_name. 
Co-Authored-By: Claude Opus 4.8 (1M context) Signed-off-by: oliver könig --- nemo_run/core/execution/kubeflow.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/nemo_run/core/execution/kubeflow.py b/nemo_run/core/execution/kubeflow.py index a122e361..3e9bfee5 100644 --- a/nemo_run/core/execution/kubeflow.py +++ b/nemo_run/core/execution/kubeflow.py @@ -173,10 +173,18 @@ def nnodes(self) -> int: def code_dir(self) -> str: """Subdirectory on the PVC where user code (launch.sh, scripts) is synced. - Scoped to ``//code`` so multiple users sharing - the same PVC never clobber each other's files. + Scoped to ``////code`` + so that neither multiple users *nor* multiple concurrent jobs from the + same user clobber each other's launcher code on a shared PVC — each + ``package()`` rsyncs its ``job_dir`` here, so an unscoped path lets a + second job overwrite the first job's code mid-run. Falls back to a bare + ``/code`` only before the executor is assigned to a task. """ - return f"{self.workdir_pvc_path.rstrip('/')}/{getpass.getuser()}/code" + parts = [ + p for p in (getattr(self, "experiment_id", None), getattr(self, "job_name", None)) if p + ] + scope = "/".join([getpass.getuser(), *parts]) + return f"{self.workdir_pvc_path.rstrip('/')}/{scope}/code" def nproc_per_node(self) -> int: """Return processes per node: nprocs_per_node → gpus_per_node → 1.""" From 636ec9917bd41d5ed16f2de377cfafcffab05aad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sat, 30 May 2026 23:00:21 +0000 Subject: [PATCH 07/20] fix(kubeflow): unique TrainJob name + forward all ranks (deduped) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - TrainJob name is now - (RFC-1123, <=33 chars) via the new train_job_basename field, decoupled from the experiment name. 
The uuid makes every launch unique, so concurrent/retried jobs never collide on the API server (the descriptive experiment name is intentionally non-unique). - fetch_logs now forwards every rank to stdout, de-duplicated: torchrun runs the same entrypoint on all ranks so startup/config/NCCL lines are identical; we strip the per-rank [pod/...]/[defaultN] markers and forward each distinct message once. This stops dropping the per-step loss line and wandb URL, which Megatron emits from a single layout-dependent rank (neither rank 0 nor last). The full per-rank stream still goes to log-allranks_0.out untouched. Co-Authored-By: Claude Opus 4.8 (1M context) Signed-off-by: oliver könig --- nemo_run/core/execution/kubeflow.py | 152 ++++++++++++++-------------- 1 file changed, 76 insertions(+), 76 deletions(-) diff --git a/nemo_run/core/execution/kubeflow.py b/nemo_run/core/execution/kubeflow.py index 3e9bfee5..9d0dae47 100644 --- a/nemo_run/core/execution/kubeflow.py +++ b/nemo_run/core/execution/kubeflow.py @@ -14,12 +14,12 @@ # limitations under the License. import getpass -import json import logging import os import re import subprocess import time +import uuid from dataclasses import dataclass, field from enum import Enum from typing import Any, Iterable, Optional @@ -110,6 +110,10 @@ class KubeflowExecutor(Executor): # the PVC sync. Use this to include local scripts/files that are not # generated by the packager (e.g. a hand-written training script). workdir_local_path: Optional[str] = None + # Human-readable base for the generated TrainJob name. The k8s name becomes + # ``-`` (RFC-1123 safe, ≤33 chars); the uuid keeps every + # launch unique. Falls back to the launch ``name`` when unset. 
+ train_job_basename: Optional[str] = None def __post_init__(self): if not _KUBERNETES_AVAILABLE: @@ -281,6 +285,26 @@ def get_job_body(self, name: str, command: list[str]) -> dict: # ── Submit / status / cancel / logs ────────────────────────────────────── + def _trainjob_name(self, fallback: str) -> str: + """RFC-1123 base name ``-`` (≤33 chars), generated once. + + Shared by the TrainJob and its data-mover pod (created in ``package()``, + before ``launch()``) so both are valid, unique per launch — the uuid + avoids API-server collisions — and consistent. The basename is + ``train_job_basename`` (e.g. the model recipe) or the caller's name, + sanitized to lowercase alphanumerics + dashes; capped at 33 chars to + stay under the 63-char label limit with room for the ``-data-mover`` + suffix. + """ + cached = getattr(self, "_k8s_job_name", None) + if cached is not None: + return cached + base = re.sub(r"[^a-z0-9-]+", "-", (self.train_job_basename or fallback or "job").lower()).strip("-") + uid = uuid.uuid4().hex[:6] + base = base[: 33 - len(uid) - 1].strip("-") or "job" + self._k8s_job_name = f"{base}-{uid}" + return self._k8s_job_name + def launch( self, name: str, @@ -295,7 +319,7 @@ def launch( observed ``RUNNING``, ``SUCCEEDED``, or ``FAILED`` state when *wait* is ``True``. Raises ``RuntimeError`` if the job already exists or *timeout* expires. """ - name = name.replace("_", "-").replace(".", "-").lower() + name = self._trainjob_name(name) job_body = self.get_job_body(name, cmd) try: self._custom_objects_api.create_namespaced_custom_object( @@ -413,21 +437,17 @@ def fetch_logs( *lines* lines from a single ``kubectl logs`` call. """ # Tail every rank to /log-allranks_0.out (downstream log - # validation globs log*.out and needs every rank), but forward only - # global rank 0 and the *last* global rank to the caller (stdout / CI - # job log) — streaming all ranks at scale overruns CI job-log limits. 
- # - # Identifying the last global rank requires the authoritative node rank, - # NOT the pod name. Kubeflow Trainer binds torchrun's PET_NODE_RANK to - # the indexed-Job completion index, stamped on each pod as the - # `batch.kubernetes.io/job-completion-index` label. So: - # global_rank = job_completion_index * nproc_per_node + local_rank - # `--prefix` tags each line with `[pod//]`; we map that - # pod name → completion index (refreshed on every (re)connect, since a - # gang restart spawns new pod names) and pair it with torchrun's - # `[defaultN]` local-rank marker. `--tail=-1` replays each pod's full - # history on (re)attach so mid-run lines are never dropped (the previous - # `--tail ` snapshot missed the last rank's per-step lines). + # validation globs log*.out and needs every rank). Forward all ranks to + # the caller (stdout / CI job log) too, but de-duplicated: torchrun runs + # the same entrypoint on every rank, so the bulk of the volume (startup, + # config dump, NCCL init) is byte-identical across ranks. We forward each + # distinct message once — which is also why the rank-specific loss line + # (emitted by a single, parallelism-layout-dependent rank that is usually + # neither rank 0 nor the last rank) and genuine per-rank errors are no + # longer dropped. `--prefix` tags each line with `[pod//]` + # and torchrun adds `[defaultN]`; both are stripped to form the dedup key. + # `--tail=-1` replays each pod's full history on (re)attach so mid-run + # lines are never dropped. 
label_selector = f"jobset.sigs.k8s.io/jobset-name={job_name}" cmd = [ "kubectl", @@ -442,61 +462,43 @@ def fetch_logs( "--max-log-requests", str(self.num_nodes), ] - nproc = self.nproc_per_node() - last_rank = max(self.num_nodes * nproc - 1, 0) - pod_re = re.compile(r"pod/([^/]+)/") - local_re = re.compile(r"\[default(\d+)\]") - - def _pod_index_map() -> dict[str, int]: - """Map pod name → job-completion-index (== torchrun node rank).""" - try: - out = subprocess.run( - [ - "kubectl", - "get", - "pods", - "-n", - self.namespace, - "-l", - label_selector, - "-o", - "json", - ], - capture_output=True, - text=True, - timeout=timeout, - ) - items = json.loads(out.stdout).get("items", []) - except Exception as e: - logger.warning("Could not list pods for %s: %s", job_name, e) - return {} - mapping: dict[str, int] = {} - for item in items: - meta = item.get("metadata", {}) - name = meta.get("name") - idx = (meta.get("labels", {}) or {}).get("batch.kubernetes.io/job-completion-index") - if name is not None and idx is not None: - mapping[name] = int(idx) - return mapping - - def _forward_to_stdout(log_line: str, pod_index: dict[str, int]) -> bool: - """True for the first and last *global rank* only. - - Kubeflow Trainer sets torchrun's PET_NODE_RANK from the JobSet - completion-index label (static), so the global rank is - ``node_rank * nproc_per_node + local_rank`` where node_rank is the - pod's completion index and local_rank is torchrun's ``[defaultN]`` - marker. We forward global rank 0 and ``world_size - 1`` only. - """ - pod_match = pod_re.search(log_line) - local_match = local_re.search(log_line) - if not pod_match or not local_match: + # Collapse the near-simultaneous cross-rank burst with a *sliding time + # window* (cf. ClusterShell `clush -b`, which gathers identical output + # across nodes into one line). 
torchrun runs the same entrypoint on + # every rank, so startup/config/NCCL lines arrive as a burst of + # byte-identical copies; we strip the per-rank `[pod//]` + # and `[defaultN]` markers to form a dedup key and suppress a key only + # if an identical line was already forwarded within `dedup_window_s`. + # Unlike a global set this is bounded in both memory and time: a line + # that legitimately recurs later (e.g. a periodic "saving checkpoint") + # is forwarded again once the window passes, and a continuously + # repeating line is rate-limited to once per window rather than + # suppressed for the whole run. Lines whose body differs across ranks + # (per-step loss, `[rankN]` errors) keep distinct keys and are never + # collapsed. The full per-rank stream still goes to log-allranks_0.out. + rank_marker_re = re.compile(r"\[pod/[^\]]+\]\s*|\[default\d+\]:?\s*") + dedup_window_s = 60.0 + last_forwarded: dict[str, float] = {} + + def _should_forward(log_line: str) -> bool: + key = rank_marker_re.sub("", log_line).strip() + if not key: + # Blank / prefix-only line: no content, so don't forward it to + # the CI log (every rank emits these; they're pure noise). The + # full per-rank stream still captures them in log-allranks_0.out. return False - node = pod_index.get(pod_match.group(1)) - if node is None: + now = time.monotonic() + prev = last_forwarded.get(key) + if prev is not None and now - prev < dedup_window_s: return False - global_rank = node * nproc + int(local_match.group(1)) - return global_rank == 0 or global_rank == last_rank + last_forwarded[key] = now + # Bound memory: once the map is large, drop keys older than the + # window (they can no longer suppress anything). 
+ if len(last_forwarded) > 20000: + stale = now - dedup_window_s + for k in [k for k, t in last_forwarded.items() if t < stale]: + del last_forwarded[k] + return True all_ranks_path = os.path.join(self.job_dir, "log-allranks_0.out") os.makedirs(self.job_dir, exist_ok=True) @@ -506,7 +508,6 @@ def _forward_to_stdout(log_line: str, pod_index: dict[str, int]) -> bool: # Retry kubectl logs -f until the job reaches a terminal state. # This handles both pods not yet running and transient mid-stream failures. while True: - pod_index = _pod_index_map() proc = subprocess.Popen( cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL, text=True, bufsize=1 ) @@ -516,14 +517,14 @@ def _forward_to_stdout(log_line: str, pod_index: dict[str, int]) -> bool: for line in iter(proc.stdout.readline, ""): if line: all_ranks_file.write(line) - if _forward_to_stdout(line, pod_index): + if _should_forward(line): lines_yielded += 1 yield line if proc.poll() is not None: for remaining in proc.stdout: if remaining: all_ranks_file.write(remaining) - if _forward_to_stdout(remaining, pod_index): + if _should_forward(remaining): lines_yielded += 1 yield remaining break @@ -543,12 +544,11 @@ def _forward_to_stdout(log_line: str, pod_index: dict[str, int]) -> bool: ) time.sleep(5) else: - pod_index = _pod_index_map() result = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout) with open(all_ranks_path, "a") as all_ranks_file: for line in result.stdout.splitlines(): all_ranks_file.write(line + "\n") - if _forward_to_stdout(line, pod_index): + if _should_forward(line): yield line def cancel( @@ -614,7 +614,7 @@ def cancel( # ── Workdir sync helpers ────────────────────────────────────────────────── def _data_mover_pod_name(self, job_name: str) -> str: - return f"{job_name}-data-mover" + return f"{self._trainjob_name(job_name)}-data-mover" def _start_data_mover_pod(self, pod_name: str, timeout: int = 120) -> None: """Spin up a throw-away Alpine pod that mounts workdir_pvc and blocks 
until Running. From 7af08d28131cb52bb30b82fd16287c23aa291d1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sun, 31 May 2026 00:09:18 +0000 Subject: [PATCH 08/20] fix(kubeflow): stream logs once, not per replica MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit torchx calls scheduler.log_iter(app_id, role_name, k=...) once per replica (k = 0..num_nodes-1). The Kubeflow log_iter ignored k and re-ran fetch_logs — which tails the entire jobset via the jobset-name selector — for every replica, producing N independent tail streams (each with its own dedup state) and N-fold-duplicating every console line (prefixed role/k). At 16 nodes that's 16x the log volume, which also overruns the CI job-log limit on long runs. Stream only for k == 0; that single tail already covers all ranks (and writes log-allranks_0.out once). Co-Authored-By: Claude Opus 4.8 (1M context) Signed-off-by: oliver könig --- nemo_run/run/torchx_backend/schedulers/kubeflow.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/nemo_run/run/torchx_backend/schedulers/kubeflow.py b/nemo_run/run/torchx_backend/schedulers/kubeflow.py index be6e9db8..67d3ccf1 100644 --- a/nemo_run/run/torchx_backend/schedulers/kubeflow.py +++ b/nemo_run/run/torchx_backend/schedulers/kubeflow.py @@ -188,6 +188,14 @@ def log_iter( if not executor: return [] + # fetch_logs tails ALL pods of the jobset in a single call (it powers the + # log-allranks_0.out capture and the cross-rank dedup). torchx invokes + # log_iter once per replica (k = 0..num_nodes-1); streaming on every k + # would re-tail the whole jobset N times — each tail with its own dedup + # state — and N×-duplicate every console line. Stream only for k == 0. 
+ if k != 0: + return [] + logs = executor.fetch_logs(job_name=job_name, stream=should_tail) if isinstance(logs, str): if len(logs) == 0: From 22168ed13ebb9c8d674b3327f3aae6d9580c07ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sun, 31 May 2026 00:22:08 +0000 Subject: [PATCH 09/20] fix(kubeflow): forward rank 0 + last rank to stdout (not all-ranks dedup) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Revert the all-ranks sliding-window dedup back to forwarding only global rank 0 (setup/config) and the last global rank (print_rank_last per-step loss), like a SLURM job log. The last rank is resolved at stream time from each pod's batch.kubernetes.io/job-completion-index label (== torchrun --node-rank $PET_NODE_RANK), so global_rank = completion_index * nproc_per_node + local_rank is deterministic without any topology enforcement. The full per-rank stream is still captured in log-allranks_0.out. Combined with the per-replica log_iter guard, this stops the N-fold duplication and yields a clean two-rank console. Co-Authored-By: Claude Opus 4.8 (1M context) Signed-off-by: oliver könig --- nemo_run/core/execution/kubeflow.py | 91 ++++++++++++++++++----------- 1 file changed, 56 insertions(+), 35 deletions(-) diff --git a/nemo_run/core/execution/kubeflow.py b/nemo_run/core/execution/kubeflow.py index 9d0dae47..1e5e25f8 100644 --- a/nemo_run/core/execution/kubeflow.py +++ b/nemo_run/core/execution/kubeflow.py @@ -14,6 +14,7 @@ # limitations under the License. import getpass +import json import logging import os import re @@ -437,17 +438,20 @@ def fetch_logs( *lines* lines from a single ``kubectl logs`` call. """ # Tail every rank to /log-allranks_0.out (downstream log - # validation globs log*.out and needs every rank). 
Forward all ranks to - # the caller (stdout / CI job log) too, but de-duplicated: torchrun runs - # the same entrypoint on every rank, so the bulk of the volume (startup, - # config dump, NCCL init) is byte-identical across ranks. We forward each - # distinct message once — which is also why the rank-specific loss line - # (emitted by a single, parallelism-layout-dependent rank that is usually - # neither rank 0 nor the last rank) and genuine per-rank errors are no - # longer dropped. `--prefix` tags each line with `[pod//]` - # and torchrun adds `[defaultN]`; both are stripped to form the dedup key. - # `--tail=-1` replays each pod's full history on (re)attach so mid-run - # lines are never dropped. + # validation globs log*.out and needs every rank), but forward only + # global rank 0 and the *last* global rank to the caller (stdout / CI + # job log) — rank 0 carries setup/config, the last rank carries + # Megatron's print_rank_last per-step loss/throughput. + # + # The last global rank is resolved at stream time, not the pod name: + # the trainer runs `torchrun --node-rank $PET_NODE_RANK`, and the runtime + # sets PET_NODE_RANK = the pod's `batch.kubernetes.io/job-completion-index` + # label, so global_rank = completion_index * nproc_per_node + local_rank + # is deterministic. `--prefix` tags each line `[pod//]`; we + # map that pod name → completion index (re-read on every (re)connect, since + # a gang restart spawns new pod names) and pair it with torchrun's + # `[defaultN]` local-rank marker. `--tail=-1` replays each pod's full + # history on (re)attach so mid-run lines are never dropped. label_selector = f"jobset.sigs.k8s.io/jobset-name={job_name}" cmd = [ "kubectl", @@ -476,29 +480,44 @@ def fetch_logs( # suppressed for the whole run. Lines whose body differs across ranks # (per-step loss, `[rankN]` errors) keep distinct keys and are never # collapsed. The full per-rank stream still goes to log-allranks_0.out. 
- rank_marker_re = re.compile(r"\[pod/[^\]]+\]\s*|\[default\d+\]:?\s*") - dedup_window_s = 60.0 - last_forwarded: dict[str, float] = {} - - def _should_forward(log_line: str) -> bool: - key = rank_marker_re.sub("", log_line).strip() - if not key: - # Blank / prefix-only line: no content, so don't forward it to - # the CI log (every rank emits these; they're pure noise). The - # full per-rank stream still captures them in log-allranks_0.out. + nproc = self.nproc_per_node() + last_rank = max(self.num_nodes * nproc - 1, 0) + pod_re = re.compile(r"pod/([^/]+)/") + local_re = re.compile(r"\[default(\d+)\]") + + def _pod_index_map() -> dict[str, int]: + """Map pod name → job-completion-index (== torchrun node rank).""" + try: + out = subprocess.run( + ["kubectl", "get", "pods", "-n", self.namespace, "-l", label_selector, "-o", "json"], + capture_output=True, + text=True, + timeout=timeout, + ) + items = json.loads(out.stdout).get("items", []) + except Exception as e: + logger.warning("Could not list pods for %s: %s", job_name, e) + return {} + mapping: dict[str, int] = {} + for item in items: + meta = item.get("metadata", {}) + name = meta.get("name") + idx = (meta.get("labels", {}) or {}).get("batch.kubernetes.io/job-completion-index") + if name is not None and idx is not None: + mapping[name] = int(idx) + return mapping + + def _forward_to_stdout(log_line: str, pod_index: dict[str, int]) -> bool: + """True for global rank 0 and the last global rank only.""" + pod_match = pod_re.search(log_line) + local_match = local_re.search(log_line) + if not pod_match or not local_match: return False - now = time.monotonic() - prev = last_forwarded.get(key) - if prev is not None and now - prev < dedup_window_s: + node = pod_index.get(pod_match.group(1)) + if node is None: return False - last_forwarded[key] = now - # Bound memory: once the map is large, drop keys older than the - # window (they can no longer suppress anything). 
- if len(last_forwarded) > 20000: - stale = now - dedup_window_s - for k in [k for k, t in last_forwarded.items() if t < stale]: - del last_forwarded[k] - return True + global_rank = node * nproc + int(local_match.group(1)) + return global_rank == 0 or global_rank == last_rank all_ranks_path = os.path.join(self.job_dir, "log-allranks_0.out") os.makedirs(self.job_dir, exist_ok=True) @@ -508,6 +527,7 @@ def _should_forward(log_line: str) -> bool: # Retry kubectl logs -f until the job reaches a terminal state. # This handles both pods not yet running and transient mid-stream failures. while True: + pod_index = _pod_index_map() proc = subprocess.Popen( cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL, text=True, bufsize=1 ) @@ -517,14 +537,14 @@ def _should_forward(log_line: str) -> bool: for line in iter(proc.stdout.readline, ""): if line: all_ranks_file.write(line) - if _should_forward(line): + if _forward_to_stdout(line, pod_index): lines_yielded += 1 yield line if proc.poll() is not None: for remaining in proc.stdout: if remaining: all_ranks_file.write(remaining) - if _should_forward(remaining): + if _forward_to_stdout(remaining, pod_index): lines_yielded += 1 yield remaining break @@ -544,11 +564,12 @@ def _should_forward(log_line: str) -> bool: ) time.sleep(5) else: + pod_index = _pod_index_map() result = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout) with open(all_ranks_path, "a") as all_ranks_file: for line in result.stdout.splitlines(): all_ranks_file.write(line + "\n") - if _should_forward(line): + if _forward_to_stdout(line, pod_index): yield line def cancel( From c0d800d034a295171332056eb32aaeb9ec48e25f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sun, 31 May 2026 00:30:38 +0000 Subject: [PATCH 10/20] fix(kubeflow): forward rank-0 + the actual loss-rank slot to stdout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The c10d rendezvous assigns torch ranks by join 
order, not by JobSet completion-index, so torch's world_size-1 (print_rank_last's loss line) does NOT land on the highest completion-index. Verified on a live 16-node job: the loss prints on completion-index 9 (= num_nodes//2 + 1), local rank nproc-1 — not index 15. Forward exactly (index 0, local 0) and (index num_nodes//2 + 1, local nproc-1) so the console shows rank 0 setup + the per-step loss/throughput. Full per-rank capture remains in log-allranks_0.out. A deterministic completion-index->rank mapping (topology/static rank ordering) would let us compute this rather than match the observed slot. Co-Authored-By: Claude Opus 4.8 (1M context) Signed-off-by: oliver könig --- nemo_run/core/execution/kubeflow.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/nemo_run/core/execution/kubeflow.py b/nemo_run/core/execution/kubeflow.py index 1e5e25f8..68dcb094 100644 --- a/nemo_run/core/execution/kubeflow.py +++ b/nemo_run/core/execution/kubeflow.py @@ -481,7 +481,6 @@ def fetch_logs( # (per-step loss, `[rankN]` errors) keep distinct keys and are never # collapsed. The full per-rank stream still goes to log-allranks_0.out. nproc = self.nproc_per_node() - last_rank = max(self.num_nodes * nproc - 1, 0) pod_re = re.compile(r"pod/([^/]+)/") local_re = re.compile(r"\[default(\d+)\]") @@ -507,8 +506,20 @@ def _pod_index_map() -> dict[str, int]: mapping[name] = int(idx) return mapping + # Two ranks worth surfacing to the CI console: rank 0 (setup/config) at + # completion-index 0 / local 0, and the rank that emits print_rank_last's + # per-step loss/throughput. The c10d rendezvous does NOT map completion + # index to torch rank identically (it assigns by join order), so torch's + # world_size-1 does not land on the highest completion-index. Empirically + # on this JobSet it lands on completion-index `num_nodes//2 + 1`, local + # rank `nproc-1` (e.g. 16 nodes → index 9; default7 on 8-GPU, default3 on + # 4-GPU). 
Match those two slots directly. (A deterministic completion + # index→rank mapping — e.g. topology-aware/static rank ordering — would + # let us compute this instead of relying on the observed slot.) + last_node = self.num_nodes // 2 + 1 + def _forward_to_stdout(log_line: str, pod_index: dict[str, int]) -> bool: - """True for global rank 0 and the last global rank only.""" + """True only for (index 0, local 0) and (index num_nodes//2+1, local nproc-1).""" pod_match = pod_re.search(log_line) local_match = local_re.search(log_line) if not pod_match or not local_match: @@ -516,8 +527,8 @@ def _forward_to_stdout(log_line: str, pod_index: dict[str, int]) -> bool: node = pod_index.get(pod_match.group(1)) if node is None: return False - global_rank = node * nproc + int(local_match.group(1)) - return global_rank == 0 or global_rank == last_rank + local = int(local_match.group(1)) + return (node == 0 and local == 0) or (node == last_node and local == nproc - 1) all_ranks_path = os.path.join(self.job_dir, "log-allranks_0.out") os.makedirs(self.job_dir, exist_ok=True) From e8a64f5951002215b1f264df119c745792821b63 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sun, 31 May 2026 09:44:48 +0000 Subject: [PATCH 11/20] fix(kubeflow): robust log streaming across pod/container restarts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fetch_logs ran a single 'kubectl logs -l -f --max-log-requests '. That follow only attaches to pods present at start, never re-attaches to a container that restarts, and --max-log-requests == pod count has no headroom — so a gang/NCCL-init restart that transiently doubled the matching-pod count errored the whole command ('maximum allowed concurrency') and silently dropped pods. Observed: a 16-node job streamed only node-0-0; the loss rank (node-0-9) never appeared even though it was emitting per-step loss. - --max-log-requests = max(num_nodes*2, 8): headroom for restart-transient pods. 
- Periodically re-attach (threading.Timer terminates the follow every 120s) so pods that (re)started after the initial attach are picked up. - Resume reconnects with --since-time (via --timestamps), tracking the max RFC3339 stamp, so re-attaching never replays already-emitted history; only the first attach uses --tail=-1. The kubectl timestamp is stripped from each line. Co-Authored-By: Claude Opus 4.8 (1M context) Signed-off-by: oliver könig --- nemo_run/core/execution/kubeflow.py | 106 +++++++++++++++------------- 1 file changed, 55 insertions(+), 51 deletions(-) diff --git a/nemo_run/core/execution/kubeflow.py b/nemo_run/core/execution/kubeflow.py index 68dcb094..745a9ba9 100644 --- a/nemo_run/core/execution/kubeflow.py +++ b/nemo_run/core/execution/kubeflow.py @@ -19,6 +19,7 @@ import os import re import subprocess +import threading import time import uuid from dataclasses import dataclass, field @@ -446,14 +447,22 @@ def fetch_logs( # The last global rank is resolved at stream time, not the pod name: # the trainer runs `torchrun --node-rank $PET_NODE_RANK`, and the runtime # sets PET_NODE_RANK = the pod's `batch.kubernetes.io/job-completion-index` - # label, so global_rank = completion_index * nproc_per_node + local_rank - # is deterministic. `--prefix` tags each line `[pod//]`; we - # map that pod name → completion index (re-read on every (re)connect, since - # a gang restart spawns new pod names) and pair it with torchrun's - # `[defaultN]` local-rank marker. `--tail=-1` replays each pod's full - # history on (re)attach so mid-run lines are never dropped. + # label. `--prefix` tags each line `[pod//]`; we map that + # pod name → completion index and pair it with torchrun's `[defaultN]` + # local-rank marker (see _forward_to_stdout). 
+ # + # Robust streaming: `kubectl logs -l -f` only follows the pods present + # when it attaches and never re-attaches to a container that restarts, so + # a single long-lived follow silently drops pods after a gang/NCCL-init + # restart. We therefore (a) give --max-log-requests headroom so a restart + # that transiently doubles the matching-pod count can't error the whole + # command ("maximum allowed concurrency"), and (b) periodically re-attach. + # To avoid re-emitting the full history on every re-attach, reconnects + # resume via `--since-time` (with `--timestamps`); only the first attach + # uses `--tail=-1` to capture pre-existing history. label_selector = f"jobset.sigs.k8s.io/jobset-name={job_name}" - cmd = [ + max_log_requests = max(self.num_nodes * 2, 8) + base_cmd = [ "kubectl", "logs", "-l", @@ -461,25 +470,21 @@ def fetch_logs( "-n", self.namespace, "--prefix", - "--tail", - "-1", "--max-log-requests", - str(self.num_nodes), + str(max_log_requests), ] - # Collapse the near-simultaneous cross-rank burst with a *sliding time - # window* (cf. ClusterShell `clush -b`, which gathers identical output - # across nodes into one line). torchrun runs the same entrypoint on - # every rank, so startup/config/NCCL lines arrive as a burst of - # byte-identical copies; we strip the per-rank `[pod//]` - # and `[defaultN]` markers to form a dedup key and suppress a key only - # if an identical line was already forwarded within `dedup_window_s`. - # Unlike a global set this is bounded in both memory and time: a line - # that legitimately recurs later (e.g. a periodic "saving checkpoint") - # is forwarded again once the window passes, and a continuously - # repeating line is rate-limited to once per window rather than - # suppressed for the whole run. Lines whose body differs across ranks - # (per-step loss, `[rankN]` errors) keep distinct keys and are never - # collapsed. The full per-rank stream still goes to log-allranks_0.out. 
+ # `--prefix --timestamps` lines look like: + # [pod//] [defaultN]: + # Track the max RFC3339 stamp to resume via --since-time, and strip it so + # downstream sees the original `[pod/...] [defaultN]: `. + ts_re = re.compile(r"^(\[pod/[^\]]+\])\s+(\d{4}-\d\d-\d\dT[\d:.]+Z)\s") + + def _split_ts(line: str) -> tuple[Optional[str], str]: + m = ts_re.match(line) + if not m: + return None, line + return m.group(2), m.group(1) + " " + line[m.end() :] + nproc = self.nproc_per_node() pod_re = re.compile(r"pod/([^/]+)/") local_re = re.compile(r"\[default(\d+)\]") @@ -534,49 +539,48 @@ def _forward_to_stdout(log_line: str, pod_index: dict[str, int]) -> bool: os.makedirs(self.job_dir, exist_ok=True) if stream: - cmd.append("-f") - # Retry kubectl logs -f until the job reaches a terminal state. - # This handles both pods not yet running and transient mid-stream failures. + reattach_interval_s = 120.0 + since_time: Optional[str] = None while True: pod_index = _pod_index_map() + attempt_cmd = base_cmd + ["--timestamps", "-f"] + # First attach replays history (--tail=-1); reconnects resume from + # the last seen timestamp so re-attaching never re-emits old lines. + attempt_cmd += ["--tail", "-1"] if since_time is None else ["--since-time", since_time] proc = subprocess.Popen( - cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL, text=True, bufsize=1 + attempt_cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL, text=True, bufsize=1 ) - lines_yielded = 0 + # Force a periodic re-attach (terminate → reconnect) so pods that + # (re)started after this attach are picked up; --since-time keeps + # the reconnect from replaying history. 
+ reattach_timer = threading.Timer(reattach_interval_s, proc.terminate) + reattach_timer.start() try: with open(all_ranks_path, "a") as all_ranks_file: - for line in iter(proc.stdout.readline, ""): - if line: - all_ranks_file.write(line) - if _forward_to_stdout(line, pod_index): - lines_yielded += 1 - yield line - if proc.poll() is not None: - for remaining in proc.stdout: - if remaining: - all_ranks_file.write(remaining) - if _forward_to_stdout(remaining, pod_index): - lines_yielded += 1 - yield remaining - break + for raw in iter(proc.stdout.readline, ""): + if not raw: + continue + ts, line = _split_ts(raw) + if ts is not None and (since_time is None or ts > since_time): + since_time = ts + all_ranks_file.write(line) + if _forward_to_stdout(line, pod_index): + yield line except Exception as e: logger.warning("Error streaming logs: %s; retrying", e) finally: + reattach_timer.cancel() proc.terminate() proc.wait(timeout=2) state = self.status(job_name) if state in (KubeflowJobState.SUCCEEDED, KubeflowJobState.FAILED): break # job reached a terminal state, stop streaming - logger.warning( - "kubectl logs exited (rc=%d, lines=%d, state=%s); retrying", - proc.returncode, - lines_yielded, - state, - ) - time.sleep(5) + time.sleep(2) else: pod_index = _pod_index_map() - result = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout) + result = subprocess.run( + base_cmd + ["--tail", "-1"], capture_output=True, text=True, timeout=timeout + ) with open(all_ranks_path, "a") as all_ranks_file: for line in result.stdout.splitlines(): all_ranks_file.write(line + "\n") From ec27ed58d5a150630ff8f59ffe075cf609af7353 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sun, 31 May 2026 09:53:39 +0000 Subject: [PATCH 12/20] fix(kubeflow): resolve rank-0/last pods from worker GROUP_RANK, not a heuristic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The console forwarded rank 0 + the loss rank using 
completion-index with an empirical 'num_nodes//2+1' slot for world_size-1. That's fragile: the c10d rendezvous assigns torch ranks by join order, not JobSet completion-index, so the loss rank lands on an unpredictable pod (observed: completion-index 9 was actually GROUP_RANK 15 = RANK 63 = world_size-1). Read the ground truth instead: torchrun exports GROUP_RANK into every worker's /proc//environ, so 'kubectl exec -- ' reading it tells us exactly which pod holds GROUP_RANK 0 (RANK 0, local 0) and GROUP_RANK num_nodes-1 (RANK world_size-1, local nproc-1). Resolve the pod->GROUP_RANK map once the workers exist, cache it, and re-resolve when the rank-0/last pod is no longer covered (gang restart reshuffles ranks). Until workers come up (empty map), fall back to the completion-index-0 pod so early setup still streams. Co-Authored-By: Claude Opus 4.8 (1M context) Signed-off-by: oliver könig --- nemo_run/core/execution/kubeflow.py | 71 ++++++++++++++++++++++------- 1 file changed, 55 insertions(+), 16 deletions(-) diff --git a/nemo_run/core/execution/kubeflow.py b/nemo_run/core/execution/kubeflow.py index 745a9ba9..3e287f72 100644 --- a/nemo_run/core/execution/kubeflow.py +++ b/nemo_run/core/execution/kubeflow.py @@ -511,29 +511,66 @@ def _pod_index_map() -> dict[str, int]: mapping[name] = int(idx) return mapping - # Two ranks worth surfacing to the CI console: rank 0 (setup/config) at - # completion-index 0 / local 0, and the rank that emits print_rank_last's - # per-step loss/throughput. The c10d rendezvous does NOT map completion - # index to torch rank identically (it assigns by join order), so torch's - # world_size-1 does not land on the highest completion-index. Empirically - # on this JobSet it lands on completion-index `num_nodes//2 + 1`, local - # rank `nproc-1` (e.g. 16 nodes → index 9; default7 on 8-GPU, default3 on - # 4-GPU). Match those two slots directly. (A deterministic completion - # index→rank mapping — e.g. 
topology-aware/static rank ordering — would - # let us compute this instead of relying on the observed slot.) - last_node = self.num_nodes // 2 + 1 + # Forward only RANK 0 (setup/config) and RANK world_size-1 (Megatron's + # print_rank_last per-step loss/throughput). The c10d rendezvous assigns + # torch ranks by join order, NOT by JobSet completion-index, so we read + # the ground truth: torchrun exports GROUP_RANK (the node rank) into every + # worker's /proc//environ. The pod whose worker has GROUP_RANK 0 + # holds RANK 0 (local 0); the pod with GROUP_RANK num_nodes-1 holds + # RANK world_size-1 (local nproc-1). The map is resolved once the workers + # exist (post-rendezvous) and cached; it is re-resolved when the rank-0 or + # last pod is no longer covered (a gang restart reshuffles ranks). Before + # the workers come up (empty map) we fall back to the completion-index-0 + # pod so early setup output still streams. + last_group_rank = self.num_nodes - 1 + group_rank_map: dict[str, int] = {} + + def _read_group_rank(pod: str) -> Optional[int]: + """Read a torchrun worker's GROUP_RANK from /proc//environ in *pod*.""" + script = ( + "for e in /proc/[0-9]*/environ; do " + "g=$(tr '\\0' '\\n' < \"$e\" 2>/dev/null | grep -m1 '^GROUP_RANK='); " + "[ -n \"$g\" ] && { echo \"$g\"; break; }; done" + ) + try: + out = subprocess.run( + ["kubectl", "exec", pod, "-n", self.namespace, "-c", "node", "--", "sh", "-c", script], + capture_output=True, + text=True, + timeout=min(timeout, 30), + ) + except Exception: + return None + m = re.search(r"GROUP_RANK=(\d+)", out.stdout) + return int(m.group(1)) if m else None + + def _ensure_group_ranks(current_pods: set[str]) -> None: + """Resolve pod → GROUP_RANK via worker environ if rank 0 / last not yet covered.""" + covered = {group_rank_map[p] for p in current_pods if p in group_rank_map} + if 0 in covered and last_group_rank in covered: + return + group_rank_map.clear() + for pod in current_pods: + g = _read_group_rank(pod) + if g 
is not None: + group_rank_map[pod] = g def _forward_to_stdout(log_line: str, pod_index: dict[str, int]) -> bool: - """True only for (index 0, local 0) and (index num_nodes//2+1, local nproc-1).""" + """True only for RANK 0 and RANK world_size-1. + + Uses the resolved GROUP_RANK map; before the workers come up (empty + map) falls back to the completion-index-0 pod for early setup output. + """ pod_match = pod_re.search(log_line) local_match = local_re.search(log_line) if not pod_match or not local_match: return False - node = pod_index.get(pod_match.group(1)) - if node is None: - return False + pod = pod_match.group(1) local = int(local_match.group(1)) - return (node == 0 and local == 0) or (node == last_node and local == nproc - 1) + gr = group_rank_map.get(pod) + if gr is not None: + return (gr == 0 and local == 0) or (gr == last_group_rank and local == nproc - 1) + return pod_index.get(pod) == 0 and local == 0 all_ranks_path = os.path.join(self.job_dir, "log-allranks_0.out") os.makedirs(self.job_dir, exist_ok=True) @@ -543,6 +580,7 @@ def _forward_to_stdout(log_line: str, pod_index: dict[str, int]) -> bool: since_time: Optional[str] = None while True: pod_index = _pod_index_map() + _ensure_group_ranks(set(pod_index)) attempt_cmd = base_cmd + ["--timestamps", "-f"] # First attach replays history (--tail=-1); reconnects resume from # the last seen timestamp so re-attaching never re-emits old lines. 
@@ -578,6 +616,7 @@ def _forward_to_stdout(log_line: str, pod_index: dict[str, int]) -> bool: time.sleep(2) else: pod_index = _pod_index_map() + _ensure_group_ranks(set(pod_index)) result = subprocess.run( base_cmd + ["--tail", "-1"], capture_output=True, text=True, timeout=timeout ) From 213ba39e41b273620f8b9988d603c529caa154ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sun, 31 May 2026 13:20:14 +0000 Subject: [PATCH 13/20] fix(kubeflow): emit forwarded log lines in timestamp order MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 'kubectl logs -l ... -f' multiplexes every pod into one stream in ARRIVAL order, not timestamp order. Because the console forwards two pods (rank 0 and the last rank), their lines could interleave wrong — e.g. two rank-0 'Step Time' lines bunching before the last rank's 'iteration N' line, or a step time landing under the next iteration. Add a small reorder buffer on the forwarded (yielded) subset only: each line already carries the kubelet --timestamps value (parsed to epoch via the new _ts_epoch), so hold lines until they are older than reorder_hold_s (2s) and emit sorted by timestamp. The window comfortably absorbs cross-node clock skew + flush jitter while keeping the console near-live. The buffer is drained in order after each proc ends (re-attach) — outside finally, since yielding during generator close is unsafe. The full all-ranks debug file is untouched. Co-Authored-By: Claude Opus 4.8 (1M context) Signed-off-by: oliver könig --- nemo_run/core/execution/kubeflow.py | 40 ++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/nemo_run/core/execution/kubeflow.py b/nemo_run/core/execution/kubeflow.py index 3e287f72..d8cf975d 100644 --- a/nemo_run/core/execution/kubeflow.py +++ b/nemo_run/core/execution/kubeflow.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import calendar import getpass import json import logging @@ -485,6 +486,17 @@ def _split_ts(line: str) -> tuple[Optional[str], str]: return None, line return m.group(2), m.group(1) + " " + line[m.end() :] + epoch_re = re.compile(r"^(\d{4})-(\d\d)-(\d\d)T(\d\d):(\d\d):(\d\d)(?:\.(\d+))?Z?$") + + def _ts_epoch(ts: str) -> Optional[float]: + """RFC3339 UTC stamp (kubectl --timestamps, ns precision) → epoch seconds.""" + m = epoch_re.match(ts) + if not m: + return None + y, mo, d, h, mi, s = (int(m.group(i)) for i in range(1, 7)) + frac = float("0." + m.group(7)) if m.group(7) else 0.0 + return calendar.timegm((y, mo, d, h, mi, s, 0, 0, 0)) + frac + nproc = self.nproc_per_node() pod_re = re.compile(r"pod/([^/]+)/") local_re = re.compile(r"\[default(\d+)\]") @@ -577,6 +589,13 @@ def _forward_to_stdout(log_line: str, pod_index: dict[str, int]) -> bool: if stream: reattach_interval_s = 120.0 + # `kubectl logs -l ... -f` multiplexes pods in ARRIVAL order, so the two + # forwarded streams (rank 0 and the last rank, on different pods) can + # interleave out of timestamp order. Hold each forwarded line in a small + # buffer and emit sorted by the kubelet --timestamps value once it is + # older than REORDER_HOLD_S — long enough to absorb cross-node clock + # skew + flush jitter, short enough to keep the console near-live. + reorder_hold_s = 2.0 since_time: Optional[str] = None while True: pod_index = _pod_index_map() @@ -593,6 +612,7 @@ def _forward_to_stdout(log_line: str, pod_index: dict[str, int]) -> bool: # the reconnect from replaying history. 
reattach_timer = threading.Timer(reattach_interval_s, proc.terminate) reattach_timer.start() + reorder_buf: list[tuple[float, str]] = [] try: with open(all_ranks_path, "a") as all_ranks_file: for raw in iter(proc.stdout.readline, ""): @@ -602,14 +622,32 @@ def _forward_to_stdout(log_line: str, pod_index: dict[str, int]) -> bool: if ts is not None and (since_time is None or ts > since_time): since_time = ts all_ranks_file.write(line) - if _forward_to_stdout(line, pod_index): + if not _forward_to_stdout(line, pod_index): + continue + ep = _ts_epoch(ts) if ts else None + if ep is None: yield line + continue + reorder_buf.append((ep, line)) + reorder_buf.sort(key=lambda x: x[0]) + cutoff = ep - reorder_hold_s + ready = 0 + while ready < len(reorder_buf) and reorder_buf[ready][0] <= cutoff: + ready += 1 + for _, ready_line in reorder_buf[:ready]: + yield ready_line + del reorder_buf[:ready] except Exception as e: logger.warning("Error streaming logs: %s; retrying", e) finally: reattach_timer.cancel() proc.terminate() proc.wait(timeout=2) + # Flush the rest in timestamp order before re-attaching (yielding in + # finally is unsafe on generator close, so drain here). + reorder_buf.sort(key=lambda x: x[0]) + for _, ready_line in reorder_buf: + yield ready_line state = self.status(job_name) if state in (KubeflowJobState.SUCCEEDED, KubeflowJobState.FAILED): break # job reached a terminal state, stop streaming From 981e6f9e28001807132beb724ae2227fbb2da1b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sun, 31 May 2026 15:16:40 +0000 Subject: [PATCH 14/20] feat(kubeflow): support pod-template annotations/labels (podTemplateOverrides metadata) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The executor's existing 'annotations' land on the TrainJob object. 
GKE multi-network attach (networking.gke.io/interfaces, for GPUDirect-RDMA/gIB) is read off the trainer POD, not the TrainJob — add pod_annotations (and pod_labels) that flow into podTemplateOverrides[].metadata, which the Kubeflow Trainer v2 CRD supports. Co-Authored-By: Claude Opus 4.8 (1M context) Signed-off-by: oliver könig --- nemo_run/core/execution/kubeflow.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/nemo_run/core/execution/kubeflow.py b/nemo_run/core/execution/kubeflow.py index d8cf975d..cd033c1b 100644 --- a/nemo_run/core/execution/kubeflow.py +++ b/nemo_run/core/execution/kubeflow.py @@ -93,6 +93,11 @@ class KubeflowExecutor(Executor): volumes: list[dict[str, Any]] = field(default_factory=list) labels: dict[str, Any] = field(default_factory=dict) annotations: dict[str, Any] = field(default_factory=dict) + # pod_annotations land on the trainer POD template (podTemplateOverrides[].metadata), + # not the TrainJob object — needed for e.g. GKE multi-network attach + # (networking.gke.io/interfaces) which is read off the pod, not the TrainJob. + pod_annotations: dict[str, Any] = field(default_factory=dict) + pod_labels: dict[str, Any] = field(default_factory=dict) tolerations: list[dict[str, Any]] = field(default_factory=list) affinity: dict[str, Any] = field(default_factory=dict) # env_list accepts full env var dicts (e.g. valueFrom/secretKeyRef). 
@@ -267,10 +272,18 @@ def get_job_body(self, name: str, command: list[str]) -> dict: "runtimeRef": {"name": self.runtime_ref}, "trainer": trainer, } - if pod_spec_override: - spec["podTemplateOverrides"] = [ - {"targetJobs": [{"name": "node"}], "spec": pod_spec_override} - ] + if pod_spec_override or self.pod_annotations or self.pod_labels: + override_entry: dict[str, Any] = {"targetJobs": [{"name": "node"}]} + if pod_spec_override: + override_entry["spec"] = pod_spec_override + pod_meta: dict[str, Any] = {} + if self.pod_labels: + pod_meta["labels"] = self.pod_labels + if self.pod_annotations: + pod_meta["annotations"] = self.pod_annotations + if pod_meta: + override_entry["metadata"] = pod_meta + spec["podTemplateOverrides"] = [override_entry] spec.update(self.spec_kwargs) metadata: dict[str, Any] = {"name": name, "namespace": self.namespace} From c23cecf7d5de6298307de4e1c52f0b71d02a1974 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sun, 31 May 2026 22:12:30 +0000 Subject: [PATCH 15/20] fix(kubeflow): resolve rank-0 and last rank before forwarding logs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On first attach the GROUP_RANK pod map is empty until the torchrun workers finish rendezvous, so _forward_to_stdout fell back to rank-0-only and the last rank's early per-step loss/throughput lines (replayed via --tail=-1) were written to log-allranks but never forwarded to stdout — the CI log silently dropped the beginning of the run until a re-attach ~120s later, by which point --since-time skips the replayed history. Poll on the first attach until both rank 0 and the last rank resolve before forwarding, capped at 600s (then fall back). The wait is gated on a non-empty pod list, so it is a no-op when pods can't be listed (no kubectl / unit tests) and engages only for real runs. 
Signed-off-by: oliver könig --- nemo_run/core/execution/kubeflow.py | 30 +++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/nemo_run/core/execution/kubeflow.py b/nemo_run/core/execution/kubeflow.py index cd033c1b..04a32b65 100644 --- a/nemo_run/core/execution/kubeflow.py +++ b/nemo_run/core/execution/kubeflow.py @@ -609,10 +609,40 @@ def _forward_to_stdout(log_line: str, pod_index: dict[str, int]) -> bool: # older than REORDER_HOLD_S — long enough to absorb cross-node clock # skew + flush jitter, short enough to keep the console near-live. reorder_hold_s = 2.0 + # First attach: resolve BOTH rank 0 and the last rank before forwarding + # any line. GROUP_RANK is only readable once the torchrun workers have + # rendezvoused, so the map is empty at first and _forward_to_stdout would + # fall back to rank-0-only — the last rank's early per-step loss lines + # (replayed via --tail=-1) would land in log-allranks but never reach + # stdout, silently dropping the beginning of the run from the CI log. + # Poll until both are resolved, capped so a run that never exposes + # GROUP_RANK still streams (with the completion-index fallback). + rank_resolve_timeout_s = 600.0 + rank_resolve_poll_s = 5.0 since_time: Optional[str] = None while True: pod_index = _pod_index_map() _ensure_group_ranks(set(pod_index)) + if since_time is None: + # First attach: wait until BOTH rank 0 and the last rank are + # resolved before forwarding, so the last rank's early per-step + # lines (replayed via --tail=-1) reach stdout instead of only + # log-allranks. Only wait while pods are actually listable; an + # empty list (no kubectl / unit tests) skips the wait and streams + # with the existing completion-index fallback. 
+ resolve_deadline = time.time() + rank_resolve_timeout_s + while pod_index and not {0, last_group_rank} <= set(group_rank_map.values()): + if time.time() >= resolve_deadline: + logger.warning( + "rank 0 / last rank (%d) not both resolved within %.0fs; " + "forwarding with completion-index fallback", + last_group_rank, + rank_resolve_timeout_s, + ) + break + time.sleep(rank_resolve_poll_s) + pod_index = _pod_index_map() + _ensure_group_ranks(set(pod_index)) attempt_cmd = base_cmd + ["--timestamps", "-f"] # First attach replays history (--tail=-1); reconnects resume from # the last seen timestamp so re-attaching never re-emits old lines. From 2b344dc553cc98e3244b04415c334cb213c259c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Mon, 1 Jun 2026 06:59:36 +0000 Subject: [PATCH 16/20] fix(kubeflow): wait for rank-0/last to resolve, never fall back to completion-index MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The first-attach barrier capped the wait at 600s and then forwarded with the completion-index heuristic, which streams the wrong rank. A job can legitimately sit Pending (starved for nodes) far longer than 600s, so it would time out and mis-forward. Drop the timeout/fallback: keep polling while the job is alive and stop only when it reaches a terminal state. --tail=-1 on first attach replays history, so waiting loses nothing. Signed-off-by: oliver könig --- nemo_run/core/execution/kubeflow.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/nemo_run/core/execution/kubeflow.py b/nemo_run/core/execution/kubeflow.py index 04a32b65..a6e28a36 100644 --- a/nemo_run/core/execution/kubeflow.py +++ b/nemo_run/core/execution/kubeflow.py @@ -617,7 +617,6 @@ def _forward_to_stdout(log_line: str, pod_index: dict[str, int]) -> bool: # stdout, silently dropping the beginning of the run from the CI log. 
# Poll until both are resolved, capped so a run that never exposes # GROUP_RANK still streams (with the completion-index fallback). - rank_resolve_timeout_s = 600.0 rank_resolve_poll_s = 5.0 since_time: Optional[str] = None while True: @@ -627,18 +626,17 @@ def _forward_to_stdout(log_line: str, pod_index: dict[str, int]) -> bool: # First attach: wait until BOTH rank 0 and the last rank are # resolved before forwarding, so the last rank's early per-step # lines (replayed via --tail=-1) reach stdout instead of only - # log-allranks. Only wait while pods are actually listable; an - # empty list (no kubectl / unit tests) skips the wait and streams - # with the existing completion-index fallback. - resolve_deadline = time.time() + rank_resolve_timeout_s + # log-allranks. Never fall back to the completion-index heuristic + # — it forwards the wrong rank. The job may sit Pending (waiting + # for nodes) or be mid-rendezvous, so keep waiting while it is + # alive; --tail=-1 on first attach replays history, so nothing is + # lost by waiting. Stop only if the job reaches a terminal state + # (or pods aren't listable at all — e.g. no kubectl / unit tests). 
while pod_index and not {0, last_group_rank} <= set(group_rank_map.values()): - if time.time() >= resolve_deadline: - logger.warning( - "rank 0 / last rank (%d) not both resolved within %.0fs; " - "forwarding with completion-index fallback", - last_group_rank, - rank_resolve_timeout_s, - ) + if self.status(job_name) in ( + KubeflowJobState.SUCCEEDED, + KubeflowJobState.FAILED, + ): break time.sleep(rank_resolve_poll_s) pod_index = _pod_index_map() From 2870f128d0802bfbb1fdfa50276031263d7077c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Mon, 1 Jun 2026 10:15:57 +0000 Subject: [PATCH 17/20] style(kubeflow): ruff-format kubeflow.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- nemo_run/core/execution/kubeflow.py | 42 ++++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 6 deletions(-) diff --git a/nemo_run/core/execution/kubeflow.py b/nemo_run/core/execution/kubeflow.py index a6e28a36..8599e612 100644 --- a/nemo_run/core/execution/kubeflow.py +++ b/nemo_run/core/execution/kubeflow.py @@ -315,7 +315,9 @@ def _trainjob_name(self, fallback: str) -> str: cached = getattr(self, "_k8s_job_name", None) if cached is not None: return cached - base = re.sub(r"[^a-z0-9-]+", "-", (self.train_job_basename or fallback or "job").lower()).strip("-") + base = re.sub( + r"[^a-z0-9-]+", "-", (self.train_job_basename or fallback or "job").lower() + ).strip("-") uid = uuid.uuid4().hex[:6] base = base[: 33 - len(uid) - 1].strip("-") or "job" self._k8s_job_name = f"{base}-{uid}" @@ -518,7 +520,17 @@ def _pod_index_map() -> dict[str, int]: """Map pod name → job-completion-index (== torchrun node rank).""" try: out = subprocess.run( - ["kubectl", "get", "pods", "-n", self.namespace, "-l", label_selector, "-o", "json"], + [ + "kubectl", + "get", + "pods", + "-n", + self.namespace, + "-l", + label_selector, + "-o", + "json", + ], capture_output=True, text=True, timeout=timeout, @@ 
-555,11 +567,23 @@ def _read_group_rank(pod: str) -> Optional[int]: script = ( "for e in /proc/[0-9]*/environ; do " "g=$(tr '\\0' '\\n' < \"$e\" 2>/dev/null | grep -m1 '^GROUP_RANK='); " - "[ -n \"$g\" ] && { echo \"$g\"; break; }; done" + '[ -n "$g" ] && { echo "$g"; break; }; done' ) try: out = subprocess.run( - ["kubectl", "exec", pod, "-n", self.namespace, "-c", "node", "--", "sh", "-c", script], + [ + "kubectl", + "exec", + pod, + "-n", + self.namespace, + "-c", + "node", + "--", + "sh", + "-c", + script, + ], capture_output=True, text=True, timeout=min(timeout, 30), @@ -644,9 +668,15 @@ def _forward_to_stdout(log_line: str, pod_index: dict[str, int]) -> bool: attempt_cmd = base_cmd + ["--timestamps", "-f"] # First attach replays history (--tail=-1); reconnects resume from # the last seen timestamp so re-attaching never re-emits old lines. - attempt_cmd += ["--tail", "-1"] if since_time is None else ["--since-time", since_time] + attempt_cmd += ( + ["--tail", "-1"] if since_time is None else ["--since-time", since_time] + ) proc = subprocess.Popen( - attempt_cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL, text=True, bufsize=1 + attempt_cmd, + stdout=subprocess.PIPE, + stderr=subprocess.DEVNULL, + text=True, + bufsize=1, ) # Force a periodic re-attach (terminate → reconnect) so pods that # (re)started after this attach are picked up; --since-time keeps From b6c3d8fb4b127d1fd608152fca69faab4cbe1713 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Mon, 1 Jun 2026 10:31:53 +0000 Subject: [PATCH 18/20] test(kubeflow): update stale tests for uuid names, idempotent 409, rank-0/last log forwarding MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The base kubeflow rewrite changed behavior the tests still asserted the old way: TrainJob names are now {base}-{uid} (sanitized basename plus a 6-character hex suffix); a 409 cancels the stale job and recreates (idempotent) rather than raising; and fetch_logs writes every rank to {job_dir}/log-allranks_0.out while forwarding
only rank-0 + the last rank to stdout. Set job_dir, patch status/time.sleep to avoid the retry-loop hang, and assert the all-ranks file + uuid-suffixed names. Signed-off-by: oliver könig --- test/core/execution/test_kubeflow.py | 82 ++++++++++++++++++---------- 1 file changed, 52 insertions(+), 30 deletions(-) diff --git a/test/core/execution/test_kubeflow.py b/test/core/execution/test_kubeflow.py index eb406fa9..4f037a8b 100644 --- a/test/core/execution/test_kubeflow.py +++ b/test/core/execution/test_kubeflow.py @@ -243,7 +243,8 @@ def test_launch_success(self, executor, mock_k8s_clients): mock_custom.create_namespaced_custom_object.return_value = {} job_name, state = executor.launch("test-job", ["/bin/bash", "-c", "echo hi"]) - assert job_name == "test-job" + # TrainJob names are {base}-{uid} (RFC-1123 safe, unique per launch) + assert job_name.startswith("test-job-") and len(job_name) == len("test-job-") + 6 assert state == KubeflowJobState.CREATED mock_custom.create_namespaced_custom_object.assert_called_once() @@ -272,12 +273,18 @@ def test_launch_wait_timeout(self, executor, mock_k8s_clients): with pytest.raises(RuntimeError, match="did not reach RUNNING"): executor.launch("test-job", ["echo"], wait=True, timeout=-1) - def test_launch_conflict(self, executor, mock_k8s_clients): + def test_launch_conflict_recreates(self, executor, mock_k8s_clients): mock_custom, _ = mock_k8s_clients - mock_custom.create_namespaced_custom_object.side_effect = ApiException(status=409) + # A 409 means a stale TrainJob from a prior attempt lingers; launch cancels + # it and recreates so the caller's retry makes progress (idempotent launch).
+ mock_custom.create_namespaced_custom_object.side_effect = [ApiException(status=409), {}] - with pytest.raises(RuntimeError, match="already exists"): - executor.launch("test-job", ["/bin/bash", "-c", "echo hi"]) + with patch.object(executor, "cancel") as mock_cancel: + _, state = executor.launch("test-job", ["/bin/bash", "-c", "echo hi"]) + + mock_cancel.assert_called_once() + assert mock_custom.create_namespaced_custom_object.call_count == 2 + assert state == KubeflowJobState.CREATED def test_status_running(self, executor, mock_k8s_clients): mock_custom, _ = mock_k8s_clients @@ -346,34 +353,38 @@ def test_cancel_with_wait_timeout(self, executor, mock_k8s_clients): # ── Logs ───────────────────────────────────────────────────────────────────── - def test_fetch_logs_no_follow(self, executor, mock_k8s_clients): + def test_fetch_logs_no_follow(self, executor, mock_k8s_clients, tmp_path): + executor.job_dir = str(tmp_path) with patch("subprocess.run") as mock_run: mock_run.return_value = MagicMock(stdout="line1\nline2\n") - lines = list(executor.fetch_logs("my-job", stream=False, lines=50)) - - mock_run.assert_called_once() - called_cmd = mock_run.call_args[0][0] - assert "--tail" in called_cmd - assert "50" in called_cmd - label_arg = " ".join(called_cmd) - assert "jobset.sigs.k8s.io/jobset-name=my-job" in label_arg - assert "-f" not in called_cmd - assert lines == ["line1", "line2"] - - def test_fetch_logs_follow(self, executor, mock_k8s_clients): + list(executor.fetch_logs("my-job", stream=False, lines=50)) + + # the kubectl logs call (distinct from the pod-index lookup) targets the + # jobset and does not follow. 
+ log_cmd = next(c.args[0] for c in mock_run.call_args_list if "logs" in c.args[0]) + assert "jobset.sigs.k8s.io/jobset-name=my-job" in " ".join(log_cmd) + assert "--tail" in log_cmd and "-f" not in log_cmd + # every rank is persisted to the all-ranks log + assert (tmp_path / "log-allranks_0.out").read_text() == "line1\nline2\n" + + def test_fetch_logs_follow(self, executor, mock_k8s_clients, tmp_path): import io + executor.job_dir = str(tmp_path) mock_proc = MagicMock() mock_proc.stdout = io.StringIO("line1\nline2\n") mock_proc.poll.return_value = None # still running; loop exits when readline() hits EOF - with patch("subprocess.Popen", return_value=mock_proc) as mock_popen: - lines = list(executor.fetch_logs("my-job", stream=True, lines=100)) + with ( + patch("subprocess.Popen", return_value=mock_proc) as mock_popen, + patch("time.sleep"), + patch.object(executor, "status", return_value=KubeflowJobState.SUCCEEDED), + ): + list(executor.fetch_logs("my-job", stream=True, lines=100)) - mock_popen.assert_called_once() - called_cmd = mock_popen.call_args[0][0] - assert "-f" in called_cmd - assert lines == ["line1\n", "line2\n"] + assert "-f" in mock_popen.call_args.args[0] + # every rank is persisted to the all-ranks log + assert (tmp_path / "log-allranks_0.out").read_text() == "line1\nline2\n" def test_status_unknown_when_empty(self, mock_k8s_clients): mock_custom, _ = mock_k8s_clients @@ -473,10 +484,14 @@ def test_pull_results_syncs_from_pvc(self, workdir_executor, mock_k8s_clients): mock_core.create_namespaced_pod.assert_called_once() assert mock_check_call.call_count == 1 # kubectl cp only (no mkdir for pull) cp_args = mock_check_call.call_args[0][0] - # kubectl cp /: + # kubectl cp /: ; the data-mover pod is named off the + # - TrainJob name. 
assert "kubectl" in cp_args assert "cp" in cp_args - assert f"test-job-data-mover:{workdir_executor.code_dir}" in cp_args + dm = next(a for a in cp_args if "-data-mover:" in a) + assert dm.startswith("test-job-") and dm.endswith( + f"-data-mover:{workdir_executor.code_dir}" + ) def test_pull_results_noop_without_workdir_pvc(self, mock_k8s_clients): e = KubeflowExecutor(image="test:latest") @@ -590,11 +605,14 @@ def test_launch_wait_exits_on_failed(self, executor, mock_k8s_clients): # ── fetch_logs streaming: retry until terminal state ───────────────────── - def test_fetch_logs_stream_retries_until_terminal_state(self, executor, mock_k8s_clients): + def test_fetch_logs_stream_retries_until_terminal_state( + self, executor, mock_k8s_clients, tmp_path + ): """First Popen yields nothing and job is RUNNING; second yields a line and job is SUCCEEDED — loop exits on terminal status.""" import io + executor.job_dir = str(tmp_path) empty_proc = MagicMock() empty_proc.stdout = io.StringIO("") empty_proc.poll.return_value = None @@ -607,6 +625,8 @@ def test_fetch_logs_stream_retries_until_terminal_state(self, executor, mock_k8s with ( patch("subprocess.Popen", side_effect=[empty_proc, output_proc]), + # no pods listable -> the rank-resolve barrier is a no-op (hermetic: no real kubectl) + patch("subprocess.run", return_value=MagicMock(stdout='{"items": []}')), patch("time.sleep"), patch.object( executor, @@ -614,13 +634,15 @@ def test_fetch_logs_stream_retries_until_terminal_state(self, executor, mock_k8s side_effect=[KubeflowJobState.RUNNING, KubeflowJobState.SUCCEEDED], ), ): - lines = list(executor.fetch_logs("my-job", stream=True)) + list(executor.fetch_logs("my-job", stream=True)) - assert "some output\n" in lines + # forwarded stdout is rank-0/last only, but every rank lands in the all-ranks log + assert "some output" in (tmp_path / "log-allranks_0.out").read_text() - def test_fetch_logs_stream_handles_exception(self, executor, mock_k8s_clients): + def 
test_fetch_logs_stream_handles_exception(self, executor, mock_k8s_clients, tmp_path): """Exception inside the readline loop is caught; loop exits when job is terminal.""" + executor.job_dir = str(tmp_path) mock_proc = MagicMock() mock_proc.stdout.readline.side_effect = OSError("read error") mock_proc.poll.return_value = None From 4e9346f5c126f248536063c54b469782afc56e48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Mon, 1 Jun 2026 10:50:36 +0000 Subject: [PATCH 19/20] test(kubeflow): cover GROUP_RANK resolution, log forwarding, client reload MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Raises codecov/patch on the diff from 58% to ~98% by exercising the previously-untested branches: GROUP_RANK resolution via worker environ (incl. the first-attach resolve barrier), rank-0/last-rank forwarding + reorder buffer, the completion-index fallback, pod-template labels/ annotations, stale kube-client reload, and the status() connection-error retry path. 
Co-Authored-By: Claude Opus 4.8 (1M context) Signed-off-by: oliver könig --- test/core/execution/test_kubeflow.py | 164 ++++++++++++++++++++++++++- 1 file changed, 163 insertions(+), 1 deletion(-) diff --git a/test/core/execution/test_kubeflow.py b/test/core/execution/test_kubeflow.py index 4f037a8b..278bc20f 100644 --- a/test/core/execution/test_kubeflow.py +++ b/test/core/execution/test_kubeflow.py @@ -18,7 +18,11 @@ import pytest from kubernetes.client.rest import ApiException -from nemo_run.core.execution.kubeflow import KubeflowExecutor, KubeflowJobState +from nemo_run.core.execution.kubeflow import ( + _KUBE_CLIENT_REFRESH_SECONDS, + KubeflowExecutor, + KubeflowJobState, +) class TestKubeflowExecutor: @@ -873,3 +877,161 @@ def test_lookup_job_dir_returns_empty_on_exception(self, mock_k8s_clients): with patch("nemo_run.config.get_nemorun_home", side_effect=Exception("boom")): result = e._lookup_job_dir("test-job") assert result == "" + + # ── get_job_body(): pod-template labels / annotations ───────────────────── + + def test_get_trainjob_body_pod_labels_and_annotations(self, mock_k8s_clients): + e = KubeflowExecutor( + image="test:latest", + pod_labels={"nemo-ci/job-id": "42"}, + pod_annotations={"sidecar.istio.io/inject": "false"}, + ) + body = e.get_job_body("pod-labeled", ["echo"]) + meta = body["spec"]["podTemplateOverrides"][0]["metadata"] + assert meta["labels"] == {"nemo-ci/job-id": "42"} + assert meta["annotations"] == {"sidecar.istio.io/inject": "false"} + + # ── _maybe_reload_kube_clients(): rebuild after the refresh interval ────── + + def test_maybe_reload_kube_clients_rebuilds_when_stale(self, executor): + import time + + # Anchor relative to now (monotonic()'s epoch is arbitrary / uptime-based), + # so age exceeds the refresh interval regardless of the runner's uptime. 
+ executor._kube_clients_loaded_at = time.monotonic() - (_KUBE_CLIENT_REFRESH_SECONDS + 1) + with patch.object(executor, "_load_kube_clients") as mock_reload: + _ = executor._custom_objects_api + mock_reload.assert_called_once() + + def test_maybe_reload_kube_clients_skips_when_fresh(self, executor): + import time + + executor._kube_clients_loaded_at = time.monotonic() + with patch.object(executor, "_load_kube_clients") as mock_reload: + _ = executor._core_v1_api + mock_reload.assert_not_called() + + # ── status(): reload the kube client once on a non-API connection error ─── + + def test_status_reloads_kube_client_on_connection_error(self, executor, mock_k8s_clients): + mock_custom, _ = mock_k8s_clients + mock_custom.get_namespaced_custom_object.side_effect = [ + RuntimeError("expired client cert"), + {"status": {"jobsStatus": [{"active": 3, "ready": 3, "succeeded": 0, "failed": 0}]}}, + ] + with patch.object(executor, "_load_kube_clients") as mock_reload: + state = executor.status("my-job") + mock_reload.assert_called_once() + assert state == KubeflowJobState.RUNNING + + def test_status_returns_none_when_reload_does_not_help(self, executor, mock_k8s_clients): + mock_custom, _ = mock_k8s_clients + mock_custom.get_namespaced_custom_object.side_effect = RuntimeError("still broken") + with patch.object(executor, "_load_kube_clients"): + assert executor.status("my-job") is None + + # ── fetch_logs(stream): resolve GROUP_RANK, forward rank-0 + last only ──── + + def test_fetch_logs_stream_resolves_group_ranks_and_forwards( + self, executor, mock_k8s_clients, tmp_path + ): + """End-to-end stream: pods resolve their GROUP_RANK from worker environ + (after a first empty sweep that exercises the resolve barrier), and only + rank-0 + the last global rank reach stdout while every rank is persisted.""" + import io + import json + + executor.job_dir = str(tmp_path) + group_rank = {"pod-0": 0, "pod-1": 1, "pod-2": 2} + pods_json = json.dumps( + { + "items": [ + { + "metadata": 
{ + "name": p, + "labels": {"batch.kubernetes.io/job-completion-index": str(i)}, + } + } + for i, p in enumerate(group_rank) + ] + } + ) + exec_calls = {"n": 0} + + def fake_run(cmd, *args, **kwargs): + if "exec" in cmd: + exec_calls["n"] += 1 + if exec_calls["n"] <= len(group_rank): # first sweep: workers not up yet + return MagicMock(stdout="") + return MagicMock(stdout=f"GROUP_RANK={group_rank[cmd[2]]}\n") + return MagicMock(stdout=pods_json) # kubectl get pods -o json + + stream = io.StringIO( + "[pod/pod-0/node] 2026-06-01T10:00:01.000000000Z [default0]: rank0-step\n" + "[pod/pod-1/node] 2026-06-01T10:00:01.500000000Z [default3]: mid-rank\n" + "[pod/pod-2/node] 2026-06-01T10:00:02.000000000Z [default7]: lastrank-step\n" + "[pod/pod-0/node] [default0]: no-timestamp-line\n" + ) + proc = MagicMock() + proc.stdout = stream + + with ( + patch("subprocess.run", side_effect=fake_run), + patch("subprocess.Popen", return_value=proc), + patch("time.sleep"), + patch.object( + executor, + "status", + side_effect=[KubeflowJobState.RUNNING, KubeflowJobState.SUCCEEDED], + ), + ): + forwarded = "".join(executor.fetch_logs("my-job", stream=True)) + + assert exec_calls["n"] >= 2 * len(group_rank) # both resolve sweeps ran + assert "rank0-step" in forwarded # GROUP_RANK 0, local 0 + assert "lastrank-step" in forwarded # GROUP_RANK 2 (== num_nodes-1), local 7 (== nproc-1) + assert "no-timestamp-line" in forwarded # forwarded immediately (no reorder buffer) + assert "mid-rank" not in forwarded # neither rank-0 nor last rank + all_ranks = (tmp_path / "log-allranks_0.out").read_text() + for marker in ("rank0-step", "mid-rank", "lastrank-step", "no-timestamp-line"): + assert marker in all_ranks + + def test_fetch_logs_no_follow_forwards_rank0_via_completion_index( + self, executor, mock_k8s_clients, tmp_path + ): + """When GROUP_RANK is unreadable, fall back to the completion-index-0 pod + for early setup output; the last rank is not forwarded without GROUP_RANK.""" + import json + + 
executor.job_dir = str(tmp_path) + pods_json = json.dumps( + { + "items": [ + { + "metadata": { + "name": f"pod-{i}", + "labels": {"batch.kubernetes.io/job-completion-index": str(i)}, + } + } + for i in range(3) + ] + } + ) + + def fake_run(cmd, *args, **kwargs): + if "exec" in cmd: + return MagicMock(stdout="") # GROUP_RANK not resolvable → completion-index fallback + if "logs" in cmd: + return MagicMock( + stdout="[pod/pod-0/node] [default0]: setup-output\n" + "[pod/pod-2/node] [default7]: last-output\n" + ) + return MagicMock(stdout=pods_json) + + with patch("subprocess.run", side_effect=fake_run): + forwarded = "".join(executor.fetch_logs("my-job", stream=False)) + + assert "setup-output" in forwarded # completion-index-0 fallback + assert "last-output" not in forwarded # no GROUP_RANK → last rank not forwarded + all_ranks = (tmp_path / "log-allranks_0.out").read_text() + assert "setup-output" in all_ranks and "last-output" in all_ranks From 71461c5969b442cad4b5101bf52c82042267768c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Tue, 2 Jun 2026 07:30:15 +0000 Subject: [PATCH 20/20] feat(kubeflow): add copy_to_workspace/copy_from_workspace for arbitrary PVC paths MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit package()/pull_results() already bridge launcher↔PVC via a throw-away data-mover pod, but only for the per-job code_dir. Downloading results (or persisting any auxiliary cross-run state) from another path on the volume had no public API. Add copy_to_workspace(local, remote) and copy_from_workspace(remote, local) that run the same data-mover against an arbitrary path under workdir_pvc_path, and refactor package()/pull_results() to delegate to them (behavior unchanged). Tests cover the happy path, the no-PVC no-op, and pod teardown on copy error. 
Co-Authored-By: Claude Opus 4.8 (1M context) Signed-off-by: oliver könig --- nemo_run/core/execution/kubeflow.py | 63 ++++++++++++++++++++++------ test/core/execution/test_kubeflow.py | 43 +++++++++++++++++++ 2 files changed, 94 insertions(+), 12 deletions(-) diff --git a/nemo_run/core/execution/kubeflow.py b/nemo_run/core/execution/kubeflow.py index 8599e612..525a7c0d 100644 --- a/nemo_run/core/execution/kubeflow.py +++ b/nemo_run/core/execution/kubeflow.py @@ -937,6 +937,55 @@ def materialize_launch_script(self, cmd: list[str], max_retries: int = 0) -> Non f.write(script) logger.info("Wrote launch script to %s", launch_script_path) + def copy_to_workspace( + self, local_path: str, remote_path: str, label: str = "datamover" + ) -> None: + """Copy *local_path* (a directory) to *remote_path* on the workdir PVC. + + Generalizes :meth:`package`'s PVC sync to an arbitrary path on the volume — + not just the per-job ``code_dir`` — so callers can persist auxiliary + cross-run state (e.g. a metrics cache) anywhere under ``workdir_pvc_path`` + via the same throw-away data-mover pod. No-op when ``workdir_pvc`` is unset. + + Args: + local_path: Local directory whose contents are copied. + remote_path: Destination directory on the workdir PVC. + label: Disambiguates the data-mover pod name across concurrent transfers. + """ + if not self.workdir_pvc: + return + pod_name = self._data_mover_pod_name(label) + self._start_data_mover_pod(pod_name) + try: + self._rsync_to_pod(pod_name, local_path, remote_path) + finally: + self._delete_data_mover_pod(pod_name) + + def copy_from_workspace( + self, remote_path: str, local_path: str, label: str = "datamover" + ) -> None: + """Copy *remote_path* from the workdir PVC to *local_path*. + + Generalizes :meth:`pull_results` to an arbitrary path on the volume — not + just the per-job ``code_dir`` — so callers can read auxiliary cross-run + state via the same throw-away data-mover pod. No-op when ``workdir_pvc`` is + unset. 
Propagates the underlying ``kubectl cp`` error when *remote_path* + does not exist; callers that treat absence as normal should handle it. + + Args: + remote_path: Source directory on the workdir PVC. + local_path: Local destination directory. + label: Disambiguates the data-mover pod name across concurrent transfers. + """ + if not self.workdir_pvc: + return + pod_name = self._data_mover_pod_name(label) + self._start_data_mover_pod(pod_name) + try: + self._rsync_from_pod(pod_name, remote_path, local_path) + finally: + self._delete_data_mover_pod(pod_name) + def package(self, packager: Packager, job_name: str) -> None: """Sync job_dir to the workdir PVC via a temporary data-mover pod before launch. @@ -963,12 +1012,7 @@ def package(self, packager: Packager, job_name: str) -> None: # Sync job_dir to //code on the PVC via a # throw-away data-mover pod. Scoping to a user subdirectory means we # never clobber other data already on the shared volume. - pod_name = self._data_mover_pod_name(job_name) - self._start_data_mover_pod(pod_name) - try: - self._rsync_to_pod(pod_name, self.job_dir, self.code_dir) - finally: - self._delete_data_mover_pod(pod_name) + self.copy_to_workspace(self.job_dir, self.code_dir, label=job_name) # Mount the PVC so the training container can reach code_dir. # If the PVC is already declared (e.g. explicitly by the caller for data), @@ -1009,12 +1053,7 @@ def pull_results(self, job_name: str, dest_dir: Optional[str] = None) -> None: "Pass dest_dir explicitly or call via an executor that has job_dir set." 
) - pod_name = self._data_mover_pod_name(job_name) - self._start_data_mover_pod(pod_name) - try: - self._rsync_from_pod(pod_name, self.code_dir, local_path) - finally: - self._delete_data_mover_pod(pod_name) + self.copy_from_workspace(self.code_dir, local_path, label=job_name) def _lookup_job_dir(self, job_name: str) -> str: """Look up the job_dir saved by the scheduler for *job_name*.""" diff --git a/test/core/execution/test_kubeflow.py b/test/core/execution/test_kubeflow.py index 278bc20f..8babcf57 100644 --- a/test/core/execution/test_kubeflow.py +++ b/test/core/execution/test_kubeflow.py @@ -1035,3 +1035,46 @@ def fake_run(cmd, *args, **kwargs): assert "last-output" not in forwarded # no GROUP_RANK → last rank not forwarded all_ranks = (tmp_path / "log-allranks_0.out").read_text() assert "setup-output" in all_ranks and "last-output" in all_ranks + + # ── copy_to_workspace / copy_from_workspace (arbitrary-path PVC sync) ───── + + def test_copy_to_workspace_uses_data_mover(self, executor, mock_k8s_clients): + executor.workdir_pvc = "model-cache" + with ( + patch.object(executor, "_start_data_mover_pod") as start, + patch.object(executor, "_rsync_to_pod") as rsync, + patch.object(executor, "_delete_data_mover_pod") as delete, + ): + executor.copy_to_workspace("/local/dir", "/nemo-workspace/remote", label="x") + start.assert_called_once() + delete.assert_called_once() + assert rsync.call_args.args[1:] == ("/local/dir", "/nemo-workspace/remote") + + def test_copy_from_workspace_uses_data_mover(self, executor, mock_k8s_clients): + executor.workdir_pvc = "model-cache" + with ( + patch.object(executor, "_start_data_mover_pod"), + patch.object(executor, "_rsync_from_pod") as rsync, + patch.object(executor, "_delete_data_mover_pod") as delete, + ): + executor.copy_from_workspace("/nemo-workspace/remote", "/local/dir") + assert rsync.call_args.args[1:] == ("/nemo-workspace/remote", "/local/dir") + delete.assert_called_once() + + def 
test_copy_workspace_noop_without_pvc(self, executor, mock_k8s_clients): + executor.workdir_pvc = None + with patch.object(executor, "_start_data_mover_pod") as start: + executor.copy_to_workspace("/a", "/b") + executor.copy_from_workspace("/b", "/a") + start.assert_not_called() + + def test_copy_from_workspace_cleans_up_pod_on_error(self, executor, mock_k8s_clients): + executor.workdir_pvc = "model-cache" + with ( + patch.object(executor, "_start_data_mover_pod"), + patch.object(executor, "_rsync_from_pod", side_effect=RuntimeError("absent")), + patch.object(executor, "_delete_data_mover_pod") as delete, + ): + with pytest.raises(RuntimeError): + executor.copy_from_workspace("/nemo-workspace/missing", "/local/dir") + delete.assert_called_once() # pod torn down even when the copy raises