modelscope · randydl · Nov 18, 2025 · Nov 26, 2025 · Jan 29, 2026 · Feb 12, 2026
diff --git a/.deepspeed_env b/.deepspeed_env
@@ -0,0 +1,7 @@
+NCCL_IB_DISABLE=0
+NCCL_NVLS_ENABLE=0
+NCCL_IB_GID_INDEX=3
+NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_8,mlx5_9,mlx5_10,mlx5_11
+NCCL_SOCKET_IFNAME=bond0
+PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
+OMP_NUM_THREADS=1
diff --git a/main.py b/main.py
@@ -0,0 +1,27 @@
+import shutil
+from pathlib import Path
+from omegaconf import OmegaConf
+from swift import sft_main, SftArguments, export_main, ExportArguments
+
+
+def parse_config(path):
+    conf = OmegaConf.to_container(OmegaConf.load(path), resolve=True)
+    debug_mode = conf.pop('debug', False)
+    for key, value in conf.items():
+        if key == 'output_dir' and debug_mode:
+            value = str(Path(value).with_name('temp'))
+            shutil.rmtree(value, ignore_errors=True)
+        if isinstance(value, str) and ',' in value:
+            conf[key] = value.split(',')
+    conf.pop('stage', None)
+    conf.pop('deepspeed', None)
+    conf.pop('use_liger_kernel', None)
+    return conf
+
+
+if __name__ == '__main__':
+    # conf = parse_config('randy/demo.yaml')
+    # sft_main(SftArguments(**conf))
+
+    conf = parse_config('randy/cache_data.yaml')
+    export_main(ExportArguments(**conf))
diff --git a/pipeline.sh b/pipeline.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+
+set_env() {
+    grep -q "^$1=" .deepspeed_env \
+        && sed -i "s|^$1=.*|$1=$2|" .deepspeed_env \
+        || echo "$1=$2" >> .deepspeed_env
+}
+
+pdsh_run() {
+    hosts=$(grep -v '^\s*#' randy/hostfile | awk 'NF {print $1}' | paste -sd,)
+    pdsh -S -R ssh -w "$hosts" "$@"
+}
+
+SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)
+cd "$SCRIPT_DIR" || exit 1
+
+pdsh_run "bash $(realpath randy/killer.sh)"
+
+# ./train.sh randy
+
+./train.sh randy/openbee/llava_4b_1.yaml
+./train.sh randy/openbee/llava_4b_2.yaml
+./train.sh randy/openbee/llava_4b_3.yaml
+./train.sh randy/openbee/llava_4b_4.yaml
+
+pdsh_run "/nas_train/app.e0016372/tools/train.sh"
diff --git a/randy/cache_data.yaml b/randy/cache_data.yaml
@@ -0,0 +1,21 @@
+### model
+model: /nas_train/app.e0016372/models/lmms-lab/LLaVA-OneVision-1.5-4B
+
+### method
+stage: export
+to_cached_dataset: true
+
+### dataset
+dataset: bee_training_data_stage1
+max_pixels: 2560000
+dataset_num_proc: 128
+# split_dataset_ratio: 0.01
+
+### output
+output_dir: /nas_user/app.e0016372/datasets/Open-Bee/Bee-Training-Data-Stage1/cache
+
+### plugins
+external_plugins:
+  - randy/plugins/model.py
+  - randy/plugins/dataset.py
+  - randy/plugins/template.py
diff --git a/randy/dataset_info.json b/randy/dataset_info.json
@@ -0,0 +1,70 @@
+[
+  {
+    "dataset_name": "llava_pretrain_558k",
+    "dataset_path": "/nas_user/app.e0016372/datasets/LLaVA-Pretrain/llava_pretrain_558k.jsonl"
+  },
+  {
+    "dataset_name": "llava_onevision_1_5_mid_training_3m",
+    "dataset_path": "/nas_user/app.e0016372/datasets/LLaVA-OneVision-1.5-Mid-Training-3M/merged_all.jsonl"
+  },
+  {
+    "dataset_name": "llava_onevision_1_5_mid_training_coyo",
+    "dataset_path": "/nas_user/app.e0016372/datasets/LLaVA-OneVision-1.5-Mid-Training-85M/coyo/merged_all.jsonl"
+  },
+  {
+    "dataset_name": "llava_onevision_1_5_mid_training_datacomp1b",
+    "dataset_path": "/nas_user/app.e0016372/datasets/LLaVA-OneVision-1.5-Mid-Training-85M/datacomp1b/merged_all.jsonl"
+  },
+  {
+    "dataset_name": "llava_onevision_1_5_mid_training_imagenet",
+    "dataset_path": "/nas_user/app.e0016372/datasets/LLaVA-OneVision-1.5-Mid-Training-85M/imagenet/merged_all.jsonl"
+  },
+  {
+    "dataset_name": "llava_onevision_1_5_mid_training_laioncn",
+    "dataset_path": "/nas_user/app.e0016372/datasets/LLaVA-OneVision-1.5-Mid-Training-85M/laioncn/merged_all.jsonl"
+  },
+  {
+    "dataset_name": "llava_onevision_1_5_mid_training_mint",
+    "dataset_path": "/nas_user/app.e0016372/datasets/LLaVA-OneVision-1.5-Mid-Training-85M/mint/merged_all.jsonl"
+  },
+  {
+    "dataset_name": "llava_onevision_1_5_mid_training_obelics",
+    "dataset_path": "/nas_user/app.e0016372/datasets/LLaVA-OneVision-1.5-Mid-Training-85M/obelics/merged_all.jsonl"
+  },
+  {
+    "dataset_name": "omniscience",
+    "dataset_path": "/nas_user/app.e0016372/datasets/LLaVA-OneVision-1.5-Mid-Training-85M/omniscience/merged_all.jsonl"
+  },
+  {
+    "dataset_name": "llava_next_780k",
+    "dataset_path": "/nas_user/app.e0016372/datasets/LLaVA-NeXT-780k/llava_next_780k.jsonl"
+  },
+  {
+    "dataset_name": "llava_onevision_1_5_instruct_22m",
+    "dataset_path": "/nas_user/app.e0016372/datasets/LLaVA-OneVision-1.5-Instruct-Data/merged_all.jsonl"
+  },
+  {
+    "dataset_name": "finevision",
+    "dataset_path": "/nas_user/app.e0016372/datasets/FineVision/merged_all.jsonl"
+  },
+  {
+    "dataset_name": "cxmt_defects",
+    "dataset_path": "/nas_user/app.e0016372/datasets/cxmt/defects/defects.jsonl"
+  },
+  {
+    "dataset_name": "bee_training_data_stage1",
+    "dataset_path": "/nas_user/app.e0016372/datasets/Open-Bee/Bee-Training-Data-Stage1/merged_all.jsonl"
+  },
+  {
+    "dataset_name": "bee_training_data_stage2",
+    "dataset_path": "/nas_user/app.e0016372/datasets/Open-Bee/Bee-Training-Data-Stage2/merged_all.jsonl"
+  },
+  {
+    "dataset_name": "honey_data_15m",
+    "dataset_path": "/nas_user/app.e0016372/datasets/Open-Bee/Honey-Data-15M/merged_all.jsonl"
+  },
+  {
+    "dataset_name": "honey_data_1m",
+    "dataset_path": "/nas_user/app.e0016372/datasets/Open-Bee/Honey-Data-1M/merged_all.jsonl"
+  }
+]
diff --git a/randy/demo.yaml b/randy/demo.yaml
@@ -0,0 +1,56 @@
+### model
+model: /nas_train/app.e0016372/models/lmms-lab/LLaVA-OneVision-1.5-4B
+model_type: llava_onevision1_5
+
+### method
+debug: true
+stage: sft
+tuner_type: full
+lora_rank: 16
+lora_alpha: 32
+freeze_vit: true
+freeze_llm: true
+freeze_aligner: false
+
+### optimize
+attn_impl: flash_attn
+use_liger_kernel: false
+vit_gradient_checkpointing: false
+gradient_checkpointing_kwargs: {"use_reentrant": false}
+
+### dataset
+dataset: llava_pretrain_558k
+packing: false
+max_length: 4096
+max_pixels: 2560000
+dataloader_num_workers: 8
+dataloader_persistent_workers: true
+# split_dataset_ratio: 0.01
+# load_from_cache_file: true
+
+### output
+output_dir: /nas_train/app.e0016372/train/sft/full/LLaVA-OneVision-1.5-4B
+logging_steps: 10
+save_strategy: steps
+save_steps: 200
+save_total_limit: 2
+report_to: tensorboard
+create_checkpoint_symlink: true
+
+### train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 1
+weight_decay: 0.0
+warmup_ratio: 0.03
+learning_rate: 1e-4
+num_train_epochs: 1.0
+lr_scheduler_type: cosine_with_min_lr
+lr_scheduler_kwargs: {"min_lr": 1e-6}
+torch_dtype: bfloat16
+deepspeed: zero2
+
+### plugins
+external_plugins:
+  - randy/plugins/model.py
+  - randy/plugins/dataset.py
+  - randy/plugins/template.py
diff --git a/randy/hostfile b/randy/hostfile
@@ -0,0 +1,12 @@
+10.239.2.27 slots=8
+10.239.2.26 slots=8
+10.239.2.28 slots=8
+10.239.2.30 slots=8
+10.239.2.10 slots=8
+10.239.2.11 slots=8
+10.239.2.22 slots=8
+10.239.2.24 slots=8
+# 10.239.2.12 slots=8
+# 10.239.2.29 slots=8
+# 10.239.2.13 slots=8
+# 10.239.2.20 slots=8
diff --git a/randy/killer.sh b/randy/killer.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+nvidia-smi --query-compute-apps=pid --format=csv,noheader | sort -u | while read -r pid; do
+    cmd=$(ps -p "$pid" -o args= 2>/dev/null)
+
+    if [[ "$cmd" == *"examples/llava_ov_1_5/pretrain.py"* ]]; then
+        echo "Killing PID: $pid"
+        echo "$cmd"
+        sudo kill -9 "$pid"
+    fi
+done
diff --git a/randy/llava_14b_1.yaml b/randy/llava_14b_1.yaml
@@ -0,0 +1,56 @@
+### model
+model: /nas_train/app.e0016372/models/lmms-lab/LLaVA-OneVision-1.5-14B
+model_type: llava_onevision1_5
+
+### method
+# debug: true
+stage: sft
+tuner_type: full
+lora_rank: 16
+lora_alpha: 32
+freeze_vit: true
+freeze_llm: true
+freeze_aligner: false
+
+### optimize
+attn_impl: flash_attn
+use_liger_kernel: false
+vit_gradient_checkpointing: false
+gradient_checkpointing_kwargs: {"use_reentrant": false}
+
+### dataset
+dataset: llava_pretrain_558k
+packing: false
+max_length: 4096
+max_pixels: 2560000
+dataloader_num_workers: 8
+dataloader_persistent_workers: true
+# split_dataset_ratio: 0.01
+# load_from_cache_file: true
+
+### output
+output_dir: /nas_train/app.e0016372/train/sft/full/LLaVA-OneVision-1.5-14B
+logging_steps: 10
+save_strategy: steps
+save_steps: 200
+save_total_limit: 2
+report_to: tensorboard
+create_checkpoint_symlink: true
+
+### train
+per_device_train_batch_size: 8
+gradient_accumulation_steps: 4
+weight_decay: 0.0
+warmup_ratio: 0.03
+learning_rate: 1e-4
+num_train_epochs: 1.0
+lr_scheduler_type: cosine_with_min_lr
+lr_scheduler_kwargs: {"min_lr": 1e-6}
+torch_dtype: bfloat16
+deepspeed: zero3
+
+### plugins
+external_plugins:
+  - randy/plugins/model.py
+  - randy/plugins/dataset.py
+  - randy/plugins/template.py
diff --git a/randy/llava_14b_2.yaml b/randy/llava_14b_2.yaml
@@ -0,0 +1,63 @@
+### model
+model: /nas_train/app.e0016372/train/sft/full/LLaVA-OneVision-1.5-14B/v0-20260416-202500/checkpoint-2181
+model_type: llava_onevision1_5
+
+### method
+# debug: true
+stage: sft
+tuner_type: full
+lora_rank: 16
+lora_alpha: 32
+freeze_vit: false
+freeze_llm: false
+freeze_aligner: false
+
+### optimize
+attn_impl: flash_attn
+use_liger_kernel: false
+vit_gradient_checkpointing: true
+gradient_checkpointing_kwargs: {"use_reentrant": false}
+
+### dataset
+cached_dataset:
+  - /nas_user/app.e0016372/datasets/LLaVA-OneVision-1.5-Mid-Training-85M/cache/coyo/train
+  - /nas_user/app.e0016372/datasets/LLaVA-OneVision-1.5-Mid-Training-85M/cache/datacomp1b/train
+  - /nas_user/app.e0016372/datasets/LLaVA-OneVision-1.5-Mid-Training-85M/cache/imagenet/train
+  - /nas_user/app.e0016372/datasets/LLaVA-OneVision-1.5-Mid-Training-85M/cache/laioncn/train
+  - /nas_user/app.e0016372/datasets/LLaVA-OneVision-1.5-Mid-Training-85M/cache/mint/train
+  - /nas_user/app.e0016372/datasets/LLaVA-OneVision-1.5-Mid-Training-85M/cache/obelics/train
+  - /nas_user/app.e0016372/datasets/LLaVA-OneVision-1.5-Mid-Training-85M/cache/omniscience/train
+packing: true
+max_length: 8192
+max_pixels: 2560000
+dataloader_num_workers: 8
+dataloader_persistent_workers: true
+# split_dataset_ratio: 0.01
+# load_from_cache_file: true
+
+### output
+output_dir: /nas_train/app.e0016372/train/sft/full/LLaVA-OneVision-1.5-14B
+logging_steps: 10
+save_strategy: steps
+save_steps: 2000
+save_total_limit: 2
+report_to: tensorboard
+create_checkpoint_symlink: true
+
+### train
+per_device_train_batch_size: 2
+gradient_accumulation_steps: 1
+weight_decay: 0.0
+warmup_ratio: 0.03
+learning_rate: 1e-5
+num_train_epochs: 1.0
+lr_scheduler_type: cosine_with_min_lr
+lr_scheduler_kwargs: {"min_lr": 1e-6}
+torch_dtype: bfloat16
+deepspeed: zero3
+
+### plugins
+external_plugins:
+  - randy/plugins/model.py
+  - randy/plugins/dataset.py
+  - randy/plugins/template.py