Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions .deepspeed_env
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
NCCL_IB_DISABLE=0
NCCL_NVLS_ENABLE=0
NCCL_IB_GID_INDEX=3
NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_8,mlx5_9,mlx5_10,mlx5_11
NCCL_SOCKET_IFNAME=bond0
PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
OMP_NUM_THREADS=1
27 changes: 27 additions & 0 deletions main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import shutil
from pathlib import Path
from omegaconf import OmegaConf
from swift import sft_main, SftArguments, export_main, ExportArguments


def parse_config(path):
    """Load a YAML config file and return it as a plain dict of arguments.

    The file is read with OmegaConf (interpolations resolved) and converted
    to a plain container, then adjusted as follows:

    * ``debug`` is removed from the config. When it is truthy,
      ``output_dir`` is redirected to a sibling directory named ``temp``
      and whatever already exists at that location is deleted.
    * Every string value that contains a comma is split into a list.
    * ``stage``, ``deepspeed`` and ``use_liger_kernel`` are dropped.

    Args:
        path: Location of the YAML config file.

    Returns:
        The adjusted config dict, ready to be unpacked as keyword arguments.
    """
    conf = OmegaConf.to_container(OmegaConf.load(path), resolve=True)
    debug_mode = conf.pop('debug', False)
    # Only existing keys are reassigned below, so mutating ``conf`` while
    # iterating over it is safe (the dict never changes size).
    for key, value in conf.items():
        if key == 'output_dir' and debug_mode:
            value = str(Path(value).with_name('temp'))
            shutil.rmtree(value, ignore_errors=True)
            # Store the redirected path; otherwise the scratch directory is
            # wiped but the run still writes into the real output_dir.
            conf[key] = value
        if isinstance(value, str) and ',' in value:
            conf[key] = value.split(',')
    # NOTE(review): these keys are presumably consumed by the launcher
    # scripts rather than by the argument classes -- confirm.
    for dropped_key in ('stage', 'deepspeed', 'use_liger_kernel'):
        conf.pop(dropped_key, None)
    return conf


if __name__ == '__main__':
    # Alternative entry point (SFT training), currently disabled:
    # conf = parse_config('randy/demo.yaml')
    # sft_main(SftArguments(**conf))

    # Active entry point: run the export described by cache_data.yaml.
    export_kwargs = parse_config('randy/cache_data.yaml')
    export_args = ExportArguments(**export_kwargs)
    export_main(export_args)
26 changes: 26 additions & 0 deletions pipeline.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#!/bin/bash

# Set KEY=VALUE in ./.deepspeed_env.
#   $1 - variable name; it is used inside a regex, so it should be a plain
#        identifier without regex metacharacters
#   $2 - value to store
# A line that already starts with "KEY=" is rewritten in place; otherwise
# "KEY=VALUE" is appended to the file.
set_env() {
    local key=$1 value=$2 escaped
    # Explicit if/else instead of "A && B || C": with the chained form a
    # failing sed would fall through to the append and leave a duplicate key.
    if grep -q "^${key}=" .deepspeed_env; then
        # Escape backslash, '&' and the '|' delimiter so that sed writes the
        # value verbatim instead of interpreting it.
        escaped=$(printf '%s' "$value" | sed 's/[\\&|]/\\&/g')
        sed -i "s|^${key}=.*|${key}=${escaped}|" .deepspeed_env
    else
        echo "${key}=${value}" >> .deepspeed_env
    fi
}

# Run the given command on every host listed in randy/hostfile.
# Lines whose first non-blank character is '#' and blank lines are ignored;
# the first field of each remaining line is taken as the host.
pdsh_run() {
    hosts=$(
        awk '!/^[[:space:]]*#/ && NF { print $1 }' randy/hostfile \
            | paste -sd,
    )
    pdsh -S -R ssh -w "$hosts" "$@"
}

# Work from the directory that contains this script so that the relative
# paths used below resolve.
SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)
if ! cd "$SCRIPT_DIR"; then
    exit 1
fi

# Run the killer script on every host before starting.
killer_script=$(realpath randy/killer.sh)
pdsh_run "bash $killer_script"

# ./train.sh randy

# Run the four openbee configs one after another.
for stage in 1 2 3 4; do
    ./train.sh "randy/openbee/llava_4b_${stage}.yaml"
done

pdsh_run "/nas_train/app.e0016372/tools/train.sh"
21 changes: 21 additions & 0 deletions randy/cache_data.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
### model
model: /nas_train/app.e0016372/models/lmms-lab/LLaVA-OneVision-1.5-4B

### method
stage: export
to_cached_dataset: true

### dataset
dataset: bee_training_data_stage1
max_pixels: 2560000
dataset_num_proc: 128
# split_dataset_ratio: 0.01

### output
output_dir: /nas_user/app.e0016372/datasets/Open-Bee/Bee-Training-Data-Stage1/cache

### plugins
external_plugins:
- randy/plugins/model.py
- randy/plugins/dataset.py
- randy/plugins/template.py
70 changes: 70 additions & 0 deletions randy/dataset_info.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
[
{
"dataset_name": "llava_pretrain_558k",
"dataset_path": "/nas_user/app.e0016372/datasets/LLaVA-Pretrain/llava_pretrain_558k.jsonl"
},
{
"dataset_name": "llava_onevision_1_5_mid_training_3m",
"dataset_path": "/nas_user/app.e0016372/datasets/LLaVA-OneVision-1.5-Mid-Training-3M/merged_all.jsonl"
},
{
"dataset_name": "llava_onevision_1_5_mid_training_coyo",
"dataset_path": "/nas_user/app.e0016372/datasets/LLaVA-OneVision-1.5-Mid-Training-85M/coyo/merged_all.jsonl"
},
{
"dataset_name": "llava_onevision_1_5_mid_training_datacomp1b",
"dataset_path": "/nas_user/app.e0016372/datasets/LLaVA-OneVision-1.5-Mid-Training-85M/datacomp1b/merged_all.jsonl"
},
{
"dataset_name": "llava_onevision_1_5_mid_training_imagenet",
"dataset_path": "/nas_user/app.e0016372/datasets/LLaVA-OneVision-1.5-Mid-Training-85M/imagenet/merged_all.jsonl"
},
{
"dataset_name": "llava_onevision_1_5_mid_training_laioncn",
"dataset_path": "/nas_user/app.e0016372/datasets/LLaVA-OneVision-1.5-Mid-Training-85M/laioncn/merged_all.jsonl"
},
{
"dataset_name": "llava_onevision_1_5_mid_training_mint",
"dataset_path": "/nas_user/app.e0016372/datasets/LLaVA-OneVision-1.5-Mid-Training-85M/mint/merged_all.jsonl"
},
{
"dataset_name": "llava_onevision_1_5_mid_training_obelics",
"dataset_path": "/nas_user/app.e0016372/datasets/LLaVA-OneVision-1.5-Mid-Training-85M/obelics/merged_all.jsonl"
},
{
"dataset_name": "omniscience",
"dataset_path": "/nas_user/app.e0016372/datasets/LLaVA-OneVision-1.5-Mid-Training-85M/omniscience/merged_all.jsonl"
},
{
"dataset_name": "llava_next_780k",
"dataset_path": "/nas_user/app.e0016372/datasets/LLaVA-NeXT-780k/llava_next_780k.jsonl"
},
{
"dataset_name": "llava_onevision_1_5_instruct_22m",
"dataset_path": "/nas_user/app.e0016372/datasets/LLaVA-OneVision-1.5-Instruct-Data/merged_all.jsonl"
},
{
"dataset_name": "finevision",
"dataset_path": "/nas_user/app.e0016372/datasets/FineVision/merged_all.jsonl"
},
{
"dataset_name": "cxmt_defects",
"dataset_path": "/nas_user/app.e0016372/datasets/cxmt/defects/defects.jsonl"
},
{
"dataset_name": "bee_training_data_stage1",
"dataset_path": "/nas_user/app.e0016372/datasets/Open-Bee/Bee-Training-Data-Stage1/merged_all.jsonl"
},
{
"dataset_name": "bee_training_data_stage2",
"dataset_path": "/nas_user/app.e0016372/datasets/Open-Bee/Bee-Training-Data-Stage2/merged_all.jsonl"
},
{
"dataset_name": "honey_data_15m",
"dataset_path": "/nas_user/app.e0016372/datasets/Open-Bee/Honey-Data-15M/merged_all.jsonl"
},
{
"dataset_name": "honey_data_1m",
"dataset_path": "/nas_user/app.e0016372/datasets/Open-Bee/Honey-Data-1M/merged_all.jsonl"
}
]
56 changes: 56 additions & 0 deletions randy/demo.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
### model
model: /nas_train/app.e0016372/models/lmms-lab/LLaVA-OneVision-1.5-4B
model_type: llava_onevision1_5

### method
debug: true
stage: sft
tuner_type: full
lora_rank: 16
lora_alpha: 32
freeze_vit: true
freeze_llm: true
freeze_aligner: false

### optimize
attn_impl: flash_attn
use_liger_kernel: false
vit_gradient_checkpointing: false
gradient_checkpointing_kwargs: {"use_reentrant": false}

### dataset
dataset: llava_pretrain_558k
packing: false
max_length: 4096
max_pixels: 2560000
dataloader_num_workers: 8
dataloader_persistent_workers: true
# split_dataset_ratio: 0.01
# load_from_cache_file: true

### output
output_dir: /nas_train/app.e0016372/train/sft/full/LLaVA-OneVision-1.5-4B
logging_steps: 10
save_strategy: steps
save_steps: 200
save_total_limit: 2
report_to: tensorboard
create_checkpoint_symlink: true

### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 1
weight_decay: 0.0
warmup_ratio: 0.03
learning_rate: 1e-4
num_train_epochs: 1.0
lr_scheduler_type: cosine_with_min_lr
lr_scheduler_kwargs: {"min_lr": 1e-6}
torch_dtype: bfloat16
deepspeed: zero2

### plugins
external_plugins:
- randy/plugins/model.py
- randy/plugins/dataset.py
- randy/plugins/template.py
12 changes: 12 additions & 0 deletions randy/hostfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
10.239.2.27 slots=8
10.239.2.26 slots=8
10.239.2.28 slots=8
10.239.2.30 slots=8
10.239.2.10 slots=8
10.239.2.11 slots=8
10.239.2.22 slots=8
10.239.2.24 slots=8
# 10.239.2.12 slots=8
# 10.239.2.29 slots=8
# 10.239.2.13 slots=8
# 10.239.2.20 slots=8
11 changes: 11 additions & 0 deletions randy/killer.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/bin/bash

# For each distinct PID that nvidia-smi reports as a compute app, look up its
# command line and force-kill it when it contains the pretrain script path.
nvidia-smi --query-compute-apps=pid --format=csv,noheader | sort -u | while read -r gpu_pid; do
    # ps prints nothing if the process is already gone; errors are discarded.
    cmdline=$(ps -p "$gpu_pid" -o args= 2>/dev/null)

    case "$cmdline" in
        *"examples/llava_ov_1_5/pretrain.py"*)
            echo "Killing PID: $gpu_pid"
            echo "$cmdline"
            sudo kill -9 "$gpu_pid"
            ;;
    esac
done
56 changes: 56 additions & 0 deletions randy/llava_14b_1.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
### model
model: /nas_train/app.e0016372/models/lmms-lab/LLaVA-OneVision-1.5-14B
model_type: llava_onevision1_5

### method
# debug: true
stage: sft
tuner_type: full
lora_rank: 16
lora_alpha: 32
freeze_vit: true
freeze_llm: true
freeze_aligner: false

### optimize
attn_impl: flash_attn
use_liger_kernel: false
vit_gradient_checkpointing: false
gradient_checkpointing_kwargs: {"use_reentrant": false}

### dataset
dataset: llava_pretrain_558k
packing: false
max_length: 4096
max_pixels: 2560000
dataloader_num_workers: 8
dataloader_persistent_workers: true
# split_dataset_ratio: 0.01
# load_from_cache_file: true

### output
output_dir: /nas_train/app.e0016372/train/sft/full/LLaVA-OneVision-1.5-14B
logging_steps: 10
save_strategy: steps
save_steps: 200
save_total_limit: 2
report_to: tensorboard
create_checkpoint_symlink: true

### train
per_device_train_batch_size: 8
gradient_accumulation_steps: 4
weight_decay: 0.0
warmup_ratio: 0.03
learning_rate: 1e-4
num_train_epochs: 1.0
lr_scheduler_type: cosine_with_min_lr
lr_scheduler_kwargs: {"min_lr": 1e-6}
torch_dtype: bfloat16
deepspeed: zero3

### plugins
external_plugins:
- randy/plugins/model.py
- randy/plugins/dataset.py
- randy/plugins/template.py
63 changes: 63 additions & 0 deletions randy/llava_14b_2.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
### model
model: /nas_train/app.e0016372/train/sft/full/LLaVA-OneVision-1.5-14B/v0-20260416-202500/checkpoint-2181
model_type: llava_onevision1_5

### method
# debug: true
stage: sft
tuner_type: full
lora_rank: 16
lora_alpha: 32
freeze_vit: false
freeze_llm: false
freeze_aligner: false

### optimize
attn_impl: flash_attn
use_liger_kernel: false
vit_gradient_checkpointing: true
gradient_checkpointing_kwargs: {"use_reentrant": false}

### dataset
cached_dataset:
- /nas_user/app.e0016372/datasets/LLaVA-OneVision-1.5-Mid-Training-85M/cache/coyo/train
- /nas_user/app.e0016372/datasets/LLaVA-OneVision-1.5-Mid-Training-85M/cache/datacomp1b/train
- /nas_user/app.e0016372/datasets/LLaVA-OneVision-1.5-Mid-Training-85M/cache/imagenet/train
- /nas_user/app.e0016372/datasets/LLaVA-OneVision-1.5-Mid-Training-85M/cache/laioncn/train
- /nas_user/app.e0016372/datasets/LLaVA-OneVision-1.5-Mid-Training-85M/cache/mint/train
- /nas_user/app.e0016372/datasets/LLaVA-OneVision-1.5-Mid-Training-85M/cache/obelics/train
- /nas_user/app.e0016372/datasets/LLaVA-OneVision-1.5-Mid-Training-85M/cache/omniscience/train
packing: true
max_length: 8192
max_pixels: 2560000
dataloader_num_workers: 8
dataloader_persistent_workers: true
# split_dataset_ratio: 0.01
# load_from_cache_file: true

### output
output_dir: /nas_train/app.e0016372/train/sft/full/LLaVA-OneVision-1.5-14B
logging_steps: 10
save_strategy: steps
save_steps: 2000
save_total_limit: 2
report_to: tensorboard
create_checkpoint_symlink: true

### train
per_device_train_batch_size: 2
gradient_accumulation_steps: 1
weight_decay: 0.0
warmup_ratio: 0.03
learning_rate: 1e-5
num_train_epochs: 1.0
lr_scheduler_type: cosine_with_min_lr
lr_scheduler_kwargs: {"min_lr": 1e-6}
torch_dtype: bfloat16
deepspeed: zero3

### plugins
external_plugins:
- randy/plugins/model.py
- randy/plugins/dataset.py
- randy/plugins/template.py
Loading
Loading