-
Notifications
You must be signed in to change notification settings - Fork 659
feat: automatically fall back to VAE tiling when an untiled decode exceeds the backend buffer limit #1621
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
feat: automatically fall back to VAE tiling when an untiled decode exceeds the backend buffer limit #1621
Changes from all commits
ccca2d1
0d923e7
2227503
d8f723b
1f3d27a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1694,6 +1694,12 @@ struct GGMLRunner { | |
|
|
||
| ggml_context* compute_ctx = nullptr; | ||
| ggml_gallocr* compute_allocr = nullptr; | ||
| // Set true when alloc_compute_buffer() deliberately defers to tiling (the | ||
| // proactive probe found the untiled buffer exceeds the backend's max single | ||
| // buffer). Lets callers skip the "alloc compute buffer failed" ERROR on this | ||
| // expected, successfully-handled path (VAE auto-tiling); a genuine OOM from | ||
| // the real reserve still logs as before. | ||
| bool compute_buffer_deferred_to_tiling = false; | ||
|
|
||
| ggml_context* partial_offload_ctx = nullptr; | ||
| ggml_backend_buffer_t partial_runtime_params_buffer = nullptr; | ||
|
|
@@ -1710,6 +1716,12 @@ struct GGMLRunner { | |
| bool stream_layers_enabled = false; | ||
| size_t observed_max_effective_budget_ = 0; | ||
|
|
||
| // When set, alloc_compute_buffer first measures the graph's planned compute | ||
| // buffer size (no allocation) and bails before allocating if it exceeds the | ||
| // backend's max single-buffer size. Used by VAE AUTO tiling to fall back to | ||
| // tiling proactively instead of attempting (and failing) a too-large decode. | ||
| bool probe_compute_buffer_fits_ = false; | ||
|
|
||
| sd::layer_registry::LayerRegistry layer_registry_; | ||
|
|
||
| std::shared_ptr<WeightAdapter> weight_adapter = nullptr; | ||
|
|
@@ -1895,10 +1907,39 @@ struct GGMLRunner { | |
| } | ||
|
|
||
| bool alloc_compute_buffer(ggml_cgraph* gf) { | ||
| compute_buffer_deferred_to_tiling = false; | ||
| if (compute_allocr != nullptr) { | ||
| return true; | ||
| } | ||
| compute_allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(runtime_backend)); | ||
| ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(runtime_backend); | ||
|
|
||
| if (probe_compute_buffer_fits_) { | ||
| // Measure the planned compute buffer WITHOUT allocating (no_alloc | ||
| // planning) and bail before the real reserve if it exceeds the | ||
| // backend's max single-buffer size. This lets the caller (VAE AUTO | ||
| // tiling) fall back to tiling without the backend ever emitting its | ||
| // raw "allocation failed" error on the successful auto path. A | ||
| // genuine runtime OOM (planned size <= max, but the device is full) | ||
| // is NOT caught here -- it still surfaces from the real reserve | ||
| // below, so the reactive fallback remains the backstop. | ||
| size_t max_size = ggml_backend_buft_get_max_size(buft); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I believe the size calculation is needlessly including the VAE weights here. By default I get:
Fiddling with
But this may be kind of a moot point, at least on Vulkan: as far as I can tell by looking at the ggml code, the limit on Vulkan will by default be capped at 1G anyway ( |
||
| if (max_size > 0) { | ||
| ggml_gallocr* probe = ggml_gallocr_new(buft); | ||
| size_t sizes[1] = {0}; | ||
| ggml_gallocr_reserve_n_size(probe, gf, nullptr, nullptr, sizes); | ||
| ggml_gallocr_free(probe); | ||
| if (sizes[0] > max_size) { | ||
| LOG_DEBUG("%s: untiled compute buffer %.2f MB exceeds backend max single buffer %.2f MB; deferring to tiling", | ||
| get_desc().c_str(), | ||
| sizes[0] / 1024.0 / 1024.0, | ||
| max_size / 1024.0 / 1024.0); | ||
| compute_buffer_deferred_to_tiling = true; | ||
| return false; | ||
| } | ||
| } | ||
| } | ||
|
|
||
| compute_allocr = ggml_gallocr_new(buft); | ||
|
|
||
| if (!ggml_gallocr_reserve(compute_allocr, gf)) { | ||
| // failed to allocate the compute buffer | ||
|
|
@@ -2634,7 +2675,9 @@ struct GGMLRunner { | |
|
|
||
| int64_t t_alloc_begin = ggml_time_ms(); | ||
| if (!alloc_compute_buffer(gf)) { | ||
| LOG_ERROR("%s alloc compute buffer failed", get_desc().c_str()); | ||
| if (!compute_buffer_deferred_to_tiling) { | ||
| LOG_ERROR("%s alloc compute buffer failed", get_desc().c_str()); | ||
| } | ||
| if (use_partial_param_offload) { | ||
| restore_partial_params(); | ||
| } | ||
|
|
@@ -3188,7 +3231,9 @@ struct GGMLRunner { | |
| } | ||
| } | ||
| if (!alloc_compute_buffer(gf)) { | ||
| LOG_ERROR("%s alloc compute buffer failed", get_desc().c_str()); | ||
| if (!compute_buffer_deferred_to_tiling) { | ||
| LOG_ERROR("%s alloc compute buffer failed", get_desc().c_str()); | ||
| } | ||
| return std::nullopt; | ||
| } | ||
| return execute_graph<T>(gf, | ||
|
|
@@ -3224,6 +3269,14 @@ struct GGMLRunner { | |
| stream_layers_enabled = enabled; | ||
| } | ||
|
|
||
| // Turns the compute-buffer size probe on or off by storing `enabled` in | ||
| // probe_compute_buffer_fits_. While it is on, alloc_compute_buffer() measures | ||
| // the planned buffer and returns false, without allocating, when that size | ||
| // exceeds the backend's max single-buffer size (checked only if max > 0). | ||
| void set_probe_compute_buffer_fits(bool enabled) { | ||
| probe_compute_buffer_fits_ = enabled; | ||
| } | ||
|
|
||
| sd::layer_registry::LayerRegistry& get_layer_registry() { return layer_registry_; } /* non-const reference to the layer_registry_ member */ | ||
|
|
||
| ggml_backend_t get_runtime_backend() { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this kind of detailed comment would be fine if the file already had detailed comments everywhere else.
Also (just suggestions, I don't know what @leejet would prefer): maybe we could use an
`extra_tiling_args` parameter instead of a separate flag, since it'd be more useful as a workaround or for testing? And rather than a simple on/off switch, maybe we could receive a threshold override here? Say, -1 for disabling, 0 for auto, > 0 as the new limit. That way, the user could increase it if they know the device can handle it, decrease it to save VRAM for other reasons, etc. (working around the Vulkan 1G limit would be an immediate use case, too).