diff --git a/docs/comparison/orig_cat.png b/docs/comparison/orig_cat.png new file mode 100644 index 000000000..71c1d2c1e Binary files /dev/null and b/docs/comparison/orig_cat.png differ diff --git a/docs/comparison/orig_garden.png b/docs/comparison/orig_garden.png new file mode 100644 index 000000000..100d91293 Binary files /dev/null and b/docs/comparison/orig_garden.png differ diff --git a/docs/comparison/orig_phonograph.png b/docs/comparison/orig_phonograph.png new file mode 100644 index 000000000..5d62e14ce Binary files /dev/null and b/docs/comparison/orig_phonograph.png differ diff --git a/docs/comparison/rmse1pct_cat.png b/docs/comparison/rmse1pct_cat.png new file mode 100644 index 000000000..965bc96e4 Binary files /dev/null and b/docs/comparison/rmse1pct_cat.png differ diff --git a/docs/comparison/rmse1pct_garden.png b/docs/comparison/rmse1pct_garden.png new file mode 100644 index 000000000..1a67720a3 Binary files /dev/null and b/docs/comparison/rmse1pct_garden.png differ diff --git a/docs/comparison/rmse1pct_phonograph.png b/docs/comparison/rmse1pct_phonograph.png new file mode 100644 index 000000000..955a67f1a Binary files /dev/null and b/docs/comparison/rmse1pct_phonograph.png differ diff --git a/docs/comparison/rmse3pct_cat.png b/docs/comparison/rmse3pct_cat.png new file mode 100644 index 000000000..e5457f59b Binary files /dev/null and b/docs/comparison/rmse3pct_cat.png differ diff --git a/docs/comparison/rmse3pct_garden.png b/docs/comparison/rmse3pct_garden.png new file mode 100644 index 000000000..9851abae8 Binary files /dev/null and b/docs/comparison/rmse3pct_garden.png differ diff --git a/docs/comparison/rmse3pct_phonograph.png b/docs/comparison/rmse3pct_phonograph.png new file mode 100644 index 000000000..4cca97011 Binary files /dev/null and b/docs/comparison/rmse3pct_phonograph.png differ diff --git a/docs/comparison/rmse6pct_cat.png b/docs/comparison/rmse6pct_cat.png new file mode 100644 index 000000000..ac00b9801 Binary files /dev/null and b/docs/comparison/rmse6pct_cat.png differ diff --git a/docs/comparison/rmse6pct_garden.png b/docs/comparison/rmse6pct_garden.png new file mode 100644 index 000000000..489255f5e Binary files /dev/null and b/docs/comparison/rmse6pct_garden.png differ diff --git a/docs/comparison/rmse6pct_phonograph.png b/docs/comparison/rmse6pct_phonograph.png new file mode 100644 index 000000000..a8c5b956e Binary files /dev/null and b/docs/comparison/rmse6pct_phonograph.png differ diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index dc5013b3e..7689b5d5d 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -41,6 +41,7 @@ struct SDCliParams { bool verbose = false; bool canny_preprocess = false; bool convert_name = false; + float rmse_threshold = 0.0f; preview_t preview_method = PREVIEW_NONE; int preview_interval = 1; @@ -88,6 +89,16 @@ struct SDCliParams { &output_begin_idx}, }; + options.float_options = { + {"", + "--rmse", + "maximum relative RMSE per tensor for auto mixed-precision quantization in convert mode " + "(e.g. 0.03 = 3%). Sweeps from coarsest to finest quant type and picks the coarsest " + "that stays under this budget. --type sets the quality ceiling (default: f16). " + "Explicit --tensor-type-rules take priority over the RMSE sweep.", + &rmse_threshold}, + }; + options.bool_options = { {"", "--canny", @@ -601,23 +612,16 @@ int main(int argc, const char* argv[]) { LOG_DEBUG("%s", gen_params.to_string().c_str()); if (cli_params.mode == CONVERT) { - bool success = convert(ctx_params.model_path.c_str(), - ctx_params.vae_path.c_str(), + sd_ctx_params_t sd_params = ctx_params.to_sd_ctx_params_t(false, false, false); + bool success = convert(&sd_params, cli_params.output_path.c_str(), - ctx_params.wtype, - ctx_params.tensor_type_rules.c_str(), - cli_params.convert_name); + cli_params.convert_name, + cli_params.rmse_threshold); if (!success) { - LOG_ERROR("convert '%s'/'%s' to '%s' failed", - ctx_params.model_path.c_str(), - ctx_params.vae_path.c_str(), - cli_params.output_path.c_str()); + LOG_ERROR("convert to '%s' failed", cli_params.output_path.c_str()); return 1; } else { - LOG_INFO("convert '%s'/'%s' to '%s' success", - ctx_params.model_path.c_str(), - ctx_params.vae_path.c_str(), - cli_params.output_path.c_str()); + LOG_INFO("convert to '%s' success", cli_params.output_path.c_str()); return 0; } } diff --git a/examples/common/common.cpp b/examples/common/common.cpp index 519e8aae6..b2793d4e7 100644 --- a/examples/common/common.cpp +++ b/examples/common/common.cpp @@ -430,6 +430,10 @@ ArgOptions SDContextParams::get_options() { "--mmap", "whether to memory-map model", true, &enable_mmap}, + {"-ll", + "--lazy-load", + "staged loading: evict text encoders from RAM after encoding, diffusion model after sampling, VAE after decoding (forces --mmap on)", + true, &lazy_loading}, {"", "--control-net-cpu", "keep controlnet in cpu (for low vram)", @@ -697,6 +701,7 @@ std::string SDContextParams::to_string() const { << " backend: \"" << backend << "\",\n" << " params_backend: \"" << params_backend << "\",\n" << " enable_mmap: " << (enable_mmap ? "true" : "false") << ",\n" + << " lazy_loading: " << (lazy_loading ? "true" : "false") << ",\n" << " control_net_cpu: " << (control_net_cpu ? "true" : "false") << ",\n" << " clip_on_cpu: " << (clip_on_cpu ? "true" : "false") << ",\n" << " vae_on_cpu: " << (vae_on_cpu ? "true" : "false") << ",\n" @@ -773,6 +778,7 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool f chroma_t5_mask_pad, qwen_image_zero_cond_t, max_vram, + lazy_loading, backend.c_str(), params_backend.c_str(), }; diff --git a/examples/common/common.h b/examples/common/common.h index ca367f7ee..061a2c5d5 100644 --- a/examples/common/common.h +++ b/examples/common/common.h @@ -115,6 +115,7 @@ struct SDContextParams { std::string backend; std::string params_backend; bool enable_mmap = false; + bool lazy_loading = false; bool control_net_cpu = false; bool clip_on_cpu = false; bool vae_on_cpu = false; diff --git a/include/stable-diffusion.h b/include/stable-diffusion.h index f8b2c2f59..3d35e03f5 100644 --- a/include/stable-diffusion.h +++ b/include/stable-diffusion.h @@ -213,6 +213,7 @@ typedef struct { int chroma_t5_mask_pad; bool qwen_image_zero_cond_t; float max_vram; // GiB budget for graph-cut segmented param offload (0 = disabled, -1 = auto free VRAM minus 1 GiB) + bool lazy_loading; // staged load: encode text, evict text encoder, load diffusion, evict, load VAE, decode const char* backend; const char* params_backend; } sd_ctx_params_t; @@ -463,12 +464,10 @@ SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx, SD_API int get_upscale_factor(upscaler_ctx_t* upscaler_ctx); -SD_API bool convert(const char* input_path, - const char* vae_path, +SD_API bool convert(const sd_ctx_params_t* params, const char* output_path, - enum sd_type_t output_type, - const char* tensor_type_rules, - bool convert_name); + bool convert_name, + float rmse_threshold); SD_API bool preprocess_canny(sd_image_t image, float high_threshold, diff --git a/src/conditioner.hpp b/src/conditioner.hpp index f08feeef7..44aa5a6db 100644 --- a/src/conditioner.hpp +++ b/src/conditioner.hpp @@ -115,6 +115,7 @@ struct Conditioner { const ConditionerParams& conditioner_params) = 0; virtual void alloc_params_buffer() = 0; virtual void free_params_buffer() = 0; + virtual void free_compute_buffer() {} virtual void get_param_tensors(std::map& tensors) = 0; virtual size_t get_params_buffer_size() = 0; virtual void set_max_graph_vram_bytes(size_t max_vram_bytes) {} @@ -805,6 +806,18 @@ struct SD3CLIPEmbedder : public Conditioner { } } + void free_compute_buffer() override { + if (clip_l) { + clip_l->free_compute_buffer(); + } + if (clip_g) { + clip_g->free_compute_buffer(); + } + if (t5) { + t5->free_compute_buffer(); + } + } + size_t get_params_buffer_size() override { size_t buffer_size = 0; if (clip_l) { diff --git a/src/convert.cpp b/src/convert.cpp index 7cae8df0f..4057bd7a5 100644 --- a/src/convert.cpp +++ b/src/convert.cpp @@ -1,138 +1,548 @@ -#include -#include -#include -#include - -#include "model.h" -#include "model_io/gguf_io.h" -#include "model_io/safetensors_io.h" -#include "util.h" - -#include "ggml-cpu.h" - -static ggml_type get_export_tensor_type(ModelLoader& model_loader, - const TensorStorage& tensor_storage, - ggml_type type, - const TensorTypeRules& tensor_type_rules) { - const std::string& name = tensor_storage.name; - ggml_type tensor_type = tensor_storage.type; - ggml_type dst_type = type; - - for (const auto& tensor_type_rule : tensor_type_rules) { - std::regex pattern(tensor_type_rule.first); - if (std::regex_search(name, pattern)) { - dst_type = tensor_type_rule.second; - break; - } - } - - if (model_loader.tensor_should_be_converted(tensor_storage, dst_type)) { - tensor_type = dst_type; - } - - return tensor_type; -} - -static bool load_tensors_for_export(ModelLoader& model_loader, - ggml_context* ggml_ctx, - ggml_type type, - const TensorTypeRules& tensor_type_rules, - std::vector& tensors) { - std::mutex tensor_mutex; - auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool { - const std::string& name = tensor_storage.name; - ggml_type tensor_type = get_export_tensor_type(model_loader, tensor_storage, type, tensor_type_rules); - - std::lock_guard lock(tensor_mutex); - ggml_tensor* tensor = ggml_new_tensor(ggml_ctx, tensor_type, tensor_storage.n_dims, tensor_storage.ne); - if (tensor == nullptr) { - LOG_ERROR("ggml_new_tensor failed"); - return false; - } - ggml_set_name(tensor, name.c_str()); - - if (!tensor->data) { - GGML_ASSERT(ggml_nelements(tensor) == 0); - // Avoid crashing writers by setting a dummy pointer for zero-sized tensors. - LOG_DEBUG("setting dummy pointer for zero-sized tensor %s", name.c_str()); - tensor->data = ggml_get_mem_buffer(ggml_ctx); - } - - TensorWriteInfo write_info; - write_info.tensor = tensor; - write_info.n_dims = tensor_storage.n_dims; - for (int i = 0; i < tensor_storage.n_dims; ++i) { - write_info.ne[i] = tensor_storage.ne[i]; - } - - *dst_tensor = tensor; - tensors.push_back(std::move(write_info)); - - return true; - }; - - bool success = model_loader.load_tensors(on_new_tensor_cb); - LOG_INFO("load tensors done"); - return success; -} - -bool convert(const char* input_path, - const char* vae_path, - const char* output_path, - sd_type_t output_type, - const char* tensor_type_rules, - bool convert_name) { - ModelLoader model_loader; - - if (!model_loader.init_from_file(input_path)) { - LOG_ERROR("init model loader from file failed: '%s'", input_path); - return false; - } - - if (vae_path != nullptr && strlen(vae_path) > 0) { - if (!model_loader.init_from_file(vae_path, "vae.")) { - LOG_ERROR("init model loader from file failed: '%s'", vae_path); - return false; - } - } - if (convert_name) { - model_loader.convert_tensors_name(); - } - - ggml_type type = (ggml_type)output_type; - bool output_is_safetensors = ends_with(output_path, ".safetensors"); - TensorTypeRules type_rules = parse_tensor_type_rules(tensor_type_rules); - - auto backend = ggml_backend_cpu_init(); - size_t mem_size = 1 * 1024 * 1024; // for padding - mem_size += model_loader.get_tensor_storage_map().size() * ggml_tensor_overhead(); - mem_size += model_loader.get_params_mem_size(backend, type); - LOG_INFO("model tensors mem size: %.2fMB", mem_size / 1024.f / 1024.f); - ggml_context* ggml_ctx = ggml_init({mem_size, nullptr, false}); - - if (ggml_ctx == nullptr) { - LOG_ERROR("ggml_init failed for converter"); - ggml_backend_free(backend); - return false; - } - - std::vector tensors; - bool success = load_tensors_for_export(model_loader, ggml_ctx, type, type_rules, tensors); - ggml_backend_free(backend); - - std::string error; - if (success) { - if (output_is_safetensors) { - success = write_safetensors_file(output_path, tensors, &error); - } else { - success = write_gguf_file(output_path, tensors, &error); - } - } - - if (!success && !error.empty()) { - LOG_ERROR("%s", error.c_str()); - } - - ggml_free(ggml_ctx); - return success; -} +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "model.h" +#include "model_io/gguf_io.h" +#include "model_io/safetensors_io.h" +#include "stable-diffusion.h" +#include "util.h" + +#include "ggml-cpu.h" +#include "gguf.h" + +#ifndef SAFE_STR +#define SAFE_STR(s) ((s) ? (s) : "") +#endif + +// Candidate types for RMSE sweep, ordered coarsest to finest. +// find_best_type_for_rmse stops at the ceiling type, so order matters. +static const ggml_type RMSE_CANDIDATES[] = { + GGML_TYPE_Q2_K, + GGML_TYPE_Q3_K, + GGML_TYPE_IQ4_NL, + GGML_TYPE_Q4_K, + GGML_TYPE_Q5_K, + GGML_TYPE_Q6_K, + GGML_TYPE_Q8_0, + GGML_TYPE_F16, +}; +static const int N_RMSE_CANDIDATES = (int)(sizeof(RMSE_CANDIDATES) / sizeof(RMSE_CANDIDATES[0])); + +// Returns ||orig - recon||_2 / ||orig||_2, i.e. relative RMSE. +static float compute_relative_rmse(const float* orig, const float* recon, int64_t n) { + double sum_orig_sq = 0.0, sum_delta_sq = 0.0; + for (int64_t i = 0; i < n; i++) { + sum_orig_sq += (double)orig[i] * orig[i]; + double d = (double)orig[i] - (double)recon[i]; + sum_delta_sq += d * d; + } + if (sum_orig_sq == 0.0) return 0.0f; + return (float)std::sqrt(sum_delta_sq / sum_orig_sq); +} + +// Sweep RMSE_CANDIDATES from coarsest to finest up to ceiling_type. +// Returns the coarsest type whose relative RMSE <= threshold. +// If no candidate meets the threshold, returns ceiling_type. +static ggml_type find_best_type_for_rmse(const float* data, + int64_t nrows, + int64_t n_per_row, + ggml_type ceiling_type, + float threshold) { + int64_t n = nrows * n_per_row; + std::vector quant_buf; + std::vector recon(n); + std::vector imatrix(n_per_row, 1.0f); + + for (int ci = 0; ci < N_RMSE_CANDIDATES; ci++) { + ggml_type ctype = RMSE_CANDIDATES[ci]; + + // Skip candidates coarser than the ceiling (shouldn't happen with ordered list, + // but guard in case ceiling is before the end of the array). + bool at_ceiling = (ctype == ceiling_type); + + if (ggml_is_quantized(ctype) && n_per_row % ggml_blck_size(ctype) != 0) { + if (at_ceiling) return ceiling_type; + continue; + } + + size_t qsize; + if (ctype == GGML_TYPE_F16) { + qsize = (size_t)n * sizeof(ggml_fp16_t); + } else { + qsize = (size_t)nrows * ggml_row_size(ctype, n_per_row); + } + quant_buf.resize(qsize); + + if (ctype == GGML_TYPE_F16) { + ggml_fp32_to_fp16_row(data, (ggml_fp16_t*)quant_buf.data(), n); + ggml_fp16_to_fp32_row((ggml_fp16_t*)quant_buf.data(), recon.data(), n); + } else { + const ggml_type_traits* traits = ggml_get_type_traits(ctype); + if (traits->to_float == nullptr) { + if (at_ceiling) return ceiling_type; + continue; + } + ggml_quantize_chunk(ctype, data, quant_buf.data(), 0, nrows, n_per_row, imatrix.data()); + traits->to_float(quant_buf.data(), recon.data(), n); + } + + float rmse = compute_relative_rmse(data, recon.data(), n); + if (rmse <= threshold) return ctype; + if (at_ceiling) return ceiling_type; + } + + return ceiling_type; +} + +// ─── Normal (non-RMSE) export path ──────────────────────────────────────────── + +static ggml_type get_export_tensor_type(ModelLoader& model_loader, + const TensorStorage& tensor_storage, + ggml_type type, + const TensorTypeRules& tensor_type_rules) { + const std::string& name = tensor_storage.name; + ggml_type tensor_type = tensor_storage.type; + ggml_type dst_type = type; + + for (const auto& rule : tensor_type_rules) { + std::regex pattern(rule.first); + if (std::regex_search(name, pattern)) { + dst_type = rule.second; + break; + } + } + + if (model_loader.tensor_should_be_converted(tensor_storage, dst_type)) { + tensor_type = dst_type; + } + + return tensor_type; +} + +static bool load_tensors_for_export(ModelLoader& model_loader, + ggml_context* ggml_ctx, + ggml_type type, + const TensorTypeRules& tensor_type_rules, + std::vector& tensors) { + std::mutex tensor_mutex; + auto on_new_tensor_cb = [&](const TensorStorage& ts, ggml_tensor** dst_tensor) -> bool { + ggml_type tensor_type = get_export_tensor_type(model_loader, ts, type, tensor_type_rules); + + std::lock_guard lock(tensor_mutex); + ggml_tensor* tensor = ggml_new_tensor(ggml_ctx, tensor_type, ts.n_dims, ts.ne); + if (tensor == nullptr) { + LOG_ERROR("ggml_new_tensor failed"); + return false; + } + ggml_set_name(tensor, ts.name.c_str()); + + if (!tensor->data) { + GGML_ASSERT(ggml_nelements(tensor) == 0); + tensor->data = ggml_get_mem_buffer(ggml_ctx); + } + + TensorWriteInfo wi; + wi.tensor = tensor; + wi.n_dims = ts.n_dims; + for (int i = 0; i < ts.n_dims; ++i) wi.ne[i] = ts.ne[i]; + + *dst_tensor = tensor; + tensors.push_back(std::move(wi)); + return true; + }; + + bool success = model_loader.load_tensors(on_new_tensor_cb); + LOG_INFO("load tensors done"); + return success; +} + +// ─── RMSE export path (streaming two-pass, low RAM) ────────────────────────── +// +// Pass 1: enumerate active tensors (null-dst callback, no data loaded), then +// for each tensor: load as f32 → RMSE sweep → record target type → free. +// Peak RAM = f32 size of the single largest tensor. +// +// Pass 2: write GGUF header (only_meta), then for each tensor: load as f32 → +// quantize → write bytes at the correct file offset → free. +// Same tiny peak RAM. + +static ggml_type pick_target_type(const TensorStorage& ts, + ModelLoader& model_loader, + const TensorTypeRules& tensor_type_rules, + ggml_type ceiling_type, + float rmse_threshold, + const float* data, + int64_t n_per_row, + int64_t nrows) { + for (const auto& rule : tensor_type_rules) { + std::regex pattern(rule.first); + if (std::regex_search(ts.name, pattern)) return rule.second; + } + + if (ceiling_type != GGML_TYPE_COUNT && + model_loader.tensor_should_be_converted(ts, ceiling_type)) { + if (data != nullptr) { + return find_best_type_for_rmse(data, nrows, n_per_row, ceiling_type, rmse_threshold); + } + return ceiling_type; + } + return GGML_TYPE_F16; +} + +static bool convert_rmse_streaming(ModelLoader& model_loader, + ggml_type ceiling_type, + const TensorTypeRules& tensor_type_rules, + float rmse_threshold, + const std::string& output_path, + std::string* error) { + // Step 1: collect active tensor list without loading any data. + // load_tensors uses a thread pool, so the callback is called concurrently. + std::vector active; + { + std::mutex active_mtx; + auto cb = [&](const TensorStorage& ts, ggml_tensor** dst) -> bool { + std::lock_guard lk(active_mtx); + active.push_back(ts); + *dst = nullptr; + return true; + }; + if (!model_loader.load_tensors(cb)) { + if (error) *error = "failed to enumerate tensors"; + return false; + } + } + LOG_INFO("RMSE sweep: %zu active tensors", active.size()); + + // Step 2: type sweep — one tensor in RAM at a time. + std::vector target_types(active.size(), GGML_TYPE_F16); + std::vector f32_buf; + + int n_threads = std::max(1, (int)sd_get_num_physical_cores()); + std::atomic next_idx{0}; + std::mutex type_write_mutex; + std::vector workers; + workers.reserve(n_threads); + + // Worker threads each load their own tensor independently (different file offsets). + auto worker_fn = [&]() { + std::vector local_buf; + size_t i; + while ((i = next_idx.fetch_add(1, std::memory_order_relaxed)) < active.size()) { + const TensorStorage& ts = active[i]; + int64_t n = ts.nelements(); + int64_t n_per_row = ts.ne[0]; + int64_t nrows = n / std::max(n_per_row, (int64_t)1); + + ggml_type ttype; + if (n == 0 || !model_loader.tensor_should_be_converted(ts, ceiling_type)) { + // Skip RMSE sweep for non-weight tensors; pick_target_type handles rules/F16. + ttype = pick_target_type(ts, model_loader, tensor_type_rules, + ceiling_type, rmse_threshold, nullptr, n_per_row, nrows); + } else { + if (!model_loader.load_tensor_f32(ts, local_buf)) { + LOG_WARN("RMSE sweep: failed to load '%s', defaulting to f16", ts.name.c_str()); + ttype = GGML_TYPE_F16; + } else { + ttype = pick_target_type(ts, model_loader, tensor_type_rules, + ceiling_type, rmse_threshold, + local_buf.data(), n_per_row, nrows); + } + } + std::lock_guard lk(type_write_mutex); + target_types[i] = ttype; + } + }; + + for (int t = 0; t < n_threads; t++) workers.emplace_back(worker_fn); + for (auto& w : workers) w.join(); + LOG_INFO("RMSE sweep: type selection done on %d threads", n_threads); + + // Step 3: build GGUF header (no_alloc — metadata only, no tensor data in RAM). + size_t meta_mem = 1 * 1024 * 1024 + active.size() * ggml_tensor_overhead(); + ggml_context* meta_ctx = ggml_init({meta_mem, nullptr, true}); + if (!meta_ctx) { + if (error) *error = "ggml_init failed for meta context"; + return false; + } + + gguf_context* gguf_ctx = gguf_init_empty(); + if (!gguf_ctx) { + ggml_free(meta_ctx); + if (error) *error = "gguf_init_empty failed"; + return false; + } + + std::map type_tensor_count; + std::map type_byte_size; + size_t total_f16_bytes = 0; + + for (size_t i = 0; i < active.size(); i++) { + const TensorStorage& ts = active[i]; + ggml_type ttype = target_types[i]; + int64_t n = ts.nelements(); + + ggml_tensor* t = ggml_new_tensor(meta_ctx, ttype, ts.n_dims, ts.ne); + if (!t) { + gguf_free(gguf_ctx); + ggml_free(meta_ctx); + if (error) *error = "ggml_new_tensor failed for '" + ts.name + "'"; + return false; + } + ggml_set_name(t, ts.name.c_str()); + gguf_add_tensor(gguf_ctx, t); + + if (n > 0) { + total_f16_bytes += (size_t)n * sizeof(ggml_fp16_t); + size_t out_bytes; + if (ttype == GGML_TYPE_F32) { + out_bytes = (size_t)n * sizeof(float); + } else if (ttype == GGML_TYPE_F16) { + out_bytes = (size_t)n * sizeof(ggml_fp16_t); + } else { + int64_t n_per_row = ts.ne[0]; + int64_t nrows = n / n_per_row; + out_bytes = (size_t)nrows * ggml_row_size(ttype, n_per_row); + } + type_tensor_count[ttype]++; + type_byte_size[ttype] += out_bytes; + } + } + + // Print summary table. + size_t total_out_bytes = 0; + for (auto& [t, b] : type_byte_size) total_out_bytes += b; + LOG_INFO("---- RMSE mixed-quant summary (threshold %.1f%%) ----", rmse_threshold * 100.0f); + LOG_INFO(" %-12s %8s %10s %6s", "type", "tensors", "size (MB)", "share"); + for (auto& [t, count] : type_tensor_count) { + size_t mb = type_byte_size[t] / (1024 * 1024); + float pct = total_out_bytes > 0 + ? (float)type_byte_size[t] * 100.0f / (float)total_out_bytes + : 0.0f; + LOG_INFO(" %-12s %8d %10zu %5.1f%%", ggml_type_name(t), count, mb, pct); + } + float ratio = total_f16_bytes > 0 ? (float)total_out_bytes / (float)total_f16_bytes : 1.0f; + LOG_INFO(" total output: %.0f MB (%.1fx vs flat f16 / %.0f MB)", + (float)total_out_bytes / (1024.0f * 1024.0f), + ratio, + (float)total_f16_bytes / (1024.0f * 1024.0f)); + LOG_INFO("----------------------------------------------------"); + + // Step 4: write GGUF header (only_meta=true). + LOG_INFO("writing GGUF to %s", output_path.c_str()); + FILE* f = fopen(output_path.c_str(), "wb"); + if (!f) { + gguf_free(gguf_ctx); + ggml_free(meta_ctx); + if (error) *error = "failed to open output file '" + output_path + "'"; + return false; + } + if (!gguf_write_to_file_ptr(gguf_ctx, f, true)) { + fclose(f); + gguf_free(gguf_ctx); + ggml_free(meta_ctx); + if (error) *error = "gguf_write_to_file_ptr (header) failed"; + return false; + } + + // Step 5: streaming quantize + write tensor data, one tensor at a time. + // gguf_get_data_offset returns ctx->offset which is only valid when reading. + // For newly created contexts use gguf_get_meta_size which matches what was just written. + size_t data_start = gguf_get_meta_size(gguf_ctx); + int64_t n_tensors = gguf_get_n_tensors(gguf_ctx); + + bool write_ok = true; + std::vector quant_buf; + + for (int64_t gi = 0; gi < n_tensors && write_ok; gi++) { + const TensorStorage& ts = active[(size_t)gi]; + ggml_type ttype = target_types[(size_t)gi]; + int64_t n = ts.nelements(); + + // Seek to the exact offset GGUF expects for this tensor. + // Gaps are filled with zeros by the OS; no manual padding needed. + size_t expected = data_start + gguf_get_tensor_offset(gguf_ctx, gi); + if (fseeko(f, (off_t)expected, SEEK_SET) != 0) { + if (error) *error = "fseeko failed for tensor '" + ts.name + "'"; + write_ok = false; + break; + } + + if (n == 0) continue; + + if (!model_loader.load_tensor_f32(ts, f32_buf)) { + if (error) *error = "failed to load tensor '" + ts.name + "' in write pass"; + write_ok = false; + break; + } + + const float* data = f32_buf.data(); + int64_t n_per_row = ts.ne[0]; + int64_t nrows = n / n_per_row; + + size_t out_bytes; + const void* write_ptr; + + if (ttype == GGML_TYPE_F32) { + out_bytes = (size_t)n * sizeof(float); + write_ptr = data; + } else if (ttype == GGML_TYPE_F16) { + out_bytes = (size_t)n * sizeof(ggml_fp16_t); + quant_buf.resize(out_bytes); + ggml_fp32_to_fp16_row(data, (ggml_fp16_t*)quant_buf.data(), n); + write_ptr = quant_buf.data(); + } else { + out_bytes = (size_t)nrows * ggml_row_size(ttype, n_per_row); + quant_buf.resize(out_bytes); + std::vector imatrix(n_per_row, 1.0f); + ggml_quantize_chunk(ttype, data, quant_buf.data(), 0, nrows, n_per_row, imatrix.data()); + write_ptr = quant_buf.data(); + } + + if (fwrite(write_ptr, 1, out_bytes, f) != out_bytes) { + if (error) *error = "fwrite failed for tensor '" + ts.name + "'"; + write_ok = false; + break; + } + } + + fclose(f); + gguf_free(gguf_ctx); + ggml_free(meta_ctx); + return write_ok; +} + +// ─── Public entry point ─────────────────────────────────────────────────────── + +bool convert(const sd_ctx_params_t* params, + const char* output_path, + bool convert_name, + float rmse_threshold) { + ModelLoader model_loader; + + if (strlen(SAFE_STR(params->model_path)) > 0) { + if (!model_loader.init_from_file(params->model_path)) { + LOG_ERROR("init model loader from file failed: '%s'", params->model_path); + return false; + } + } + + if (strlen(SAFE_STR(params->diffusion_model_path)) > 0) { + if (!model_loader.init_from_file(params->diffusion_model_path, "model.diffusion_model.")) { + LOG_ERROR("init model loader from file failed: '%s'", params->diffusion_model_path); + return false; + } + } + + bool is_unet = sd_version_is_unet(model_loader.get_sd_version()); + + if (strlen(SAFE_STR(params->clip_l_path)) > 0) { + std::string prefix = is_unet ? "cond_stage_model.transformer." : "text_encoders.clip_l.transformer."; + if (!model_loader.init_from_file(params->clip_l_path, prefix)) { + LOG_ERROR("init model loader from file failed: '%s'", params->clip_l_path); + return false; + } + } + + if (strlen(SAFE_STR(params->clip_g_path)) > 0) { + std::string prefix = is_unet ? "cond_stage_model.1.transformer." : "text_encoders.clip_g.transformer."; + if (!model_loader.init_from_file(params->clip_g_path, prefix)) { + LOG_ERROR("init model loader from file failed: '%s'", params->clip_g_path); + return false; + } + } + + if (strlen(SAFE_STR(params->t5xxl_path)) > 0) { + if (!model_loader.init_from_file(params->t5xxl_path, "text_encoders.t5xxl.transformer.")) { + LOG_ERROR("init model loader from file failed: '%s'", params->t5xxl_path); + return false; + } + } + + if (strlen(SAFE_STR(params->llm_path)) > 0) { + if (!model_loader.init_from_file(params->llm_path, "text_encoders.llm.")) { + LOG_ERROR("init model loader from file failed: '%s'", params->llm_path); + return false; + } + } + + if (strlen(SAFE_STR(params->llm_vision_path)) > 0) { + if (!model_loader.init_from_file(params->llm_vision_path, "text_encoders.llm.visual.")) { + LOG_ERROR("init model loader from file failed: '%s'", params->llm_vision_path); + return false; + } + } + + if (strlen(SAFE_STR(params->vae_path)) > 0) { + if (!model_loader.init_from_file(params->vae_path, "vae.")) { + LOG_ERROR("init model loader from file failed: '%s'", params->vae_path); + return false; + } + } + + if (convert_name) { + model_loader.convert_tensors_name(); + } + + // When --type is not given and RMSE mode is active, default ceiling to f16. + ggml_type ceiling_type = (params->wtype != SD_TYPE_COUNT) + ? (ggml_type)params->wtype + : (rmse_threshold > 0.0f ? GGML_TYPE_F16 : GGML_TYPE_COUNT); + + bool output_is_safetensors = ends_with(output_path, ".safetensors"); + TensorTypeRules type_rules = parse_tensor_type_rules(SAFE_STR(params->tensor_type_rules)); + + auto backend = ggml_backend_cpu_init(); + bool success = false; + std::string error; + + if (rmse_threshold > 0.0f) { + // ── RMSE path (streaming, low RAM) ──────────────────────────────────── + // Two-pass: type sweep then quantize+write, one tensor in RAM at a time. + ggml_backend_free(backend); + if (output_is_safetensors) { + LOG_ERROR("RMSE streaming mode does not support safetensors output; use .gguf"); + return false; + } + success = convert_rmse_streaming(model_loader, ceiling_type, type_rules, + rmse_threshold, output_path, &error); + } else { + // ── Normal path ──────────────────────────────────────────────────────── + size_t mem_size = 1 * 1024 * 1024; + mem_size += model_loader.get_tensor_storage_map().size() * ggml_tensor_overhead(); + mem_size += model_loader.get_params_mem_size(backend, ceiling_type); + + ggml_context* ggml_ctx = ggml_init({mem_size, nullptr, false}); + if (!ggml_ctx) { + LOG_ERROR("ggml_init failed for converter"); + ggml_backend_free(backend); + return false; + } + + std::vector tensors; + success = load_tensors_for_export(model_loader, ggml_ctx, ceiling_type, type_rules, tensors); + ggml_backend_free(backend); + + if (success) { + if (output_is_safetensors) { + success = write_safetensors_file(output_path, tensors, &error); + } else { + success = write_gguf_file(output_path, tensors, &error); + } + } + + ggml_free(ggml_ctx); + } + + if (!success && !error.empty()) { + LOG_ERROR("%s", error.c_str()); + } + + return success; +} diff --git a/src/model.cpp b/src/model.cpp index 25d78b94e..7eec20667 100644 --- a/src/model.cpp +++ b/src/model.cpp @@ -202,6 +202,91 @@ void convert_tensor(void* src, } } +bool ModelLoader::load_tensor_f32(const TensorStorage& ts, std::vector& out) { + if (ts.file_index >= file_paths_.size()) { + LOG_ERROR("load_tensor_f32: invalid file_index %zu for '%s'", ts.file_index, ts.name.c_str()); + return false; + } + + int64_t n = ts.nelements(); + if (n == 0) { + out.clear(); + return true; + } + out.resize(n); + + size_t nbytes_to_read = ts.nbytes_to_read(); + std::vector raw(nbytes_to_read); + const std::string& file_path = file_paths_[ts.file_index]; + + if (ts.index_in_zip >= 0) { + zip_t* z = zip_open(file_path.c_str(), 0, 'r'); + if (!z) { + LOG_ERROR("load_tensor_f32: failed to open zip '%s'", file_path.c_str()); + return false; + } + zip_entry_openbyindex(z, ts.index_in_zip); + size_t entry_size = zip_entry_size(z); + if (entry_size != nbytes_to_read) { + std::vector entry_buf(entry_size); + zip_entry_noallocread(z, entry_buf.data(), entry_size); + memcpy(raw.data(), entry_buf.data() + ts.offset, nbytes_to_read); + } else { + zip_entry_noallocread(z, raw.data(), nbytes_to_read); + } + zip_entry_close(z); + zip_close(z); + } else { + std::ifstream file(file_path, std::ios::binary); + if (!file) { + LOG_ERROR("load_tensor_f32: failed to open '%s'", file_path.c_str()); + return false; + } + file.seekg(ts.offset); + file.read(reinterpret_cast(raw.data()), (std::streamsize)nbytes_to_read); + if (!file) { + LOG_ERROR("load_tensor_f32: read failed for '%s' in '%s'", ts.name.c_str(), file_path.c_str()); + return false; + } + } + + void* src = raw.data(); + ggml_type src_type = ts.type; + std::vector intermediate; + + if (ts.is_f8_e4m3) { + intermediate.resize((size_t)n * sizeof(ggml_fp16_t)); + f8_e4m3_to_f16_vec(raw.data(), (uint16_t*)intermediate.data(), n); + src = intermediate.data(); + src_type = GGML_TYPE_F16; + } else if (ts.is_f8_e5m2) { + intermediate.resize((size_t)n * sizeof(ggml_fp16_t)); + f8_e5m2_to_f16_vec(raw.data(), (uint16_t*)intermediate.data(), n); + src = intermediate.data(); + src_type = GGML_TYPE_F16; + } else if (ts.is_f64) { + f64_to_f32_vec(reinterpret_cast(raw.data()), out.data(), n); + return true; + } else if (ts.is_i64) { + std::vector i32buf(n); + i64_to_i32_vec(reinterpret_cast(raw.data()), i32buf.data(), n); + for (int64_t i = 0; i < n; i++) out[i] = static_cast(i32buf[i]); + return true; + } + + if (src_type == GGML_TYPE_F32) { + memcpy(out.data(), src, (size_t)n * sizeof(float)); + } else if (src_type == GGML_TYPE_F16) { + ggml_fp16_to_fp32_row(reinterpret_cast(src), out.data(), n); + } else { + int64_t n_per_row = ts.ne[0]; + int64_t nrows = n / n_per_row; + convert_tensor(src, src_type, out.data(), GGML_TYPE_F32, (int)nrows, (int)n_per_row); + } + + return true; +} + /*================================================= ModelLoader ==================================================*/ void ModelLoader::add_tensor_storage(const TensorStorage& tensor_storage) { diff --git a/src/model.h b/src/model.h index 8ecea16b5..9bbd867df 100644 --- a/src/model.h +++ b/src/model.h @@ -286,6 +286,9 @@ class ModelLoader { bool tensor_should_be_converted(const TensorStorage& tensor_storage, ggml_type type); int64_t get_params_mem_size(ggml_backend_t backend, ggml_type type = GGML_TYPE_COUNT); + // Load a single tensor as f32 without holding any other tensors in memory. + // Suitable for streaming conversion passes. + bool load_tensor_f32(const TensorStorage& ts, std::vector& out); ~ModelLoader() = default; }; diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index 9e5a1f755..77ec70d0b 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -1,6 +1,10 @@ #include "ggml_extend.hpp" #include "ggml_graph_cut.h" +#if defined(__linux__) +#include +#endif + #include "model.h" #include "rng.hpp" #include "rng_mt19937.hpp" @@ -138,6 +142,7 @@ class StableDiffusionGGML { bool vae_decode_only = false; bool external_vae_is_invalid = false; bool free_params_immediately = false; + bool lazy_loading = false; bool circular_x = false; bool circular_y = false; @@ -208,6 +213,20 @@ class StableDiffusionGGML { return params_backend_for(module) != nullptr; } + // For each mmap-backed tensor whose name starts with `prefix`, advise the OS + // that those physical pages can be reclaimed. The virtual mapping stays + // valid so no pointer patching is needed; pages are reloaded on next access. + void evict_component_from_ram(const std::string& prefix) { +#if defined(__linux__) + for (auto& [name, tensor] : tensors) { + if (!tensor || !tensor->data || tensor->view_src) continue; + if (tensor->buffer != nullptr) continue; // not mmap-backed + if (name.size() < prefix.size() || name.substr(0, prefix.size()) != prefix) continue; + madvise(tensor->data, ggml_nbytes(tensor), MADV_DONTNEED); + } +#endif + } + bool init_backend(const sd_ctx_params_t* sd_ctx_params) { std::string error; if (!backend_manager.init(sd_ctx_params->backend, @@ -237,8 +256,13 @@ class StableDiffusionGGML { n_threads = sd_ctx_params->n_threads; vae_decode_only = sd_ctx_params->vae_decode_only; free_params_immediately = sd_ctx_params->free_params_immediately; + lazy_loading = sd_ctx_params->lazy_loading; offload_params_to_cpu = sd_ctx_params->offload_params_to_cpu; max_vram = sd_ctx_params->max_vram; + if (lazy_loading) { + free_params_immediately = true; + LOG_INFO("lazy_loading enabled: mmap and free_params_immediately forced on"); + } backend_spec = SAFE_STR(sd_ctx_params->backend); params_backend_spec = SAFE_STR(sd_ctx_params->params_backend); @@ -433,7 +457,7 @@ class StableDiffusionGGML { std::map mmap_able_tensors; bool enable_mmap_tensors = false; bool needs_writable_mmap = false; - if (sd_ctx_params->enable_mmap) { + if (sd_ctx_params->enable_mmap || lazy_loading) { if (apply_lora_immediately) { needs_writable_mmap = true; LOG_WARN("in mode 'immediately', LoRAs will cause extra memory usage with mmap"); @@ -1014,7 +1038,7 @@ class StableDiffusionGGML { return false; } - bool success = model_loader.load_tensors(tensors, ignore_tensors, n_threads, sd_ctx_params->enable_mmap); + bool success = model_loader.load_tensors(tensors, ignore_tensors, n_threads, sd_ctx_params->enable_mmap || lazy_loading); if (!success) { LOG_ERROR("load tensors from model loader failed"); ggml_free(ctx); @@ -4019,6 +4043,13 @@ static std::optional prepare_image_generation_embeds(sd_c if (sd_ctx->sd->free_params_immediately) { sd_ctx->sd->cond_stage_model->free_params_buffer(); + if (sd_ctx->sd->lazy_loading) { + sd_ctx->sd->cond_stage_model->free_compute_buffer(); + } + } + if (sd_ctx->sd->lazy_loading) { + sd_ctx->sd->evict_component_from_ram("text_encoders."); + LOG_DEBUG("lazy_loading: text encoder pages evicted"); } ImageGenerationEmbeds embeds; @@ -4062,6 +4093,11 @@ static sd_image_t* decode_image_outputs(sd_ctx_t* sd_ctx, LOG_INFO("decode_first_stage completed, taking %.2fs", (t4 - t0) * 1.0f / 1000); if (sd_ctx->sd->free_params_immediately) { sd_ctx->sd->first_stage_model->free_params_buffer(); + if (sd_ctx->sd->lazy_loading) { + sd_ctx->sd->evict_component_from_ram("first_stage_model."); // AIO GGUF / main model file + sd_ctx->sd->evict_component_from_ram("vae."); // separate --vae file + LOG_DEBUG("lazy_loading: VAE pages evicted"); + } } sd_image_t* result_images = (sd_image_t*)calloc(request.batch_count, sizeof(sd_image_t)); @@ -4260,6 +4296,10 @@ SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* s int64_t t0 = ggml_time_ms(); sd_ctx->sd->vae_tiling_params = sd_img_gen_params->vae_tiling_params; + if (sd_ctx->sd->lazy_loading && !sd_ctx->sd->vae_tiling_params.enabled) { + sd_ctx->sd->vae_tiling_params.enabled = true; + LOG_INFO("lazy_loading: auto-enabling VAE tiling to reduce peak VRAM"); + } GenerationRequest request(sd_ctx, sd_img_gen_params); LOG_INFO("generate_image %dx%d", request.width, request.height); @@ -4345,6 +4385,11 @@ SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* s } if (sd_ctx->sd->free_params_immediately && !request.hires.enabled) { sd_ctx->sd->diffusion_model->free_params_buffer(); + if (sd_ctx->sd->lazy_loading) { + sd_ctx->sd->diffusion_model->free_compute_buffer(); + sd_ctx->sd->evict_component_from_ram("model.diffusion_model."); + LOG_DEBUG("lazy_loading: diffusion model pages evicted"); + } } int64_t denoise_end = ggml_time_ms(); LOG_INFO("generating %zu latent images completed, taking %.2fs", @@ -4468,6 +4513,10 @@ SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* s } if (sd_ctx->sd->free_params_immediately) { sd_ctx->sd->diffusion_model->free_params_buffer(); + if (sd_ctx->sd->lazy_loading) { + sd_ctx->sd->evict_component_from_ram("model.diffusion_model."); + LOG_DEBUG("lazy_loading: diffusion model pages evicted (post hires)"); + } } int64_t hires_denoise_end = ggml_time_ms(); LOG_INFO("hires fix completed, taking %.2fs", (hires_denoise_end - hires_denoise_start) * 1.0f / 1000);