diff --git a/docs/comparison/orig_cat.png b/docs/comparison/orig_cat.png
new file mode 100644
index 000000000..71c1d2c1e
Binary files /dev/null and b/docs/comparison/orig_cat.png differ
diff --git a/docs/comparison/orig_garden.png b/docs/comparison/orig_garden.png
new file mode 100644
index 000000000..100d91293
Binary files /dev/null and b/docs/comparison/orig_garden.png differ
diff --git a/docs/comparison/orig_phonograph.png b/docs/comparison/orig_phonograph.png
new file mode 100644
index 000000000..5d62e14ce
Binary files /dev/null and b/docs/comparison/orig_phonograph.png differ
diff --git a/docs/comparison/rmse1pct_cat.png b/docs/comparison/rmse1pct_cat.png
new file mode 100644
index 000000000..965bc96e4
Binary files /dev/null and b/docs/comparison/rmse1pct_cat.png differ
diff --git a/docs/comparison/rmse1pct_garden.png b/docs/comparison/rmse1pct_garden.png
new file mode 100644
index 000000000..1a67720a3
Binary files /dev/null and b/docs/comparison/rmse1pct_garden.png differ
diff --git a/docs/comparison/rmse1pct_phonograph.png b/docs/comparison/rmse1pct_phonograph.png
new file mode 100644
index 000000000..955a67f1a
Binary files /dev/null and b/docs/comparison/rmse1pct_phonograph.png differ
diff --git a/docs/comparison/rmse3pct_cat.png b/docs/comparison/rmse3pct_cat.png
new file mode 100644
index 000000000..e5457f59b
Binary files /dev/null and b/docs/comparison/rmse3pct_cat.png differ
diff --git a/docs/comparison/rmse3pct_garden.png b/docs/comparison/rmse3pct_garden.png
new file mode 100644
index 000000000..9851abae8
Binary files /dev/null and b/docs/comparison/rmse3pct_garden.png differ
diff --git a/docs/comparison/rmse3pct_phonograph.png b/docs/comparison/rmse3pct_phonograph.png
new file mode 100644
index 000000000..4cca97011
Binary files /dev/null and b/docs/comparison/rmse3pct_phonograph.png differ
diff --git a/docs/comparison/rmse6pct_cat.png b/docs/comparison/rmse6pct_cat.png
new file mode 100644
index 000000000..ac00b9801
Binary files /dev/null and b/docs/comparison/rmse6pct_cat.png differ
diff --git a/docs/comparison/rmse6pct_garden.png b/docs/comparison/rmse6pct_garden.png
new file mode 100644
index 000000000..489255f5e
Binary files /dev/null and b/docs/comparison/rmse6pct_garden.png differ
diff --git a/docs/comparison/rmse6pct_phonograph.png b/docs/comparison/rmse6pct_phonograph.png
new file mode 100644
index 000000000..a8c5b956e
Binary files /dev/null and b/docs/comparison/rmse6pct_phonograph.png differ
diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
index dc5013b3e..7689b5d5d 100644
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@@ -41,6 +41,7 @@ struct SDCliParams {
     bool verbose          = false;
     bool canny_preprocess = false;
     bool convert_name     = false;
+    float rmse_threshold  = 0.0f;
 
     preview_t preview_method = PREVIEW_NONE;
     int preview_interval     = 1;
@@ -88,6 +89,16 @@ struct SDCliParams {
              &output_begin_idx},
         };
 
+        options.float_options = {
+            {"",
+             "--rmse",
+             "maximum relative RMSE per tensor for auto mixed-precision quantization in convert mode "
+             "(e.g. 0.03 = 3%). Sweeps from coarsest to finest quant type and picks the coarsest "
+             "that stays under this budget. --type sets the quality ceiling (default: f16). "
+             "Explicit --tensor-type-rules take priority over the RMSE sweep.",
+             &rmse_threshold},
+        };
+
         options.bool_options = {
             {"",
              "--canny",
@@ -601,23 +612,16 @@ int main(int argc, const char* argv[]) {
     LOG_DEBUG("%s", gen_params.to_string().c_str());
 
     if (cli_params.mode == CONVERT) {
-        bool success = convert(ctx_params.model_path.c_str(),
-                               ctx_params.vae_path.c_str(),
+        sd_ctx_params_t sd_params = ctx_params.to_sd_ctx_params_t(false, false, false);
+        bool success              = convert(&sd_params,
                                cli_params.output_path.c_str(),
-                               ctx_params.wtype,
-                               ctx_params.tensor_type_rules.c_str(),
-                               cli_params.convert_name);
+                               cli_params.convert_name,
+                               cli_params.rmse_threshold);
         if (!success) {
-            LOG_ERROR("convert '%s'/'%s' to '%s' failed",
-                      ctx_params.model_path.c_str(),
-                      ctx_params.vae_path.c_str(),
-                      cli_params.output_path.c_str());
+            LOG_ERROR("convert to '%s' failed", cli_params.output_path.c_str());
             return 1;
         } else {
-            LOG_INFO("convert '%s'/'%s' to '%s' success",
-                     ctx_params.model_path.c_str(),
-                     ctx_params.vae_path.c_str(),
-                     cli_params.output_path.c_str());
+            LOG_INFO("convert to '%s' success", cli_params.output_path.c_str());
             return 0;
         }
     }
diff --git a/examples/common/common.cpp b/examples/common/common.cpp
index 519e8aae6..b2793d4e7 100644
--- a/examples/common/common.cpp
+++ b/examples/common/common.cpp
@@ -430,6 +430,10 @@ ArgOptions SDContextParams::get_options() {
          "--mmap",
          "whether to memory-map model",
          true, &enable_mmap},
+        {"-ll",
+         "--lazy-load",
+         "staged loading: evict text encoders from RAM after encoding, diffusion model after sampling, VAE after decoding (forces --mmap on)",
+         true, &lazy_loading},
         {"",
          "--control-net-cpu",
          "keep controlnet in cpu (for low vram)",
@@ -697,6 +701,7 @@ std::string SDContextParams::to_string() const {
         << "  backend: \"" << backend << "\",\n"
         << "  params_backend: \"" << params_backend << "\",\n"
         << "  enable_mmap: " << (enable_mmap ? "true" : "false") << ",\n"
+        << "  lazy_loading: " << (lazy_loading ? "true" : "false") << ",\n"
         << "  control_net_cpu: " << (control_net_cpu ? "true" : "false") << ",\n"
         << "  clip_on_cpu: " << (clip_on_cpu ? "true" : "false") << ",\n"
         << "  vae_on_cpu: " << (vae_on_cpu ? "true" : "false") << ",\n"
@@ -773,6 +778,7 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool f
         chroma_t5_mask_pad,
         qwen_image_zero_cond_t,
         max_vram,
+        lazy_loading,
         backend.c_str(),
         params_backend.c_str(),
     };
diff --git a/examples/common/common.h b/examples/common/common.h
index ca367f7ee..061a2c5d5 100644
--- a/examples/common/common.h
+++ b/examples/common/common.h
@@ -115,6 +115,7 @@ struct SDContextParams {
     std::string backend;
     std::string params_backend;
     bool enable_mmap           = false;
+    bool lazy_loading          = false;
     bool control_net_cpu       = false;
     bool clip_on_cpu           = false;
     bool vae_on_cpu            = false;
diff --git a/include/stable-diffusion.h b/include/stable-diffusion.h
index f8b2c2f59..3d35e03f5 100644
--- a/include/stable-diffusion.h
+++ b/include/stable-diffusion.h
@@ -213,6 +213,7 @@ typedef struct {
     int chroma_t5_mask_pad;
     bool qwen_image_zero_cond_t;
     float max_vram;  // GiB budget for graph-cut segmented param offload (0 = disabled, -1 = auto free VRAM minus 1 GiB)
+    bool lazy_loading;  // staged load: encode text, evict text encoder, load diffusion, evict, load VAE, decode
     const char* backend;
     const char* params_backend;
 } sd_ctx_params_t;
@@ -463,12 +464,10 @@ SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx,
 
 SD_API int get_upscale_factor(upscaler_ctx_t* upscaler_ctx);
 
-SD_API bool convert(const char* input_path,
-                    const char* vae_path,
+SD_API bool convert(const sd_ctx_params_t* params,
                     const char* output_path,
-                    enum sd_type_t output_type,
-                    const char* tensor_type_rules,
-                    bool convert_name);
+                    bool convert_name,
+                    float rmse_threshold);
 
 SD_API bool preprocess_canny(sd_image_t image,
                              float high_threshold,
diff --git a/src/conditioner.hpp b/src/conditioner.hpp
index f08feeef7..44aa5a6db 100644
--- a/src/conditioner.hpp
+++ b/src/conditioner.hpp
@@ -115,6 +115,7 @@ struct Conditioner {
                                               const ConditionerParams& conditioner_params) = 0;
     virtual void alloc_params_buffer()                                                     = 0;
     virtual void free_params_buffer()                                                      = 0;
+    virtual void free_compute_buffer() {}
     virtual void get_param_tensors(std::map<std::string, ggml_tensor*>& tensors)           = 0;
     virtual size_t get_params_buffer_size()                                                = 0;
     virtual void set_max_graph_vram_bytes(size_t max_vram_bytes) {}
@@ -805,6 +806,18 @@ struct SD3CLIPEmbedder : public Conditioner {
         }
     }
 
+    // Forwards free_compute_buffer() to each of clip_l, clip_g and t5 that is set.
+    void free_compute_buffer() override {
+        auto release = [](auto& encoder) {
+            if (encoder) {
+                encoder->free_compute_buffer();
+            }
+        };
+        release(clip_l);
+        release(clip_g);
+        release(t5);
+    }
+
     size_t get_params_buffer_size() override {
         size_t buffer_size = 0;
         if (clip_l) {
diff --git a/src/convert.cpp b/src/convert.cpp
index 7cae8df0f..4057bd7a5 100644
--- a/src/convert.cpp
+++ b/src/convert.cpp
@@ -1,138 +1,548 @@
-#include <cstring>
-#include <mutex>
-#include <regex>
-#include <vector>
-
-#include "model.h"
-#include "model_io/gguf_io.h"
-#include "model_io/safetensors_io.h"
-#include "util.h"
-
-#include "ggml-cpu.h"
-
-static ggml_type get_export_tensor_type(ModelLoader& model_loader,
-                                        const TensorStorage& tensor_storage,
-                                        ggml_type type,
-                                        const TensorTypeRules& tensor_type_rules) {
-    const std::string& name = tensor_storage.name;
-    ggml_type tensor_type   = tensor_storage.type;
-    ggml_type dst_type      = type;
-
-    for (const auto& tensor_type_rule : tensor_type_rules) {
-        std::regex pattern(tensor_type_rule.first);
-        if (std::regex_search(name, pattern)) {
-            dst_type = tensor_type_rule.second;
-            break;
-        }
-    }
-
-    if (model_loader.tensor_should_be_converted(tensor_storage, dst_type)) {
-        tensor_type = dst_type;
-    }
-
-    return tensor_type;
-}
-
-static bool load_tensors_for_export(ModelLoader& model_loader,
-                                    ggml_context* ggml_ctx,
-                                    ggml_type type,
-                                    const TensorTypeRules& tensor_type_rules,
-                                    std::vector<TensorWriteInfo>& tensors) {
-    std::mutex tensor_mutex;
-    auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool {
-        const std::string& name = tensor_storage.name;
-        ggml_type tensor_type   = get_export_tensor_type(model_loader, tensor_storage, type, tensor_type_rules);
-
-        std::lock_guard<std::mutex> lock(tensor_mutex);
-        ggml_tensor* tensor = ggml_new_tensor(ggml_ctx, tensor_type, tensor_storage.n_dims, tensor_storage.ne);
-        if (tensor == nullptr) {
-            LOG_ERROR("ggml_new_tensor failed");
-            return false;
-        }
-        ggml_set_name(tensor, name.c_str());
-
-        if (!tensor->data) {
-            GGML_ASSERT(ggml_nelements(tensor) == 0);
-            // Avoid crashing writers by setting a dummy pointer for zero-sized tensors.
-            LOG_DEBUG("setting dummy pointer for zero-sized tensor %s", name.c_str());
-            tensor->data = ggml_get_mem_buffer(ggml_ctx);
-        }
-
-        TensorWriteInfo write_info;
-        write_info.tensor = tensor;
-        write_info.n_dims = tensor_storage.n_dims;
-        for (int i = 0; i < tensor_storage.n_dims; ++i) {
-            write_info.ne[i] = tensor_storage.ne[i];
-        }
-
-        *dst_tensor = tensor;
-        tensors.push_back(std::move(write_info));
-
-        return true;
-    };
-
-    bool success = model_loader.load_tensors(on_new_tensor_cb);
-    LOG_INFO("load tensors done");
-    return success;
-}
-
-bool convert(const char* input_path,
-             const char* vae_path,
-             const char* output_path,
-             sd_type_t output_type,
-             const char* tensor_type_rules,
-             bool convert_name) {
-    ModelLoader model_loader;
-
-    if (!model_loader.init_from_file(input_path)) {
-        LOG_ERROR("init model loader from file failed: '%s'", input_path);
-        return false;
-    }
-
-    if (vae_path != nullptr && strlen(vae_path) > 0) {
-        if (!model_loader.init_from_file(vae_path, "vae.")) {
-            LOG_ERROR("init model loader from file failed: '%s'", vae_path);
-            return false;
-        }
-    }
-    if (convert_name) {
-        model_loader.convert_tensors_name();
-    }
-
-    ggml_type type             = (ggml_type)output_type;
-    bool output_is_safetensors = ends_with(output_path, ".safetensors");
-    TensorTypeRules type_rules = parse_tensor_type_rules(tensor_type_rules);
-
-    auto backend    = ggml_backend_cpu_init();
-    size_t mem_size = 1 * 1024 * 1024;  // for padding
-    mem_size += model_loader.get_tensor_storage_map().size() * ggml_tensor_overhead();
-    mem_size += model_loader.get_params_mem_size(backend, type);
-    LOG_INFO("model tensors mem size: %.2fMB", mem_size / 1024.f / 1024.f);
-    ggml_context* ggml_ctx = ggml_init({mem_size, nullptr, false});
-
-    if (ggml_ctx == nullptr) {
-        LOG_ERROR("ggml_init failed for converter");
-        ggml_backend_free(backend);
-        return false;
-    }
-
-    std::vector<TensorWriteInfo> tensors;
-    bool success = load_tensors_for_export(model_loader, ggml_ctx, type, type_rules, tensors);
-    ggml_backend_free(backend);
-
-    std::string error;
-    if (success) {
-        if (output_is_safetensors) {
-            success = write_safetensors_file(output_path, tensors, &error);
-        } else {
-            success = write_gguf_file(output_path, tensors, &error);
-        }
-    }
-
-    if (!success && !error.empty()) {
-        LOG_ERROR("%s", error.c_str());
-    }
-
-    ggml_free(ggml_ctx);
-    return success;
-}
+#include <algorithm>
+#include <atomic>
+#include <cmath>
+#include <cstring>
+#include <map>
+#include <mutex>
+#include <regex>
+#include <thread>
+#include <vector>
+
+#include "model.h"
+#include "model_io/gguf_io.h"
+#include "model_io/safetensors_io.h"
+#include "stable-diffusion.h"
+#include "util.h"
+
+#include "ggml-cpu.h"
+#include "gguf.h"
+
+#ifndef SAFE_STR
+#define SAFE_STR(s) ((s) ? (s) : "")
+#endif
+
+// Candidate types for the RMSE sweep, ordered coarsest to finest. Order matters:
+// find_best_type_for_rmse walks this list front to back and stops at the ceiling type.
+static const ggml_type RMSE_CANDIDATES[] = {
+    GGML_TYPE_Q2_K,
+    GGML_TYPE_Q3_K,
+    GGML_TYPE_IQ4_NL,
+    GGML_TYPE_Q4_K,
+    GGML_TYPE_Q5_K,
+    GGML_TYPE_Q6_K,
+    GGML_TYPE_Q8_0,
+    GGML_TYPE_F16,
+};
+static const int N_RMSE_CANDIDATES = (int)(sizeof(RMSE_CANDIDATES) / sizeof(RMSE_CANDIDATES[0]));  // element count
+
+// Relative error ||orig - recon||_2 / ||orig||_2 over n elements; 0 when orig is all zeros.
+static float compute_relative_rmse(const float* orig, const float* recon, int64_t n) {
+    double ref_energy = 0.0, err_energy = 0.0;
+    for (int64_t idx = 0; idx < n; ++idx) {
+        const double ref  = orig[idx];
+        const double diff = ref - (double)recon[idx];
+        ref_energy += ref * ref;
+        err_energy += diff * diff;
+    }
+    return ref_energy == 0.0 ? 0.0f : (float)std::sqrt(err_energy / ref_energy);
+}
+
+// Sweeps RMSE_CANDIDATES from coarsest to finest, round-tripping `data` (nrows rows of n_per_row
+// floats) through each type. Returns the first candidate whose relative RMSE is <= threshold;
+// returns ceiling_type once the sweep reaches it (or the end of the list) with nothing passing.
+static ggml_type find_best_type_for_rmse(const float* data,
+                                          int64_t nrows,
+                                          int64_t n_per_row,
+                                          ggml_type ceiling_type,
+                                          float threshold) {
+    int64_t n = nrows * n_per_row;
+    std::vector<uint8_t> quant_buf;
+    std::vector<float> recon(n);
+    std::vector<float> imatrix(n_per_row, 1.0f);  // all-ones weights handed to ggml_quantize_chunk
+
+    for (int ci = 0; ci < N_RMSE_CANDIDATES; ci++) {
+        ggml_type ctype = RMSE_CANDIDATES[ci];
+
+        // The ceiling is the last candidate tried. NOTE(review): if ceiling_type is not in
+        // RMSE_CANDIDATES this is never true and the sweep runs through F16 — confirm intended.
+        bool at_ceiling = (ctype == ceiling_type);
+        // A quantized candidate is skipped when its block size does not divide the row length.
+        if (ggml_is_quantized(ctype) && n_per_row % ggml_blck_size(ctype) != 0) {
+            if (at_ceiling) return ceiling_type;
+            continue;
+        }
+
+        size_t qsize;
+        if (ctype == GGML_TYPE_F16) {
+            qsize = (size_t)n * sizeof(ggml_fp16_t);
+        } else {
+            qsize = (size_t)nrows * ggml_row_size(ctype, n_per_row);
+        }
+        quant_buf.resize(qsize);
+        // Round-trip: encode into quant_buf, then decode back to f32 in recon.
+        if (ctype == GGML_TYPE_F16) {
+            ggml_fp32_to_fp16_row(data, (ggml_fp16_t*)quant_buf.data(), n);
+            ggml_fp16_to_fp32_row((ggml_fp16_t*)quant_buf.data(), recon.data(), n);
+        } else {
+            const ggml_type_traits* traits = ggml_get_type_traits(ctype);
+            if (traits->to_float == nullptr) {
+                if (at_ceiling) return ceiling_type;
+                continue;
+            }
+            ggml_quantize_chunk(ctype, data, quant_buf.data(), 0, nrows, n_per_row, imatrix.data());
+            traits->to_float(quant_buf.data(), recon.data(), n);
+        }
+        // First candidate within budget wins; the ceiling is returned even when it is over budget.
+        float rmse = compute_relative_rmse(data, recon.data(), n);
+        if (rmse <= threshold) return ctype;
+        if (at_ceiling) return ceiling_type;
+    }
+
+    return ceiling_type;
+}
+
+// ─── Normal (non-RMSE) export path ────────────────────────────────────────────
+
+// Export type for one tensor on the normal (non-RMSE) path. `type` is the default target; the
+// first tensor_type_rules pattern found in the tensor name overrides it. The target is used only
+// when the loader says the tensor should be converted to it; otherwise the stored type is kept.
+static ggml_type get_export_tensor_type(ModelLoader& model_loader,
+                                        const TensorStorage& tensor_storage,
+                                        ggml_type type,
+                                        const TensorTypeRules& tensor_type_rules) {
+    ggml_type wanted = type;
+
+    for (const auto& entry : tensor_type_rules) {
+        const std::regex name_pattern(entry.first);
+        if (std::regex_search(tensor_storage.name, name_pattern)) {
+            wanted = entry.second;  // first match wins
+            break;
+        }
+    }
+
+    if (!model_loader.tensor_should_be_converted(tensor_storage, wanted)) {
+        return tensor_storage.type;
+    }
+    return wanted;
+}
+
+// Normal export path: creates one ggml tensor in ggml_ctx per tensor reported by load_tensors and records it in `tensors`.
+static bool load_tensors_for_export(ModelLoader& model_loader,
+                                    ggml_context* ggml_ctx,
+                                    ggml_type type,
+                                    const TensorTypeRules& tensor_type_rules,
+                                    std::vector<TensorWriteInfo>& tensors) {
+    std::mutex alloc_mutex;  // held while allocating from ggml_ctx and appending to `tensors`
+    auto make_tensor = [&](const TensorStorage& storage, ggml_tensor** out) -> bool {
+        const ggml_type export_type = get_export_tensor_type(model_loader, storage, type, tensor_type_rules);
+
+        std::lock_guard<std::mutex> guard(alloc_mutex);
+        ggml_tensor* created = ggml_new_tensor(ggml_ctx, export_type, storage.n_dims, storage.ne);
+        if (created == nullptr) {
+            LOG_ERROR("ggml_new_tensor failed");
+            return false;
+        }
+        ggml_set_name(created, storage.name.c_str());
+        // A tensor without data must be empty; give it the context buffer as a placeholder pointer.
+        if (created->data == nullptr) {
+            GGML_ASSERT(ggml_nelements(created) == 0);
+            created->data = ggml_get_mem_buffer(ggml_ctx);
+        }
+
+        TensorWriteInfo info;
+        info.tensor = created;
+        info.n_dims = storage.n_dims;
+        for (int d = 0; d < storage.n_dims; ++d) info.ne[d] = storage.ne[d];
+        *out = created;
+        tensors.push_back(std::move(info));
+        return true;
+    };
+
+    const bool loaded = model_loader.load_tensors(make_tensor);
+    LOG_INFO("load tensors done");
+    return loaded;
+}
+
+// ─── RMSE export path (streaming two-pass, low RAM) ──────────────────────────
+//
+// Pass 1: enumerate active tensors (null-dst callback, no data loaded), then
+//         worker threads each load one tensor as f32 → RMSE sweep → record target type → free.
+//         Peak RAM ≈ worker count × (f32 tensor + its reconstruction + quantized copy).
+//
+// Pass 2: write GGUF header (only_meta), then for each tensor: load as f32 →
+//         quantize → write bytes at the correct file offset → free.
+//         Single-threaded: peak RAM ≈ one f32 tensor + its quantized copy.
+
+// Chooses the output type for one tensor on the RMSE path. The first tensor_type_rules pattern
+// found in the name wins; otherwise a tensor the loader would convert to ceiling_type gets the
+// RMSE sweep result (ceiling_type itself when data is null), and any other tensor gets F16.
+static ggml_type pick_target_type(const TensorStorage& ts, ModelLoader& model_loader,
+                                   const TensorTypeRules& tensor_type_rules, ggml_type ceiling_type,
+                                   float rmse_threshold, const float* data,
+                                   int64_t n_per_row, int64_t nrows) {
+    for (const auto& rule : tensor_type_rules) {
+        std::regex pattern(rule.first);
+        if (!std::regex_search(ts.name, pattern)) continue;
+        // A block-quantized type only fits rows that are a whole number of blocks (the same test
+        // find_best_type_for_rmse applies); a rule that cannot fit yields to the automatic choice.
+        if (ggml_is_quantized(rule.second) && n_per_row % ggml_blck_size(rule.second) != 0) break;
+        return rule.second;
+    }
+
+    const bool convertible = ceiling_type != GGML_TYPE_COUNT &&
+                             model_loader.tensor_should_be_converted(ts, ceiling_type);
+    if (!convertible) return GGML_TYPE_F16;
+    if (data == nullptr) return ceiling_type;
+    return find_best_type_for_rmse(data, nrows, n_per_row, ceiling_type, rmse_threshold);
+}
+
+// RMSE export: picks a type per tensor, writes the GGUF header to output_path, then quantizes
+// and writes one tensor at a time. Returns false on failure, with a message in *error if non-null.
+static bool convert_rmse_streaming(ModelLoader& model_loader,
+                                    ggml_type ceiling_type,
+                                    const TensorTypeRules& tensor_type_rules,
+                                    float rmse_threshold,
+                                    const std::string& output_path,
+                                    std::string* error) {
+    // Step 1: collect active tensor list without loading any data.
+    // load_tensors uses a thread pool, so the callback is called concurrently.
+    std::vector<TensorStorage> active;
+    {
+        std::mutex active_mtx;
+        auto cb = [&](const TensorStorage& ts, ggml_tensor** dst) -> bool {
+            std::lock_guard<std::mutex> lk(active_mtx);
+            active.push_back(ts);
+            *dst = nullptr;
+            return true;
+        };
+        if (!model_loader.load_tensors(cb)) {
+            if (error) *error = "failed to enumerate tensors";
+            return false;
+        }
+    }
+    LOG_INFO("RMSE sweep: %zu active tensors", active.size());
+
+    // Step 2: type sweep — one tensor in RAM at a time.
+    std::vector<ggml_type> target_types(active.size(), GGML_TYPE_F16);
+    std::vector<float> f32_buf;
+
+    int n_threads  = std::max(1, (int)sd_get_num_physical_cores());
+    std::atomic<size_t> next_idx{0};
+    std::vector<std::thread> workers;
+    workers.reserve(n_threads);
+
+    // Worker threads each load their own tensor independently (different file offsets).
+    auto worker_fn = [&]() {
+        std::vector<float> local_buf;
+        size_t i;
+        while ((i = next_idx.fetch_add(1, std::memory_order_relaxed)) < active.size()) {
+            const TensorStorage& ts = active[i];
+            int64_t n          = ts.nelements();
+            int64_t n_per_row  = ts.ne[0];
+            int64_t nrows      = n / std::max(n_per_row, (int64_t)1);
+
+            ggml_type ttype;
+            if (n == 0 || !model_loader.tensor_should_be_converted(ts, ceiling_type)) {
+                // Skip RMSE sweep for non-weight tensors; pick_target_type handles rules/F16.
+                ttype = pick_target_type(ts, model_loader, tensor_type_rules,
+                                         ceiling_type, rmse_threshold, nullptr, n_per_row, nrows);
+            } else {
+                if (!model_loader.load_tensor_f32(ts, local_buf)) {
+                    LOG_WARN("RMSE sweep: failed to load '%s', defaulting to f16", ts.name.c_str());
+                    ttype = GGML_TYPE_F16;
+                } else {
+                    ttype = pick_target_type(ts, model_loader, tensor_type_rules,
+                                             ceiling_type, rmse_threshold,
+                                             local_buf.data(), n_per_row, nrows);
+                }
+            }
+            target_types[i] = ttype;  // no lock: one writer per slot, and join() orders it before the reads
+        }
+    };
+
+    for (int t = 0; t < n_threads; t++) workers.emplace_back(worker_fn);
+    for (auto& w : workers) w.join();
+    LOG_INFO("RMSE sweep: type selection done on %d threads", n_threads);
+
+    // Step 3: build GGUF header (no_alloc — metadata only, no tensor data in RAM).
+    size_t meta_mem = 1 * 1024 * 1024 + active.size() * ggml_tensor_overhead();
+    ggml_context* meta_ctx = ggml_init({meta_mem, nullptr, true});
+    if (!meta_ctx) {
+        if (error) *error = "ggml_init failed for meta context";
+        return false;
+    }
+
+    gguf_context* gguf_ctx = gguf_init_empty();
+    if (!gguf_ctx) {
+        ggml_free(meta_ctx);
+        if (error) *error = "gguf_init_empty failed";
+        return false;
+    }
+
+    std::map<ggml_type, int>    type_tensor_count;
+    std::map<ggml_type, size_t> type_byte_size;
+    size_t total_f16_bytes = 0;
+
+    for (size_t i = 0; i < active.size(); i++) {
+        const TensorStorage& ts = active[i];
+        ggml_type ttype         = target_types[i];
+        int64_t n               = ts.nelements();
+
+        ggml_tensor* t = ggml_new_tensor(meta_ctx, ttype, ts.n_dims, ts.ne);
+        if (!t) {
+            gguf_free(gguf_ctx);
+            ggml_free(meta_ctx);
+            if (error) *error = "ggml_new_tensor failed for '" + ts.name + "'";
+            return false;
+        }
+        ggml_set_name(t, ts.name.c_str());
+        gguf_add_tensor(gguf_ctx, t);
+
+        if (n > 0) {
+            total_f16_bytes += (size_t)n * sizeof(ggml_fp16_t);
+            size_t out_bytes;
+            if (ttype == GGML_TYPE_F32) {
+                out_bytes = (size_t)n * sizeof(float);
+            } else if (ttype == GGML_TYPE_F16) {
+                out_bytes = (size_t)n * sizeof(ggml_fp16_t);
+            } else {
+                int64_t n_per_row = ts.ne[0];
+                int64_t nrows     = n / n_per_row;
+                out_bytes         = (size_t)nrows * ggml_row_size(ttype, n_per_row);
+            }
+            type_tensor_count[ttype]++;
+            type_byte_size[ttype] += out_bytes;
+        }
+    }
+
+    // Print summary table.
+    size_t total_out_bytes = 0;
+    for (auto& [t, b] : type_byte_size) total_out_bytes += b;
+    LOG_INFO("---- RMSE mixed-quant summary (threshold %.1f%%) ----", rmse_threshold * 100.0f);
+    LOG_INFO("  %-12s  %8s  %10s  %6s", "type", "tensors", "size (MB)", "share");
+    for (auto& [t, count] : type_tensor_count) {
+        size_t mb  = type_byte_size[t] / (1024 * 1024);
+        float  pct = total_out_bytes > 0
+                         ? (float)type_byte_size[t] * 100.0f / (float)total_out_bytes
+                         : 0.0f;
+        LOG_INFO("  %-12s  %8d  %10zu  %5.1f%%", ggml_type_name(t), count, mb, pct);
+    }
+    float ratio = total_f16_bytes > 0 ? (float)total_out_bytes / (float)total_f16_bytes : 1.0f;
+    LOG_INFO("  total output: %.0f MB  (%.1fx vs flat f16 / %.0f MB)",
+             (float)total_out_bytes / (1024.0f * 1024.0f),
+             ratio,
+             (float)total_f16_bytes / (1024.0f * 1024.0f));
+    LOG_INFO("----------------------------------------------------");
+
+    // Step 4: write GGUF header (only_meta=true).
+    LOG_INFO("writing GGUF to %s", output_path.c_str());
+    FILE* f = fopen(output_path.c_str(), "wb");
+    if (!f) {
+        gguf_free(gguf_ctx);
+        ggml_free(meta_ctx);
+        if (error) *error = "failed to open output file '" + output_path + "'";
+        return false;
+    }
+    if (!gguf_write_to_file_ptr(gguf_ctx, f, true)) {
+        fclose(f);
+        gguf_free(gguf_ctx);
+        ggml_free(meta_ctx);
+        if (error) *error = "gguf_write_to_file_ptr (header) failed";
+        return false;
+    }
+
+    // Step 5: streaming quantize + write tensor data, one tensor at a time.
+    // gguf_get_data_offset returns ctx->offset which is only valid when reading.
+    // For newly created contexts use gguf_get_meta_size which matches what was just written.
+    size_t data_start = gguf_get_meta_size(gguf_ctx);
+    int64_t n_tensors = gguf_get_n_tensors(gguf_ctx);
+    bool write_ok = true;
+    std::vector<uint8_t> quant_buf;
+
+    for (int64_t gi = 0; gi < n_tensors && write_ok; gi++) {
+        const TensorStorage& ts = active[(size_t)gi];
+        ggml_type ttype         = target_types[(size_t)gi];
+        int64_t n               = ts.nelements();
+
+        // Seek to the exact offset GGUF expects for this tensor; the OS zero-fills any gap.
+        // NOTE(review): fseeko/off_t are POSIX-only — confirm MSVC builds have a shim (_fseeki64).
+        size_t expected = data_start + gguf_get_tensor_offset(gguf_ctx, gi);
+        if (fseeko(f, (off_t)expected, SEEK_SET) != 0) {
+            if (error) *error = "fseeko failed for tensor '" + ts.name + "'";
+            write_ok = false;
+            break;
+        }
+
+        if (n == 0) continue;
+        if (!model_loader.load_tensor_f32(ts, f32_buf)) {
+            if (error) *error = "failed to load tensor '" + ts.name + "' in write pass";
+            write_ok = false;
+            break;
+        }
+        const float* data = f32_buf.data();
+        int64_t n_per_row = ts.ne[0];
+        int64_t nrows     = n / n_per_row;
+
+        size_t out_bytes;
+        const void* write_ptr;
+
+        if (ttype == GGML_TYPE_F32) {
+            out_bytes = (size_t)n * sizeof(float);
+            write_ptr = data;
+        } else if (ttype == GGML_TYPE_F16) {
+            out_bytes = (size_t)n * sizeof(ggml_fp16_t);
+            quant_buf.resize(out_bytes);
+            ggml_fp32_to_fp16_row(data, (ggml_fp16_t*)quant_buf.data(), n);
+            write_ptr = quant_buf.data();
+        } else {
+            out_bytes = (size_t)nrows * ggml_row_size(ttype, n_per_row);
+            quant_buf.resize(out_bytes);
+            std::vector<float> imatrix(n_per_row, 1.0f);
+            ggml_quantize_chunk(ttype, data, quant_buf.data(), 0, nrows, n_per_row, imatrix.data());
+            write_ptr = quant_buf.data();
+        }
+
+        if (fwrite(write_ptr, 1, out_bytes, f) != out_bytes) {
+            if (error) *error = "fwrite failed for tensor '" + ts.name + "'";
+            write_ok = false;
+            break;
+        }
+    }
+
+    if (fclose(f) != 0 && write_ok) {  // the final flush can fail (e.g. disk full); keep any earlier error
+        if (error) *error = "fclose failed for '" + output_path + "'";
+        write_ok = false;
+    }
+    gguf_free(gguf_ctx);
+    ggml_free(meta_ctx);
+    return write_ok;
+}
+
+// ─── Public entry point ───────────────────────────────────────────────────────
+
+// Converts the model files named in `params` into one file at `output_path`.
+//
+//   params         - source file paths (main model plus optional diffusion model,
+//                    text encoders and VAE), the weight type `wtype`
+//                    (SD_TYPE_COUNT = not given) and the per-tensor type rules.
+//   output_path    - destination file; a ".safetensors" suffix selects safetensors
+//                    output, any other name is written as GGUF.
+//   convert_name   - when true, convert_tensors_name() is applied before export.
+//   rmse_threshold - a value > 0 selects the streaming RMSE path (GGUF output only).
+//
+// Returns true on success. Failures are logged and reported as false.
+bool convert(const sd_ctx_params_t* params,
+             const char* output_path,
+             bool convert_name,
+             float rmse_threshold) {
+    // Both pointers are dereferenced below, so reject null up front.
+    if (params == nullptr || output_path == nullptr) {
+        LOG_ERROR("convert: params and output_path must not be null");
+        return false;
+    }
+
+    ModelLoader model_loader;
+
+    if (strlen(SAFE_STR(params->model_path)) > 0) {
+        if (!model_loader.init_from_file(params->model_path)) {
+            LOG_ERROR("init model loader from file failed: '%s'", params->model_path);
+            return false;
+        }
+    }
+
+    // Loads one optional component under `prefix`; an unset path is not an error.
+    auto load_component = [&model_loader](const char* path, const std::string& prefix) {
+        if (strlen(SAFE_STR(path)) == 0) {
+            return true;
+        }
+        if (!model_loader.init_from_file(path, prefix)) {
+            LOG_ERROR("init model loader from file failed: '%s'", path);
+            return false;
+        }
+        return true;
+    };
+
+    if (!load_component(params->diffusion_model_path, "model.diffusion_model.")) {
+        return false;
+    }
+
+    // The CLIP tensor prefixes depend on the version detected from the files loaded so far.
+    const bool is_unet        = sd_version_is_unet(model_loader.get_sd_version());
+    const char* clip_l_prefix = is_unet ? "cond_stage_model.transformer." : "text_encoders.clip_l.transformer.";
+    const char* clip_g_prefix = is_unet ? "cond_stage_model.1.transformer." : "text_encoders.clip_g.transformer.";
+
+    // Remaining components in their original load order; && stops at the first failure.
+    if (!(load_component(params->clip_l_path, clip_l_prefix) &&
+          load_component(params->clip_g_path, clip_g_prefix) &&
+          load_component(params->t5xxl_path, "text_encoders.t5xxl.transformer.") &&
+          load_component(params->llm_path, "text_encoders.llm.") &&
+          load_component(params->llm_vision_path, "text_encoders.llm.visual.") &&
+          load_component(params->vae_path, "vae."))) {
+        return false;
+    }
+
+    if (convert_name) {
+        model_loader.convert_tensors_name();
+    }
+
+    // When --type is not given and RMSE mode is active, default ceiling to f16.
+    ggml_type ceiling_type = (params->wtype != SD_TYPE_COUNT)
+                                 ? (ggml_type)params->wtype
+                                 : (rmse_threshold > 0.0f ? GGML_TYPE_F16 : GGML_TYPE_COUNT);
+
+    bool output_is_safetensors = ends_with(output_path, ".safetensors");
+    TensorTypeRules type_rules = parse_tensor_type_rules(SAFE_STR(params->tensor_type_rules));
+
+    auto backend = ggml_backend_cpu_init();
+    bool success = false;
+    std::string error;
+
+    if (rmse_threshold > 0.0f) {
+        // ── RMSE path (streaming, low RAM) ────────────────────────────────────
+        // Two-pass: type sweep then quantize+write, one tensor in RAM at a time.
+        // The backend is not passed to the streaming converter, so release it first.
+        ggml_backend_free(backend);
+        if (output_is_safetensors) {
+            LOG_ERROR("RMSE streaming mode does not support safetensors output; use .gguf");
+            return false;
+        }
+        success = convert_rmse_streaming(model_loader, ceiling_type, type_rules,
+                                          rmse_threshold, output_path, &error);
+    } else {
+        // ── Normal path ────────────────────────────────────────────────────────
+        if (backend == nullptr) {
+            LOG_ERROR("ggml_backend_cpu_init failed for converter");
+            return false;
+        }
+        size_t mem_size = 1 * 1024 * 1024;
+        mem_size += model_loader.get_tensor_storage_map().size() * ggml_tensor_overhead();
+        mem_size += model_loader.get_params_mem_size(backend, ceiling_type);
+
+        ggml_context* ggml_ctx = ggml_init({mem_size, nullptr, false});
+        if (!ggml_ctx) {
+            LOG_ERROR("ggml_init failed for converter");
+            ggml_backend_free(backend);
+            return false;
+        }
+
+        std::vector<TensorWriteInfo> tensors;
+        success = load_tensors_for_export(model_loader, ggml_ctx, ceiling_type, type_rules, tensors);
+        ggml_backend_free(backend);
+
+        if (success) {
+            if (output_is_safetensors) {
+                success = write_safetensors_file(output_path, tensors, &error);
+            } else {
+                success = write_gguf_file(output_path, tensors, &error);
+            }
+        }
+
+        ggml_free(ggml_ctx);
+    }
+
+    if (!success && !error.empty()) {
+        LOG_ERROR("%s", error.c_str());
+    }
+
+    return success;
+}
diff --git a/src/model.cpp b/src/model.cpp
index 25d78b94e..7eec20667 100644
--- a/src/model.cpp
+++ b/src/model.cpp
@@ -202,6 +202,91 @@ void convert_tensor(void* src,
     }
 }
 
+// Reads one tensor from its source file (plain file or zip entry) and converts it to f32 in `out`:
+// ts.nelements() floats, or empty for an empty tensor. Logs and returns false if the read fails.
+bool ModelLoader::load_tensor_f32(const TensorStorage& ts, std::vector<float>& out) {
+    if (ts.file_index >= file_paths_.size()) {
+        LOG_ERROR("load_tensor_f32: invalid file_index %zu for '%s'", ts.file_index, ts.name.c_str());
+        return false;
+    }
+
+    int64_t n = ts.nelements();
+    if (n == 0) {
+        out.clear();
+        return true;
+    }
+    out.resize(n);
+
+    size_t nbytes_to_read = ts.nbytes_to_read();
+    std::vector<uint8_t> raw(nbytes_to_read);
+    const std::string& file_path = file_paths_[ts.file_index];
+    bool read_ok                 = false;
+
+    if (ts.index_in_zip >= 0) {
+        zip_t* z = zip_open(file_path.c_str(), 0, 'r');
+        if (!z) {
+            LOG_ERROR("load_tensor_f32: failed to open zip '%s'", file_path.c_str());
+            return false;
+        }
+        // Any failure below leaves read_ok false and still reaches zip_close().
+        if (zip_entry_openbyindex(z, ts.index_in_zip) == 0) {
+            size_t entry_size = zip_entry_size(z);
+            if (entry_size == nbytes_to_read) {
+                read_ok = zip_entry_noallocread(z, raw.data(), nbytes_to_read) >= 0;
+            } else if (ts.offset <= entry_size && nbytes_to_read <= entry_size - ts.offset) {
+                // The tensor is a slice of a larger entry; the bounds check keeps the copy inside it.
+                std::vector<uint8_t> entry_buf(entry_size);
+                read_ok = zip_entry_noallocread(z, entry_buf.data(), entry_size) >= 0;
+                if (read_ok) memcpy(raw.data(), entry_buf.data() + ts.offset, nbytes_to_read);
+            }
+            zip_entry_close(z);
+        }
+        zip_close(z);
+    } else {
+        std::ifstream file(file_path, std::ios::binary);
+        if (!file) {
+            LOG_ERROR("load_tensor_f32: failed to open '%s'", file_path.c_str());
+            return false;
+        }
+        file.seekg(ts.offset);
+        file.read(reinterpret_cast<char*>(raw.data()), (std::streamsize)nbytes_to_read);
+        read_ok = static_cast<bool>(file);
+    }
+    if (!read_ok) {
+        LOG_ERROR("load_tensor_f32: read failed for '%s' in '%s'", ts.name.c_str(), file_path.c_str());
+        return false;
+    }
+
+    void* src          = raw.data();
+    ggml_type src_type = ts.type;
+    std::vector<uint8_t> intermediate;
+    if (ts.is_f8_e4m3 || ts.is_f8_e5m2) {
+        // fp8 is widened to f16 here and then takes the f16 -> f32 path below.
+        intermediate.resize((size_t)n * sizeof(ggml_fp16_t));
+        uint16_t* f16 = (uint16_t*)intermediate.data();
+        if (ts.is_f8_e4m3) f8_e4m3_to_f16_vec(raw.data(), f16, n);
+        else f8_e5m2_to_f16_vec(raw.data(), f16, n);
+        src      = intermediate.data();
+        src_type = GGML_TYPE_F16;
+    } else if (ts.is_f64) {
+        f64_to_f32_vec(reinterpret_cast<double*>(raw.data()), out.data(), n);
+        return true;
+    } else if (ts.is_i64) {
+        std::vector<int32_t> i32buf(n);
+        i64_to_i32_vec(reinterpret_cast<int64_t*>(raw.data()), i32buf.data(), n);
+        for (int64_t i = 0; i < n; i++) out[i] = static_cast<float>(i32buf[i]);
+        return true;
+    }
+    if (src_type == GGML_TYPE_F32) {
+        memcpy(out.data(), src, (size_t)n * sizeof(float));
+    } else if (src_type == GGML_TYPE_F16) {
+        ggml_fp16_to_fp32_row(reinterpret_cast<const ggml_fp16_t*>(src), out.data(), n);
+    } else {
+        convert_tensor(src, src_type, out.data(), GGML_TYPE_F32, (int)(n / ts.ne[0]), (int)ts.ne[0]);
+    }
+    return true;
+}
+
 /*================================================= ModelLoader ==================================================*/
 
 void ModelLoader::add_tensor_storage(const TensorStorage& tensor_storage) {
diff --git a/src/model.h b/src/model.h
index 8ecea16b5..9bbd867df 100644
--- a/src/model.h
+++ b/src/model.h
@@ -286,6 +286,9 @@ class ModelLoader {
 
     bool tensor_should_be_converted(const TensorStorage& tensor_storage, ggml_type type);
     int64_t get_params_mem_size(ggml_backend_t backend, ggml_type type = GGML_TYPE_COUNT);
+    // Reads `ts` from its source file into `out`, converted to f32; returns false on failure.
+    // No other tensor is held in memory, which suits streaming conversion passes.
+    bool load_tensor_f32(const TensorStorage& ts, std::vector<float>& out);
     ~ModelLoader() = default;
 };
 
diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp
index 9e5a1f755..77ec70d0b 100644
--- a/src/stable-diffusion.cpp
+++ b/src/stable-diffusion.cpp
@@ -1,6 +1,10 @@
 #include "ggml_extend.hpp"
 #include "ggml_graph_cut.h"
 
+#if defined(__linux__)
+#include <sys/mman.h>
+#include <unistd.h>
+#endif
 #include "model.h"
 #include "rng.hpp"
 #include "rng_mt19937.hpp"
@@ -138,6 +142,7 @@ class StableDiffusionGGML {
     bool vae_decode_only         = false;
     bool external_vae_is_invalid = false;
     bool free_params_immediately = false;
+    bool lazy_loading            = false;
 
     bool circular_x = false;
     bool circular_y = false;
@@ -208,6 +213,20 @@ class StableDiffusionGGML {
         return params_backend_for(module) != nullptr;
     }
 
+    // Linux only: advises the kernel to drop the resident pages of every mmap-backed (no ggml buffer),
+    // non-view tensor named `prefix*`. madvise() rejects an unaligned start, so only whole pages are dropped.
+    void evict_component_from_ram(const std::string& prefix) {
+#if defined(__linux__)
+        const uintptr_t page = (uintptr_t)sysconf(_SC_PAGESIZE);
+        for (auto& [name, tensor] : tensors) {
+            if (!tensor || !tensor->data || tensor->view_src || tensor->buffer || name.rfind(prefix, 0) != 0) continue;
+            const uintptr_t lo = ((uintptr_t)tensor->data + page - 1) / page * page;             // first whole page
+            const uintptr_t hi = ((uintptr_t)tensor->data + ggml_nbytes(tensor)) / page * page;  // end of last whole page
+            if (lo < hi) madvise((void*)lo, hi - lo, MADV_DONTNEED);  // mapping stays valid; pages reload on access
+        }
+#endif
+    }
+
     bool init_backend(const sd_ctx_params_t* sd_ctx_params) {
         std::string error;
         if (!backend_manager.init(sd_ctx_params->backend,
@@ -237,8 +256,13 @@ class StableDiffusionGGML {
         n_threads               = sd_ctx_params->n_threads;
         vae_decode_only         = sd_ctx_params->vae_decode_only;
         free_params_immediately = sd_ctx_params->free_params_immediately;
+        lazy_loading            = sd_ctx_params->lazy_loading;
         offload_params_to_cpu   = sd_ctx_params->offload_params_to_cpu;
         max_vram                = sd_ctx_params->max_vram;
+        if (lazy_loading) {
+            free_params_immediately = true;
+            LOG_INFO("lazy_loading enabled: mmap and free_params_immediately forced on");
+        }
         backend_spec            = SAFE_STR(sd_ctx_params->backend);
         params_backend_spec     = SAFE_STR(sd_ctx_params->params_backend);
 
@@ -433,7 +457,7 @@ class StableDiffusionGGML {
         std::map<std::string, ggml_tensor*> mmap_able_tensors;
         bool enable_mmap_tensors = false;
         bool needs_writable_mmap = false;
-        if (sd_ctx_params->enable_mmap) {
+        if (sd_ctx_params->enable_mmap || lazy_loading) {
             if (apply_lora_immediately) {
                 needs_writable_mmap = true;
                 LOG_WARN("in mode 'immediately', LoRAs will cause extra memory usage with mmap");
@@ -1014,7 +1038,7 @@ class StableDiffusionGGML {
             return false;
         }
 
-        bool success = model_loader.load_tensors(tensors, ignore_tensors, n_threads, sd_ctx_params->enable_mmap);
+        bool success = model_loader.load_tensors(tensors, ignore_tensors, n_threads, sd_ctx_params->enable_mmap || lazy_loading);
         if (!success) {
             LOG_ERROR("load tensors from model loader failed");
             ggml_free(ctx);
@@ -4019,6 +4043,13 @@ static std::optional<ImageGenerationEmbeds> prepare_image_generation_embeds(sd_c
 
     if (sd_ctx->sd->free_params_immediately) {
         sd_ctx->sd->cond_stage_model->free_params_buffer();
+        if (sd_ctx->sd->lazy_loading) {
+            sd_ctx->sd->cond_stage_model->free_compute_buffer();
+        }
+    }
+    if (sd_ctx->sd->lazy_loading) {
+        sd_ctx->sd->evict_component_from_ram("text_encoders.");
+        LOG_DEBUG("lazy_loading: text encoder pages evicted");
     }
 
     ImageGenerationEmbeds embeds;
@@ -4062,6 +4093,11 @@ static sd_image_t* decode_image_outputs(sd_ctx_t* sd_ctx,
     LOG_INFO("decode_first_stage completed, taking %.2fs", (t4 - t0) * 1.0f / 1000);
     if (sd_ctx->sd->free_params_immediately) {
         sd_ctx->sd->first_stage_model->free_params_buffer();
+        if (sd_ctx->sd->lazy_loading) {
+            sd_ctx->sd->evict_component_from_ram("first_stage_model."); // AIO GGUF / main model file
+            sd_ctx->sd->evict_component_from_ram("vae.");               // separate --vae file
+            LOG_DEBUG("lazy_loading: VAE pages evicted");
+        }
     }
 
     sd_image_t* result_images = (sd_image_t*)calloc(request.batch_count, sizeof(sd_image_t));
@@ -4260,6 +4296,10 @@ SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* s
 
     int64_t t0                    = ggml_time_ms();
     sd_ctx->sd->vae_tiling_params = sd_img_gen_params->vae_tiling_params;
+    if (sd_ctx->sd->lazy_loading && !sd_ctx->sd->vae_tiling_params.enabled) {
+        sd_ctx->sd->vae_tiling_params.enabled = true;
+        LOG_INFO("lazy_loading: auto-enabling VAE tiling to reduce peak VRAM");
+    }
     GenerationRequest request(sd_ctx, sd_img_gen_params);
     LOG_INFO("generate_image %dx%d", request.width, request.height);
 
@@ -4345,6 +4385,11 @@ SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* s
     }
     if (sd_ctx->sd->free_params_immediately && !request.hires.enabled) {
         sd_ctx->sd->diffusion_model->free_params_buffer();
+        if (sd_ctx->sd->lazy_loading) {
+            sd_ctx->sd->diffusion_model->free_compute_buffer();
+            sd_ctx->sd->evict_component_from_ram("model.diffusion_model.");
+            LOG_DEBUG("lazy_loading: diffusion model pages evicted");
+        }
     }
     int64_t denoise_end = ggml_time_ms();
     LOG_INFO("generating %zu latent images completed, taking %.2fs",
@@ -4468,6 +4513,10 @@ SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* s
         }
         if (sd_ctx->sd->free_params_immediately) {
             sd_ctx->sd->diffusion_model->free_params_buffer();
+            if (sd_ctx->sd->lazy_loading) {
+                sd_ctx->sd->evict_component_from_ram("model.diffusion_model.");
+                LOG_DEBUG("lazy_loading: diffusion model pages evicted (post hires)");
+            }
         }
         int64_t hires_denoise_end = ggml_time_ms();
         LOG_INFO("hires fix completed, taking %.2fs", (hires_denoise_end - hires_denoise_start) * 1.0f / 1000);