
Commit 253e208

tool/ex/tests: consistently free ctx, then model
1 parent ec7b932 commit 253e208

6 files changed: +47 -18 lines changed


common/common.cpp

Lines changed: 2 additions & 0 deletions
@@ -1078,6 +1078,8 @@ struct common_init_result::impl {
     impl() = default;
     ~impl() = default;
 
+    // note: the order in which model, context, etc. are declared matters because their destructors will be called bottom-to-top
+
     llama_model_ptr model;
     llama_context_ptr context;
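
The declaration order matters because C++ destroys non-static data members in reverse declaration order: with model declared above context, ~impl() tears down the context while the model is still alive. A minimal sketch of that rule, using hypothetical Model/Context types rather than the real ones:

    // minimal sketch, not from the repo: members are destroyed bottom-to-top,
    // so declaring `model` above `context` guarantees the context dies first
    #include <cstdio>

    struct Model   { ~Model()   { std::puts("model destroyed");   } };
    struct Context { ~Context() { std::puts("context destroyed"); } };

    struct Holder {
        Model   model;    // declared first, destroyed last
        Context context;  // declared last, destroyed first
    };

    int main() {
        Holder h;
        // prints on scope exit:
        //   context destroyed
        //   model destroyed
    }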

src/llama-context.cpp

Lines changed: 16 additions & 17 deletions
@@ -459,23 +459,22 @@ llama_context::llama_context(
 }
 
 llama_context::~llama_context() {
-    // FIXME this currently results in a use-after-free bug if the model is freed before the context
-    // if (!model.hparams.no_alloc) {
-    //     for (size_t i = 0; i < backend_ptrs.size(); ++i) {
-    //         ggml_backend_t backend = backend_ptrs[i];
-    //         ggml_backend_buffer_type_t buft = backend_buft[i];
-
-    //         const size_t size_exp = backend_buf_exp_size[i];
-    //         const size_t size_act = ggml_backend_sched_get_buffer_size(sched.get(), backend);
-    //         if (size_exp == size_act) {
-    //             LLAMA_LOG_DEBUG("%s: %10s compute buffer size is %8.4f MiB, matches expectation of %8.4f MiB\n",
-    //                 __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
-    //         } else {
-    //             LLAMA_LOG_WARN("%s: %10s compute buffer size of %8.4f MiB, does not match expectation of %8.4f MiB\n",
-    //                 __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
-    //         }
-    //     }
-    // }
+    if (!model.hparams.no_alloc) {
+        for (size_t i = 0; i < backend_ptrs.size(); ++i) {
+            ggml_backend_t backend = backend_ptrs[i];
+            ggml_backend_buffer_type_t buft = backend_buft[i];
+
+            const size_t size_exp = backend_buf_exp_size[i];
+            const size_t size_act = ggml_backend_sched_get_buffer_size(sched.get(), backend);
+            if (size_exp == size_act) {
+                LLAMA_LOG_DEBUG("%s: %10s compute buffer size is %8.4f MiB, matches expectation of %8.4f MiB\n",
+                    __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
+            } else {
+                LLAMA_LOG_WARN("%s: %10s compute buffer size of %8.4f MiB, does not match expectation of %8.4f MiB\n",
+                    __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
+            }
+        }
+    }
     ggml_opt_free(opt_ctx);
 }
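
This check can only be re-enabled because it reads model.hparams inside the context destructor, which is safe only while the model still exists. That is exactly what the rest of the commit enforces at every call site: free the context first, then the model. A minimal usage sketch of that order (the model path and default params are placeholders, not taken from the commit):

    // sketch only: the teardown order this commit standardizes on
    llama_model   * model = llama_model_load_from_file("model.gguf", llama_model_default_params());
    llama_context * ctx   = llama_init_from_model(model, llama_context_default_params());

    // ... use ctx ...

    llama_free(ctx);          // context first: ~llama_context may still read from the model
    llama_model_free(model);  // model last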

tests/test-grammar-llguidance.cpp

Lines changed: 3 additions & 0 deletions
@@ -1196,6 +1196,9 @@ int main(int argc, const char ** argv) {
 
     test_sampler_chain();
 
+    llama_free(ctx);
+    llama_model_free(model);
+
     fprintf(stdout, "All tests passed.\n");
     return 0;
 }

tests/test-tokenizer-1-bpe.cpp

Lines changed: 1 addition & 1 deletion
@@ -146,8 +146,8 @@ int main(int argc, char **argv) {
         }
     }
 
-    llama_model_free(model);
     llama_free(ctx);
+    llama_model_free(model);
 
     llama_backend_free();
 

tools/batched-bench/batched-bench.cpp

Lines changed: 11 additions & 0 deletions
@@ -55,6 +55,7 @@ int main(int argc, char ** argv) {
 
     if (ctx == NULL) {
         fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
+        llama_model_free(model);
         return 1;
     }
 
@@ -108,6 +109,8 @@ int main(int argc, char ** argv) {
 
         if (!decode_helper(ctx, batch, ctx_params.n_batch, true)) {
             LOG_ERR("%s: llama_decode() failed\n", __func__);
+            llama_free(ctx);
+            llama_model_free(model);
             return 1;
         }
     }
@@ -147,6 +150,8 @@ int main(int argc, char ** argv) {
 
         if (!decode_helper(ctx, batch, ctx_params.n_batch, false)) {
             LOG_ERR("%s: llama_decode() failed\n", __func__);
+            llama_free(ctx);
+            llama_model_free(model);
            return 1;
        }
 
@@ -165,6 +170,8 @@ int main(int argc, char ** argv) {
             common_batch_add(batch, get_token_rand(), pp + 0, { 0 }, true);
             if (!decode_helper(ctx, batch, ctx_params.n_batch, true)) {
                 LOG_ERR("%s: llama_decode() failed\n", __func__);
+                llama_free(ctx);
+                llama_model_free(model);
                 return 1;
             }
             llama_memory_seq_rm(mem, 0, pp, -1);
@@ -184,6 +191,8 @@ int main(int argc, char ** argv) {
 
             if (!decode_helper(ctx, batch, ctx_params.n_batch, true)) {
                 LOG_ERR("%s: llama_decode() failed\n", __func__);
+                llama_free(ctx);
+                llama_model_free(model);
                 return 1;
             }
         }
@@ -200,6 +209,8 @@ int main(int argc, char ** argv) {
 
             if (!decode_helper(ctx, batch, ctx_params.n_batch, true)) {
                 LOG_ERR("%s: llama_decode() failed\n", __func__);
+                llama_free(ctx);
+                llama_model_free(model);
                 return 1;
             }
         }
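
Not part of this commit, but the repeated manual cleanup on every early return is the pattern the llama_model_ptr / llama_context_ptr wrappers (from llama-cpp.h, already used in common/common.cpp above) exist to avoid. A rough sketch of how such error paths could rely on RAII instead; the load/init calls and the run_bench helper are assumptions for illustration:

    // sketch only: unique_ptr wrappers free ctx, then model, on every return path
    #include "llama-cpp.h"

    static int run_bench(const char * path) {
        llama_model_ptr model(llama_model_load_from_file(path, llama_model_default_params()));
        if (!model) {
            return 1;  // nothing to free yet
        }

        llama_context_ptr ctx(llama_init_from_model(model.get(), llama_context_default_params()));
        if (!ctx) {
            return 1;  // model freed automatically
        }

        // ... decode loops; any early return here frees ctx first, then model,
        // because locals are destroyed in reverse order of declaration ...
        return 0;
    }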

tools/llama-bench/llama-bench.cpp

Lines changed: 14 additions & 0 deletions
@@ -2102,6 +2102,8 @@ int main(int argc, char ** argv) {
         struct ggml_threadpool_params tpp = ggml_threadpool_params_default(t.n_threads);
         if (!parse_cpu_mask(t.cpu_mask, tpp.cpumask)) {
             fprintf(stderr, "%s: failed to parse cpu-mask: %s\n", __func__, t.cpu_mask.c_str());
+            llama_free(ctx);
+            llama_model_free(lmodel);
             exit(1);
         }
         tpp.strict_cpu = t.cpu_strict;
@@ -2111,6 +2113,8 @@ int main(int argc, char ** argv) {
         struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp);
         if (!threadpool) {
             fprintf(stderr, "%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
+            llama_free(ctx);
+            llama_model_free(lmodel);
             exit(1);
         }
 
@@ -2126,6 +2130,8 @@ int main(int argc, char ** argv) {
             bool res = test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
             if (!res) {
                 fprintf(stderr, "%s: error: failed to run prompt warmup\n", __func__);
+                llama_free(ctx);
+                llama_model_free(lmodel);
                 exit(1);
             }
         }
@@ -2136,6 +2142,8 @@ int main(int argc, char ** argv) {
             bool res = test_gen(ctx, 1, t.n_threads);
             if (!res) {
                 fprintf(stderr, "%s: error: failed to run gen warmup\n", __func__);
+                llama_free(ctx);
+                llama_model_free(lmodel);
                 exit(1);
             }
         }
@@ -2164,6 +2172,8 @@ int main(int argc, char ** argv) {
                 bool res = test_prompt(ctx, t.n_depth, t.n_batch, t.n_threads);
                 if (!res) {
                     fprintf(stderr, "%s: error: failed to run depth\n", __func__);
+                    llama_free(ctx);
+                    llama_model_free(lmodel);
                     exit(1);
                 }
 
@@ -2189,6 +2199,8 @@ int main(int argc, char ** argv) {
                 bool res = test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
                 if (!res) {
                     fprintf(stderr, "%s: error: failed to run prompt\n", __func__);
+                    llama_free(ctx);
+                    llama_model_free(lmodel);
                     exit(1);
                 }
             }
@@ -2200,6 +2212,8 @@ int main(int argc, char ** argv) {
                 bool res = test_gen(ctx, t.n_gen, t.n_threads);
                 if (!res) {
                     fprintf(stderr, "%s: error: failed to run gen\n", __func__);
+                    llama_free(ctx);
+                    llama_model_free(lmodel);
                     exit(1);
                 }
             }

0 commit comments
