llm: Avoid integer underflow on llama engine memory layout

On the llama engine, when we compute the memory layout, we reserve
a buffer to allow for some flexibility for incorrect estimates.
This is subtracted from GPU free memory and on GPUs with limited
memory, it may underflow.

Fixes #13494
This commit is contained in:
Jesse Gross
2025-12-19 15:10:55 -08:00
committed by Jesse Gross
parent 8852220f59
commit 172b5924af

View File

@@ -524,8 +524,13 @@ func (s *llamaServer) Load(ctx context.Context, systemInfo ml.SystemInfo, system
 	// Use the size of one layer as a buffer
 	layers := s.ggml.Tensors().GroupLayers()
 	if blk0, ok := layers["blk.0"]; ok {
+		buffer := blk0.Size() + kv[0]
 		for i := range gpus {
-			gpus[i].FreeMemory -= blk0.Size() + kv[0]
+			if gpus[i].FreeMemory > buffer {
+				gpus[i].FreeMemory -= buffer
+			} else {
+				gpus[i].FreeMemory = 0
+			}
 		}
 	} else {
 		slog.Warn("model missing blk.0 layer size")
@@ -575,7 +580,11 @@ func (s *llamaServer) Load(ctx context.Context, systemInfo ml.SystemInfo, system
 			projectorGPU = firstIntegrated
 		}
-		gpus[projectorGPU].FreeMemory -= projectorWeights
+		if gpus[projectorGPU].FreeMemory > projectorWeights {
+			gpus[projectorGPU].FreeMemory -= projectorWeights
+		} else {
+			gpus[projectorGPU].FreeMemory = 0
+		}
 	}
 	var kvTotal uint64