From 172b5924af1f08277a7d6133d9bbfd4bd7438f01 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Fri, 19 Dec 2025 15:10:55 -0800 Subject: [PATCH] llm: Avoid integer underflow on llama engine memory layout On the llama engine, when we compute the memory layout, we reserve a buffer to allow some flexibility for incorrect estimates. This buffer is subtracted from GPU free memory, and on GPUs with limited memory the subtraction may underflow. Fixes #13494 --- llm/server.go | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/llm/server.go b/llm/server.go index a89027b06..c83bd5a40 100644 --- a/llm/server.go +++ b/llm/server.go @@ -524,8 +524,13 @@ func (s *llamaServer) Load(ctx context.Context, systemInfo ml.SystemInfo, system // Use the size of one layer as a buffer layers := s.ggml.Tensors().GroupLayers() if blk0, ok := layers["blk.0"]; ok { + buffer := blk0.Size() + kv[0] for i := range gpus { - gpus[i].FreeMemory -= blk0.Size() + kv[0] + if gpus[i].FreeMemory > buffer { + gpus[i].FreeMemory -= buffer + } else { + gpus[i].FreeMemory = 0 + } } } else { slog.Warn("model missing blk.0 layer size") @@ -575,7 +580,11 @@ func (s *llamaServer) Load(ctx context.Context, systemInfo ml.SystemInfo, system projectorGPU = firstIntegrated } - gpus[projectorGPU].FreeMemory -= projectorWeights + if gpus[projectorGPU].FreeMemory > projectorWeights { + gpus[projectorGPU].FreeMemory -= projectorWeights + } else { + gpus[projectorGPU].FreeMemory = 0 + } } var kvTotal uint64