llm: Avoid integer underflow on llama engine memory layout

On the llama engine, when we compute the memory layout, we reserve
a buffer to allow for some flexibility for incorrect estimates.
This is subtracted from GPU free memory and on GPUs with limited
memory, it may underflow.

Fixes #13494
This commit is contained in:
Jesse Gross
2025-12-19 15:10:55 -08:00
committed by Jesse Gross
parent 8852220f59
commit 172b5924af

View File

@@ -524,8 +524,13 @@ func (s *llamaServer) Load(ctx context.Context, systemInfo ml.SystemInfo, system
 	// Use the size of one layer as a buffer
 	layers := s.ggml.Tensors().GroupLayers()
 	if blk0, ok := layers["blk.0"]; ok {
+		buffer := blk0.Size() + kv[0]
 		for i := range gpus {
-			gpus[i].FreeMemory -= blk0.Size() + kv[0]
+			if gpus[i].FreeMemory > buffer {
+				gpus[i].FreeMemory -= buffer
+			} else {
+				gpus[i].FreeMemory = 0
+			}
 		}
 	} else {
 		slog.Warn("model missing blk.0 layer size")
@@ -575,7 +580,11 @@ func (s *llamaServer) Load(ctx context.Context, systemInfo ml.SystemInfo, system
 			projectorGPU = firstIntegrated
 		}
-		gpus[projectorGPU].FreeMemory -= projectorWeights
+		if gpus[projectorGPU].FreeMemory > projectorWeights {
+			gpus[projectorGPU].FreeMemory -= projectorWeights
+		} else {
+			gpus[projectorGPU].FreeMemory = 0
+		}
 	}
 	var kvTotal uint64