mirror of
https://github.com/ollama/ollama.git
synced 2026-01-12 00:06:57 +08:00
* WIP - MLX backend with gemma3 * MLX: add cmake and go tag build toggles To build the new MLX backend code: cmake --preset MLX cmake --build --preset MLX --parallel cmake --install build --component MLX go build -tags mlx . Note: the main.go entrypoint for the MLX engine will change in a follow up commit. * add experimental image generation runtime * add experimental image generation runtime * MLX: wire up cuda build for linux * MLX: get dependencies correct and dedup This is still too large for a unified github artifact, but is now "correct" for the mlx_cuda_v13 directory. * fix relative link bug in dedup * Add darwin build and readme * add go build tag for mlx dependent code and wire up build_darwin.sh * lint cleanup * macos: build mlx for x86 This will be CPU only. * cuda build instructions and fix drift from mlx bump * stale comment * Delete agent helper doc * Clean up readme.md * Revise README for tokenizer clarity and details Updated README to clarify tokenizer functionality and removed correctness section. --------- Co-authored-by: jmorganca <jmorganca@gmail.com>
476 lines
14 KiB
Go
476 lines
14 KiB
Go
//go:build mlx
|
|
|
|
package qwen_image_edit
|
|
|
|
import (
|
|
"fmt"
|
|
"image"
|
|
"image/color"
|
|
_ "image/jpeg"
|
|
_ "image/png"
|
|
"math"
|
|
"os"
|
|
|
|
"github.com/ollama/ollama/x/imagegen/mlx"
|
|
"golang.org/x/image/draw"
|
|
_ "golang.org/x/image/webp"
|
|
)
|
|
|
|
// loadImageFile opens the file at path and decodes it with whichever image
// format decoders are registered with the image package.
//
// The returned error wraps the underlying failure and is prefixed with
// "open image" when the file cannot be opened or "decode image" when its
// contents cannot be decoded.
func loadImageFile(path string) (image.Image, error) {
	file, openErr := os.Open(path)
	if openErr != nil {
		return nil, fmt.Errorf("open image: %w", openErr)
	}
	defer file.Close()

	// The format name reported by Decode is not needed by callers.
	decoded, _, decodeErr := image.Decode(file)
	if decodeErr != nil {
		return nil, fmt.Errorf("decode image: %w", decodeErr)
	}
	return decoded, nil
}
|
|
|
|
// imageToFloat32Pixels converts the width x height region that starts at the
// top-left corner of img into a flat float32 slice laid out as [H, W, C]
// (C = R, G, B) with every channel scaled to the [0, 1] range.
//
// Sampling is relative to img.Bounds().Min, so images whose bounds do not
// start at (0, 0) (for example sub-images) are read correctly. Channel values
// come from color.Color.RGBA, i.e. they are 16-bit alpha-premultiplied values
// divided by 65535; the alpha channel itself is discarded.
func imageToFloat32Pixels(img image.Image, width, height int) []float32 {
	origin := img.Bounds().Min
	pixels := make([]float32, width*height*3)
	idx := 0
	for y := 0; y < height; y++ {
		for x := 0; x < width; x++ {
			// Offset by the image origin: image coordinates are not
			// guaranteed to start at (0, 0).
			r, g, b, _ := img.At(origin.X+x, origin.Y+y).RGBA()
			pixels[idx] = float32(r) / 65535.0
			pixels[idx+1] = float32(g) / 65535.0
			pixels[idx+2] = float32(b) / 65535.0
			idx += 3
		}
	}
	return pixels
}
|
|
|
|
// normalizeImageNet applies ImageNet normalization to an image tensor
|
|
func (p *Processor) normalizeImageNet(arr *mlx.Array) *mlx.Array {
|
|
mean := mlx.NewArray(p.Config.ImageMean, []int32{1, 1, 3})
|
|
std := mlx.NewArray(p.Config.ImageStd, []int32{1, 1, 3})
|
|
return mlx.Div(mlx.Sub(arr, mean), std)
|
|
}
|
|
|
|
// prepareImageTensor transforms [H, W, C] to [B, C, H, W] and converts to bf16
|
|
func prepareImageTensor(arr *mlx.Array) *mlx.Array {
|
|
// Transpose to [C, H, W] and make contiguous
|
|
arr = mlx.Contiguous(mlx.Transpose(arr, 2, 0, 1))
|
|
// Add batch dimension [1, C, H, W]
|
|
arr = mlx.ExpandDims(arr, 0)
|
|
// Convert to bf16
|
|
arr = mlx.ToBFloat16(arr)
|
|
mlx.Eval(arr)
|
|
return arr
|
|
}
|
|
|
|
// clampFloat divides v by weightSum, limits the quotient to [0, 255] and
// returns it rounded to the nearest integer as a uint8.
//
// It is meant for turning a weighted sum of 8-bit channel values back into a
// channel value; weightSum is the sum of the weights that produced v.
func clampFloat(v, weightSum float64) uint8 {
	normalized := v / weightSum
	switch {
	case normalized < 0:
		normalized = 0
	case normalized > 255:
		normalized = 255
	}
	return uint8(math.Round(normalized))
}
|
|
|
|
// ImageDims records every size derived for one preprocessed image. All values
// are in pixels unless stated otherwise, with W = width and H = height.
type ImageDims struct {
	// Size of the image as loaded from disk.
	OrigW int32
	OrigH int32

	// Size of the condition image that is fed to the vision encoder.
	CondW int32
	CondH int32

	// Size of the image that is fed to the VAE.
	VaeW int32
	VaeH int32

	// Latent grid size: VAE size divided by the VAE scale factor.
	LatentW int32
	LatentH int32

	// Patch grid size: latent size divided by the patch size.
	PatchW int32
	PatchH int32
}
|
|
|
|
// ProcessorConfig collects the tunable settings of the image processor.
type ProcessorConfig struct {
	// ConditionImageSize is the target pixel area of the image handed to the
	// vision encoder. The pipeline resizes the input to this area before it
	// reaches encode_prompt.
	// Python: CONDITION_IMAGE_SIZE = 384 * 384 = 147456
	ConditionImageSize int32

	// VAEImageSize is the target pixel area of the image handed to the VAE.
	// Python: VAE_IMAGE_SIZE = 1024 * 1024 = 1048576
	VAEImageSize int32

	// ImageMean and ImageStd are the per-channel (R, G, B) statistics used to
	// normalize the vision encoder input.
	ImageMean []float32
	ImageStd  []float32
}
|
|
|
|
// defaultProcessorConfig returns default processor config
|
|
func defaultProcessorConfig() *ProcessorConfig {
|
|
return &ProcessorConfig{
|
|
ConditionImageSize: 384 * 384, // 147456 - matches Python CONDITION_IMAGE_SIZE
|
|
VAEImageSize: 1024 * 1024, // 1048576 - matches Python VAE_IMAGE_SIZE
|
|
ImageMean: []float32{0.48145466, 0.4578275, 0.40821073},
|
|
ImageStd: []float32{0.26862954, 0.26130258, 0.27577711},
|
|
}
|
|
}
|
|
|
|
// Processor handles image preprocessing for Qwen-Image-Edit
|
|
type Processor struct {
|
|
Config *ProcessorConfig
|
|
}
|
|
|
|
// Load loads the processor config
|
|
func (p *Processor) Load(path string) error {
|
|
p.Config = defaultProcessorConfig()
|
|
return nil
|
|
}
|
|
|
|
// LoadAndPreprocess loads an image and preprocesses it for both paths
|
|
// Returns: condImage (for vision encoder), vaeImage (for VAE encoding)
|
|
func (p *Processor) LoadAndPreprocess(imagePath string) (*mlx.Array, *mlx.Array, error) {
|
|
img, err := loadImageFile(imagePath)
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
|
|
bounds := img.Bounds()
|
|
origW := bounds.Dx()
|
|
origH := bounds.Dy()
|
|
ratio := float64(origW) / float64(origH)
|
|
|
|
// Calculate dimensions for condition image (vision encoder)
|
|
// Python pipeline does TWO resizes:
|
|
// 1. VaeImageProcessor.resize with Lanczos to CONDITION_IMAGE_SIZE (384x384 area)
|
|
// 2. Qwen2VLProcessor's smart_resize with Bicubic to multiple of 28
|
|
intermediateW, intermediateH := calculateDimensions(p.Config.ConditionImageSize, ratio, 32)
|
|
finalH, finalW := smartResize(intermediateH, intermediateW, 28, 56*56, 28*28*1280)
|
|
|
|
// Calculate dimensions for VAE image (1024x1024 area)
|
|
// Use multiple of 32 (vae_scale_factor * patch_size * 2 = 8 * 2 * 2 = 32)
|
|
vaeW, vaeH := calculateDimensions(p.Config.VAEImageSize, ratio, 32)
|
|
|
|
// Preprocess for condition (vision encoder) - two-step resize
|
|
condImage := p.preprocessImageTwoStep(img, intermediateW, intermediateH, finalW, finalH)
|
|
|
|
// Preprocess for VAE ([-1, 1] range, 5D tensor)
|
|
vaeImage := p.preprocessImageForVAE(img, vaeW, vaeH)
|
|
|
|
return condImage, vaeImage, nil
|
|
}
|
|
|
|
// preprocessImageLanczos does single-step Lanczos resize for vision encoder
|
|
// Matches Python VaeImageProcessor.resize with resample='lanczos' (the default)
|
|
// Used by edit_plus pipeline for multi-image input
|
|
// Returns: [B, C, H, W] normalized tensor
|
|
func (p *Processor) preprocessImageLanczos(img image.Image, width, height int32) *mlx.Array {
|
|
resized := resizeImageLanczos(img, int(width), int(height))
|
|
pixels := imageToFloat32Pixels(resized, int(width), int(height))
|
|
arr := mlx.NewArray(pixels, []int32{height, width, 3})
|
|
arr = p.normalizeImageNet(arr)
|
|
return prepareImageTensor(arr)
|
|
}
|
|
|
|
// preprocessImageTwoStep does two-step resize for vision encoder to match Python pipeline
|
|
// Step 1: Lanczos resize from original to intermediate size (VaeImageProcessor.resize)
|
|
// Step 2: Bicubic resize from intermediate to final size (Qwen2VLProcessor smart_resize)
|
|
// Returns: [B, C, H, W] normalized tensor
|
|
func (p *Processor) preprocessImageTwoStep(img image.Image, intermediateW, intermediateH, finalW, finalH int32) *mlx.Array {
|
|
intermediate := resizeImageLanczos(img, int(intermediateW), int(intermediateH))
|
|
resized := resizeImageBicubic(intermediate, int(finalW), int(finalH))
|
|
pixels := imageToFloat32Pixels(resized, int(finalW), int(finalH))
|
|
arr := mlx.NewArray(pixels, []int32{finalH, finalW, 3})
|
|
arr = p.normalizeImageNet(arr)
|
|
return prepareImageTensor(arr)
|
|
}
|
|
|
|
// preprocessImage converts image to tensor for vision encoder
|
|
// Returns: [B, C, H, W] normalized tensor
|
|
func (p *Processor) preprocessImage(img image.Image, width, height int32, normalize bool) *mlx.Array {
|
|
resized := resizeImageBicubic(img, int(width), int(height))
|
|
pixels := imageToFloat32Pixels(resized, int(width), int(height))
|
|
arr := mlx.NewArray(pixels, []int32{height, width, 3})
|
|
if normalize {
|
|
arr = p.normalizeImageNet(arr)
|
|
}
|
|
return prepareImageTensor(arr)
|
|
}
|
|
|
|
// preprocessImageForVAE converts image to tensor for VAE encoding
|
|
// Returns: [B, C, T, H, W] tensor in [-1, 1] range
|
|
func (p *Processor) preprocessImageForVAE(img image.Image, width, height int32) *mlx.Array {
|
|
resized := resizeImageLanczos(img, int(width), int(height))
|
|
pixels := imageToFloat32Pixels(resized, int(width), int(height))
|
|
arr := mlx.NewArray(pixels, []int32{height, width, 3})
|
|
|
|
// Scale to [-1, 1]: arr * 2 - 1
|
|
arr = mlx.MulScalar(arr, 2.0)
|
|
arr = mlx.AddScalar(arr, -1.0)
|
|
|
|
// Transpose to [C, H, W] and make contiguous
|
|
arr = mlx.Contiguous(mlx.Transpose(arr, 2, 0, 1))
|
|
|
|
// Add batch and temporal dimensions [1, C, 1, H, W]
|
|
arr = mlx.ExpandDims(arr, 0) // [1, C, H, W]
|
|
arr = mlx.ExpandDims(arr, 2) // [1, C, 1, H, W]
|
|
|
|
arr = mlx.ToBFloat16(arr)
|
|
mlx.Eval(arr)
|
|
return arr
|
|
}
|
|
|
|
// smartResize implements the Python Qwen2VL processor's smart_resize logic.
//
// It snaps height and width to multiples of factor and, when the snapped area
// falls outside [minPixels, maxPixels], rescales both sides (preserving the
// aspect ratio of the ORIGINAL height/width) so the area fits again.
//
// Returns (resizedHeight, resizedWidth); note the height-first order.
//
// Details that mirror the reference implementation:
//   - Initial snapping rounds half to even, like Python's built-in round.
//   - The rescale factor is derived from the original area height*width, not
//     from the already-snapped area. Using the snapped area can leave the
//     result above maxPixels.
//
// Neither dimension is ever returned smaller than factor. If height or width
// is not positive no meaningful rescale exists, so the snapped (and clamped)
// dimensions are returned unchanged.
func smartResize(height, width, factor, minPixels, maxPixels int32) (int32, int32) {
	h, w, f := float64(height), float64(width), float64(factor)

	// atLeastFactor keeps a dimension from collapsing below one factor cell.
	atLeastFactor := func(v int32) int32 {
		if v < factor {
			return factor
		}
		return v
	}

	// Round to factor
	hBar := atLeastFactor(int32(math.RoundToEven(h/f)) * factor)
	wBar := atLeastFactor(int32(math.RoundToEven(w/f)) * factor)

	area := h * w
	if area <= 0 {
		return hBar, wBar
	}

	// Compare in int64: the product of two int32 values can overflow int32.
	total := int64(hBar) * int64(wBar)
	switch {
	case total > int64(maxPixels):
		// Scale down; floor guarantees the result does not exceed maxPixels
		// (unless a side has to be raised back up to factor).
		beta := math.Sqrt(area / float64(maxPixels))
		hBar = atLeastFactor(int32(math.Floor(h/beta/f)) * factor)
		wBar = atLeastFactor(int32(math.Floor(w/beta/f)) * factor)
	case total < int64(minPixels):
		// Scale up; ceil guarantees the result reaches at least minPixels.
		beta := math.Sqrt(float64(minPixels) / area)
		hBar = int32(math.Ceil(h*beta/f)) * factor
		wBar = int32(math.Ceil(w*beta/f)) * factor
	}

	return hBar, wBar
}
|
|
|
|
// calculateDimensions derives a (width, height) pair whose area approximates
// targetArea and whose width/height proportion approximates ratio.
//
// Both sides are computed from the exact (unrounded) solution, then rounded
// independently to the nearest multiple of `multiple` and raised to at least
// one multiple.
// multiple: the value to round dimensions to (e.g., 28 for vision encoder with patch 14 and 2x2 merge)
func calculateDimensions(targetArea int32, ratio float64, multiple int32) (int32, int32) {
	step := float64(multiple)

	// width * height = targetArea and width / height = ratio
	exactW := math.Sqrt(float64(targetArea) * ratio)
	exactH := exactW / ratio

	// snap rounds to the nearest multiple and enforces the minimum size.
	snap := func(v float64) int32 {
		snapped := math.Round(v/step) * step
		if snapped < step {
			snapped = step
		}
		return int32(snapped)
	}

	return snap(exactW), snap(exactH)
}
|
|
|
|
// resizeImageLanczos resizes an image using Lanczos3 interpolation (matches PIL.LANCZOS)
|
|
func resizeImageLanczos(img image.Image, width, height int) image.Image {
|
|
bounds := img.Bounds()
|
|
dst := image.NewRGBA(image.Rect(0, 0, width, height))
|
|
|
|
// Lanczos3 kernel (a=3) to match PIL.LANCZOS
|
|
lanczos3 := &draw.Kernel{
|
|
Support: 3.0,
|
|
At: func(t float64) float64 {
|
|
if t == 0 {
|
|
return 1.0
|
|
}
|
|
if t < 0 {
|
|
t = -t
|
|
}
|
|
if t >= 3.0 {
|
|
return 0.0
|
|
}
|
|
// sinc(t) * sinc(t/3)
|
|
piT := math.Pi * t
|
|
return (math.Sin(piT) / piT) * (math.Sin(piT/3) / (piT / 3))
|
|
},
|
|
}
|
|
lanczos3.Scale(dst, dst.Bounds(), img, bounds, draw.Over, nil)
|
|
|
|
return dst
|
|
}
|
|
|
|
// resizeImageBicubic resizes an image using bicubic interpolation (matches PIL.BICUBIC)
|
|
// Uses separable interpolation with PIL's coordinate mapping for exact match
|
|
func resizeImageBicubic(img image.Image, width, height int) image.Image {
|
|
bounds := img.Bounds()
|
|
srcW := bounds.Dx()
|
|
srcH := bounds.Dy()
|
|
|
|
// Convert to RGBA if needed
|
|
var src *image.RGBA
|
|
if rgba, ok := img.(*image.RGBA); ok {
|
|
src = rgba
|
|
} else {
|
|
src = image.NewRGBA(bounds)
|
|
for y := bounds.Min.Y; y < bounds.Max.Y; y++ {
|
|
for x := bounds.Min.X; x < bounds.Max.X; x++ {
|
|
src.Set(x, y, img.At(x, y))
|
|
}
|
|
}
|
|
}
|
|
|
|
// Keys cubic with a=-0.5 (PIL BICUBIC)
|
|
cubic := func(x float64) float64 {
|
|
if x < 0 {
|
|
x = -x
|
|
}
|
|
if x < 1 {
|
|
return 1.5*x*x*x - 2.5*x*x + 1
|
|
}
|
|
if x < 2 {
|
|
return -0.5*x*x*x + 2.5*x*x - 4*x + 2
|
|
}
|
|
return 0
|
|
}
|
|
|
|
// Horizontal pass: srcW -> width, keep srcH rows
|
|
temp := image.NewRGBA(image.Rect(0, 0, width, srcH))
|
|
for y := 0; y < srcH; y++ {
|
|
for dstX := 0; dstX < width; dstX++ {
|
|
// PIL coordinate mapping: center-to-center
|
|
srcXf := (float64(dstX)+0.5)*(float64(srcW)/float64(width)) - 0.5
|
|
baseX := int(math.Floor(srcXf))
|
|
|
|
var sumR, sumG, sumB, sumA, weightSum float64
|
|
for i := -1; i <= 2; i++ {
|
|
sx := baseX + i
|
|
if sx < 0 {
|
|
sx = 0
|
|
}
|
|
if sx >= srcW {
|
|
sx = srcW - 1
|
|
}
|
|
|
|
w := cubic(math.Abs(srcXf - float64(baseX+i)))
|
|
c := src.RGBAAt(sx, y)
|
|
sumR += float64(c.R) * w
|
|
sumG += float64(c.G) * w
|
|
sumB += float64(c.B) * w
|
|
sumA += float64(c.A) * w
|
|
weightSum += w
|
|
}
|
|
|
|
temp.SetRGBA(dstX, y, color.RGBA{
|
|
clampFloat(sumR, weightSum),
|
|
clampFloat(sumG, weightSum),
|
|
clampFloat(sumB, weightSum),
|
|
clampFloat(sumA, weightSum),
|
|
})
|
|
}
|
|
}
|
|
|
|
// Vertical pass: srcH -> height
|
|
dst := image.NewRGBA(image.Rect(0, 0, width, height))
|
|
for x := 0; x < width; x++ {
|
|
for dstY := 0; dstY < height; dstY++ {
|
|
srcYf := (float64(dstY)+0.5)*(float64(srcH)/float64(height)) - 0.5
|
|
baseY := int(math.Floor(srcYf))
|
|
|
|
var sumR, sumG, sumB, sumA, weightSum float64
|
|
for j := -1; j <= 2; j++ {
|
|
sy := baseY + j
|
|
if sy < 0 {
|
|
sy = 0
|
|
}
|
|
if sy >= srcH {
|
|
sy = srcH - 1
|
|
}
|
|
|
|
w := cubic(math.Abs(srcYf - float64(baseY+j)))
|
|
c := temp.RGBAAt(x, sy)
|
|
sumR += float64(c.R) * w
|
|
sumG += float64(c.G) * w
|
|
sumB += float64(c.B) * w
|
|
sumA += float64(c.A) * w
|
|
weightSum += w
|
|
}
|
|
|
|
dst.SetRGBA(x, dstY, color.RGBA{
|
|
clampFloat(sumR, weightSum),
|
|
clampFloat(sumG, weightSum),
|
|
clampFloat(sumB, weightSum),
|
|
clampFloat(sumA, weightSum),
|
|
})
|
|
}
|
|
}
|
|
|
|
return dst
|
|
}
|
|
|
|
// LoadAndPreprocessMultiple loads multiple images and preprocesses them
|
|
// Returns: condImages (for vision encoder), vaeImages (for VAE encoding), dims (per-image dimensions)
|
|
func (p *Processor) LoadAndPreprocessMultiple(imagePaths []string) ([]*mlx.Array, []*mlx.Array, []ImageDims, error) {
|
|
const vaeScaleFactor int32 = 8
|
|
const patchSize int32 = 2
|
|
|
|
condImages := make([]*mlx.Array, len(imagePaths))
|
|
vaeImages := make([]*mlx.Array, len(imagePaths))
|
|
dims := make([]ImageDims, len(imagePaths))
|
|
|
|
for i, imagePath := range imagePaths {
|
|
img, err := loadImageFile(imagePath)
|
|
if err != nil {
|
|
return nil, nil, nil, fmt.Errorf("image %d: %w", i, err)
|
|
}
|
|
|
|
bounds := img.Bounds()
|
|
origW := int32(bounds.Dx())
|
|
origH := int32(bounds.Dy())
|
|
ratio := float64(origW) / float64(origH)
|
|
|
|
// Calculate dimensions for condition image (vision encoder)
|
|
// Python pipeline does TWO resizes:
|
|
// 1. VaeImageProcessor.resize with Lanczos to CONDITION_IMAGE_SIZE (384x384 area)
|
|
// 2. Qwen2VLProcessor's smart_resize with Bicubic to multiple of 28
|
|
intermediateW, intermediateH := calculateDimensions(p.Config.ConditionImageSize, ratio, 32)
|
|
condH, condW := smartResize(intermediateH, intermediateW, 28, 56*56, 28*28*1280)
|
|
|
|
// Calculate dimensions for VAE image (1024x1024 area)
|
|
vaeW, vaeH := calculateDimensions(p.Config.VAEImageSize, ratio, 32)
|
|
|
|
// Calculate derived dimensions
|
|
latentW := vaeW / vaeScaleFactor
|
|
latentH := vaeH / vaeScaleFactor
|
|
patchW := latentW / patchSize
|
|
patchH := latentH / patchSize
|
|
|
|
dims[i] = ImageDims{
|
|
OrigW: origW,
|
|
OrigH: origH,
|
|
CondW: condW,
|
|
CondH: condH,
|
|
VaeW: vaeW,
|
|
VaeH: vaeH,
|
|
LatentW: latentW,
|
|
LatentH: latentH,
|
|
PatchW: patchW,
|
|
PatchH: patchH,
|
|
}
|
|
|
|
fmt.Printf(" Image %d: orig=%dx%d, cond=%dx%d, vae=%dx%d, latent=%dx%d, patch=%dx%d\n",
|
|
i+1, origW, origH, condW, condH, vaeW, vaeH, latentW, latentH, patchW, patchH)
|
|
|
|
// Preprocess for condition (vision encoder) - two-step resize to match Python pipeline
|
|
condImages[i] = p.preprocessImageTwoStep(img, intermediateW, intermediateH, condW, condH)
|
|
|
|
// Preprocess for VAE ([-1, 1] range, 5D tensor)
|
|
vaeImages[i] = p.preprocessImageForVAE(img, vaeW, vaeH)
|
|
}
|
|
|
|
return condImages, vaeImages, dims, nil
|
|
}
|