ollama/x/imagegen/models/qwen_image_edit/processor.go

//go:build mlx

package qwen_image_edit

import (
	"fmt"
	"image"
	"image/color"
	_ "image/jpeg"
	_ "image/png"
	"math"
	"os"

	"github.com/ollama/ollama/x/imagegen/mlx"
	"golang.org/x/image/draw"
	_ "golang.org/x/image/webp"
)

// loadImageFile opens the file at path and decodes it with whichever image
// format decoders are registered in this binary.
//
// It returns the decoded image, or a wrapped error prefixed with
// "open image" or "decode image" depending on which step failed.
func loadImageFile(path string) (image.Image, error) {
	file, openErr := os.Open(path)
	if openErr != nil {
		return nil, fmt.Errorf("open image: %w", openErr)
	}
	// The file is only needed for the duration of the decode.
	defer file.Close()

	decoded, _, decodeErr := image.Decode(file)
	if decodeErr != nil {
		return nil, fmt.Errorf("decode image: %w", decodeErr)
	}
	return decoded, nil
}

// imageToFloat32Pixels converts the top-left width x height region of img into
// a flat float32 slice laid out as [H, W, C] with C = 3 (R, G, B), each value
// scaled to the [0, 1] range.
//
// Sampling starts at img.Bounds().Min rather than at (0, 0), so images whose
// bounds do not start at the origin (for example sub-images) are read
// correctly. Channel values come from color.Color.RGBA, which yields 16-bit
// alpha-premultiplied components; the alpha channel itself is discarded.
func imageToFloat32Pixels(img image.Image, width, height int) []float32 {
	origin := img.Bounds().Min
	pixels := make([]float32, width*height*3)
	idx := 0
	for y := 0; y < height; y++ {
		for x := 0; x < width; x++ {
			// Offset by the image origin: At takes absolute coordinates.
			r, g, b, _ := img.At(origin.X+x, origin.Y+y).RGBA()
			// RGBA components are in [0, 65535].
			pixels[idx] = float32(r) / 65535.0
			pixels[idx+1] = float32(g) / 65535.0
			pixels[idx+2] = float32(b) / 65535.0
			idx += 3
		}
	}
	return pixels
}

// normalizeImageNet standardizes an image tensor per channel: it subtracts
// p.Config.ImageMean and divides by p.Config.ImageStd.
//
// Both statistics are wrapped as arrays of shape [1, 1, 3], so arr is
// presumably expected to carry its three channels on the last axis
// ([H, W, 3]) — confirm against mlx broadcasting rules.
func (p *Processor) normalizeImageNet(arr *mlx.Array) *mlx.Array {
	channelMean := mlx.NewArray(p.Config.ImageMean, []int32{1, 1, 3})
	channelStd := mlx.NewArray(p.Config.ImageStd, []int32{1, 1, 3})

	centered := mlx.Sub(arr, channelMean)
	return mlx.Div(centered, channelStd)
}

// prepareImageTensor reorders an [H, W, C] tensor into [1, C, H, W], casts it
// to bf16 and evaluates it before returning.
func prepareImageTensor(arr *mlx.Array) *mlx.Array {
	// Channels first, materialized as a contiguous array: [C, H, W].
	chw := mlx.Contiguous(mlx.Transpose(arr, 2, 0, 1))

	// Leading batch axis: [1, C, H, W].
	batched := mlx.ExpandDims(chw, 0)

	// Cast to bf16 and force evaluation so the caller receives a computed array.
	out := mlx.ToBFloat16(batched)
	mlx.Eval(out)
	return out
}

// clampFloat divides the weighted sum v by weightSum, clamps the quotient to
// [0, 255], rounds it to the nearest integer and returns it as a uint8.
func clampFloat(v, weightSum float64) uint8 {
	normalized := v / weightSum
	switch {
	case normalized < 0:
		return 0
	case normalized > 255:
		return 255
	}
	return uint8(math.Round(normalized))
}

// ImageDims records every resolution associated with one preprocessed image,
// from the original file down to the patch grid.
type ImageDims struct {
	// Dimensions of the image as decoded.
	OrigW int32
	OrigH int32

	// Dimensions of the condition image (vision encoder input).
	CondW int32
	CondH int32

	// Dimensions of the image handed to the VAE.
	VaeW int32
	VaeH int32

	// Latent dimensions (VAE dims / vae_scale_factor).
	LatentW int32
	LatentH int32

	// Patch grid dimensions (latent dims / patch_size).
	PatchW int32
	PatchH int32
}

// ProcessorConfig collects the tunable parameters of the image processor:
// target pixel areas for the two preprocessing paths and the per-channel
// normalization statistics.
type ProcessorConfig struct {
	// ConditionImageSize is the target pixel area (not a side length) of the
	// image fed to the vision encoder.
	// Python: CONDITION_IMAGE_SIZE = 384 * 384 = 147456
	// The pipeline resizes the image to this area before encode_prompt.
	ConditionImageSize int32

	// VAEImageSize is the target pixel area of the image fed to the VAE.
	// Python: VAE_IMAGE_SIZE = 1024 * 1024 = 1048576
	VAEImageSize int32

	// ImageMean and ImageStd are the per-channel (R, G, B) statistics used to
	// normalize the vision encoder input.
	ImageMean, ImageStd []float32
}

// defaultProcessorConfig builds a fresh ProcessorConfig populated with the
// built-in defaults. Each call returns a newly allocated value.
//
// NOTE(review): the mean/std values look like the OpenAI CLIP statistics
// rather than the classic ImageNet ones — confirm against the reference
// processor config.
func defaultProcessorConfig() *ProcessorConfig {
	const (
		conditionSide = 384  // 384 * 384 = 147456, matches Python CONDITION_IMAGE_SIZE
		vaeSide       = 1024 // 1024 * 1024 = 1048576, matches Python VAE_IMAGE_SIZE
	)

	cfg := new(ProcessorConfig)
	cfg.ConditionImageSize = conditionSide * conditionSide
	cfg.VAEImageSize = vaeSide * vaeSide
	cfg.ImageMean = []float32{0.48145466, 0.4578275, 0.40821073}
	cfg.ImageStd = []float32{0.26862954, 0.26130258, 0.27577711}
	return cfg
}

// Processor handles image preprocessing for Qwen-Image-Edit.
// Its methods read the target pixel areas and normalization statistics from
// Config, so Config must be populated (for example via Load) before they are
// called.
type Processor struct {
	// Config holds the preprocessing parameters; Load sets it to the defaults.
	Config *ProcessorConfig
}

// Load initializes p.Config with the built-in default processor config.
// The path argument is currently unused and the method always returns nil.
func (p *Processor) Load(path string) error {
	p.Config = defaultProcessorConfig()
	return nil
}

// LoadAndPreprocess loads the image at imagePath and preprocesses it for both
// model paths.
//
// Returns: condImage (for the vision encoder), vaeImage (for VAE encoding).
// An error is returned when the file cannot be opened or decoded, or when the
// decoded image has a zero width or height (its aspect ratio is undefined).
func (p *Processor) LoadAndPreprocess(imagePath string) (*mlx.Array, *mlx.Array, error) {
	img, err := loadImageFile(imagePath)
	if err != nil {
		return nil, nil, err
	}

	bounds := img.Bounds()
	origW := bounds.Dx()
	origH := bounds.Dy()
	// A degenerate image would yield a NaN/Inf ratio, which then flows into
	// float-to-int conversions and tensor shapes. Fail early instead.
	if origW <= 0 || origH <= 0 {
		return nil, nil, fmt.Errorf("invalid image dimensions %dx%d", origW, origH)
	}
	ratio := float64(origW) / float64(origH)

	// Calculate dimensions for condition image (vision encoder)
	// Python pipeline does TWO resizes:
	// 1. VaeImageProcessor.resize with Lanczos to CONDITION_IMAGE_SIZE (384x384 area)
	// 2. Qwen2VLProcessor's smart_resize with Bicubic to multiple of 28
	intermediateW, intermediateH := calculateDimensions(p.Config.ConditionImageSize, ratio, 32)
	// smartResize takes and returns (height, width), in that order.
	finalH, finalW := smartResize(intermediateH, intermediateW, 28, 56*56, 28*28*1280)

	// Calculate dimensions for VAE image (1024x1024 area)
	// Use multiple of 32 (vae_scale_factor * patch_size * 2 = 8 * 2 * 2 = 32)
	vaeW, vaeH := calculateDimensions(p.Config.VAEImageSize, ratio, 32)

	// Preprocess for condition (vision encoder) - two-step resize
	condImage := p.preprocessImageTwoStep(img, intermediateW, intermediateH, finalW, finalH)

	// Preprocess for VAE ([-1, 1] range, 5D tensor)
	vaeImage := p.preprocessImageForVAE(img, vaeW, vaeH)

	return condImage, vaeImage, nil
}

// preprocessImageLanczos prepares a vision-encoder input with a single Lanczos
// resize to width x height.
// Matches Python VaeImageProcessor.resize with resample='lanczos' (the default).
// Used by the edit_plus pipeline for multi-image input.
// Returns: [B, C, H, W] normalized tensor
func (p *Processor) preprocessImageLanczos(img image.Image, width, height int32) *mlx.Array {
	w, h := int(width), int(height)

	scaled := resizeImageLanczos(img, w, h)
	hwc := mlx.NewArray(imageToFloat32Pixels(scaled, w, h), []int32{height, width, 3})

	return prepareImageTensor(p.normalizeImageNet(hwc))
}

// preprocessImageTwoStep prepares a vision-encoder input using the same two
// resizes as the Python pipeline:
//  1. Lanczos from the original size to the intermediate size (VaeImageProcessor.resize)
//  2. Bicubic from the intermediate size to the final size (Qwen2VLProcessor smart_resize)
//
// Returns: [B, C, H, W] normalized tensor
func (p *Processor) preprocessImageTwoStep(img image.Image, intermediateW, intermediateH, finalW, finalH int32) *mlx.Array {
	outW, outH := int(finalW), int(finalH)

	firstPass := resizeImageLanczos(img, int(intermediateW), int(intermediateH))
	secondPass := resizeImageBicubic(firstPass, outW, outH)

	hwc := mlx.NewArray(imageToFloat32Pixels(secondPass, outW, outH), []int32{finalH, finalW, 3})
	return prepareImageTensor(p.normalizeImageNet(hwc))
}

// preprocessImage bicubically resizes img to width x height and converts it to
// a vision-encoder tensor. Mean/std normalization is applied only when
// normalize is true.
// Returns: [B, C, H, W] tensor
func (p *Processor) preprocessImage(img image.Image, width, height int32, normalize bool) *mlx.Array {
	w, h := int(width), int(height)

	scaled := resizeImageBicubic(img, w, h)
	tensor := mlx.NewArray(imageToFloat32Pixels(scaled, w, h), []int32{height, width, 3})

	if !normalize {
		return prepareImageTensor(tensor)
	}
	return prepareImageTensor(p.normalizeImageNet(tensor))
}

// preprocessImageForVAE Lanczos-resizes img to width x height and converts it
// to the tensor layout used for VAE encoding.
// Returns: [B, C, T, H, W] bf16 tensor in [-1, 1] range
func (p *Processor) preprocessImageForVAE(img image.Image, width, height int32) *mlx.Array {
	w, h := int(width), int(height)

	scaled := resizeImageLanczos(img, w, h)
	hwc := mlx.NewArray(imageToFloat32Pixels(scaled, w, h), []int32{height, width, 3})

	// Map [0, 1] onto [-1, 1]: x * 2 - 1.
	signed := mlx.AddScalar(mlx.MulScalar(hwc, 2.0), -1.0)

	// Channels first, contiguous: [C, H, W].
	chw := mlx.Contiguous(mlx.Transpose(signed, 2, 0, 1))

	// Insert the batch axis, then the temporal axis.
	batched := mlx.ExpandDims(chw, 0)    // [1, C, H, W]
	framed := mlx.ExpandDims(batched, 2) // [1, C, 1, H, W]

	out := mlx.ToBFloat16(framed)
	mlx.Eval(out)
	return out
}

// smartResize implements the Python Qwen2VL processor's smart_resize logic.
//
// Both dimensions are rounded to the nearest multiple of factor (at least
// factor). If the rounded area exceeds maxPixels the image is scaled down
// (flooring to a multiple of factor); if it falls below minPixels it is scaled
// up (ceiling to a multiple of factor). As in the reference implementation,
// the scale factor beta is derived from the ORIGINAL height*width rather than
// from the rounded area; otherwise the result can still violate the pixel
// bounds.
//
// Rounding uses half-to-even to match Python's round().
//
// Parameters are (height, width, ...) and the result is
// (resizedHeight, resizedWidth), in that order. factor must be positive.
func smartResize(height, width, factor, minPixels, maxPixels int32) (int32, int32) {
	h := float64(height)
	w := float64(width)
	f := float64(factor)

	// Round each side to the nearest multiple of factor.
	hBar := int32(math.RoundToEven(h/f)) * factor
	wBar := int32(math.RoundToEven(w/f)) * factor

	// Never go below one factor per side.
	if hBar < factor {
		hBar = factor
	}
	if wBar < factor {
		wBar = factor
	}

	area := h * w
	if area <= 0 {
		// Degenerate input: no meaningful scale factor can be derived.
		return hBar, wBar
	}

	// Compare in 64 bits so the product cannot overflow int32.
	total := int64(hBar) * int64(wBar)
	switch {
	case total > int64(maxPixels):
		// Scale down so that the area fits into maxPixels.
		beta := math.Sqrt(area / float64(maxPixels))
		hBar = int32(math.Floor(h/beta/f)) * factor
		wBar = int32(math.Floor(w/beta/f)) * factor
		// Flooring may reach zero for extreme aspect ratios.
		if hBar < factor {
			hBar = factor
		}
		if wBar < factor {
			wBar = factor
		}
	case total < int64(minPixels):
		// Scale up so that the area reaches minPixels.
		beta := math.Sqrt(float64(minPixels) / area)
		hBar = int32(math.Ceil(h*beta/f)) * factor
		wBar = int32(math.Ceil(w*beta/f)) * factor
	}

	return hBar, wBar
}

// calculateDimensions picks a width and height whose product approximates
// targetArea while keeping the aspect ratio (width / height) given by ratio.
// Each side is rounded to the nearest multiple of `multiple` and is never
// smaller than one `multiple`.
// Returns (width, height).
func calculateDimensions(targetArea int32, ratio float64, multiple int32) (int32, int32) {
	step := float64(multiple)

	// snap rounds a side length to the grid and enforces the minimum size.
	snap := func(side float64) int32 {
		side = math.Round(side/step) * step
		if side < step {
			side = step
		}
		return int32(side)
	}

	// Solve w*h = area and w/h = ratio for the ideal (unrounded) sides.
	idealW := math.Sqrt(float64(targetArea) * ratio)
	idealH := idealW / ratio

	return snap(idealW), snap(idealH)
}

// resizeImageLanczos scales img to width x height with a Lanczos kernel of
// support 3 (intended to match PIL.LANCZOS) and returns the result as a new
// RGBA image whose bounds start at (0, 0).
func resizeImageLanczos(img image.Image, width, height int) image.Image {
	const support = 3.0

	// Lanczos3 window: sinc(t) * sinc(t/3) for |t| < 3, zero elsewhere.
	kernel := &draw.Kernel{
		Support: support,
		At: func(t float64) float64 {
			t = math.Abs(t)
			switch {
			case t == 0:
				// The limit of sinc at zero.
				return 1.0
			case t >= support:
				return 0.0
			}
			x := math.Pi * t
			return (math.Sin(x) / x) * (math.Sin(x/3) / (x / 3))
		},
	}

	out := image.NewRGBA(image.Rect(0, 0, width, height))
	kernel.Scale(out, out.Bounds(), img, img.Bounds(), draw.Over, nil)
	return out
}

// resizeImageBicubic resizes img to width x height using separable bicubic
// interpolation (Keys kernel, a = -0.5, as used by PIL.BICUBIC) with
// center-to-center coordinate mapping. The result is a new RGBA image whose
// bounds start at (0, 0).
//
// The source is sampled relative to img.Bounds().Min, so images whose bounds
// do not start at the origin (for example sub-images) are handled correctly.
// Samples outside the source are clamped to the nearest edge pixel. The
// horizontal pass writes an 8-bit intermediate image, so values are quantized
// between the two passes.
//
// NOTE(review): the kernel always uses 4 taps per axis. Pillow is understood
// to widen the filter support when downscaling, so an exact match with
// PIL.BICUBIC is only expected for scale factors close to or above 1 —
// confirm against the reference pipeline.
func resizeImageBicubic(img image.Image, width, height int) image.Image {
	bounds := img.Bounds()
	srcW := bounds.Dx()
	srcH := bounds.Dy()
	// Pixel accessors take absolute coordinates; remember the origin.
	minX, minY := bounds.Min.X, bounds.Min.Y

	// Convert to RGBA if needed
	var src *image.RGBA
	if rgba, ok := img.(*image.RGBA); ok {
		src = rgba
	} else {
		src = image.NewRGBA(bounds)
		for y := bounds.Min.Y; y < bounds.Max.Y; y++ {
			for x := bounds.Min.X; x < bounds.Max.X; x++ {
				src.Set(x, y, img.At(x, y))
			}
		}
	}

	// Keys cubic with a=-0.5 (PIL BICUBIC)
	cubic := func(x float64) float64 {
		if x < 0 {
			x = -x
		}
		if x < 1 {
			return 1.5*x*x*x - 2.5*x*x + 1
		}
		if x < 2 {
			return -0.5*x*x*x + 2.5*x*x - 4*x + 2
		}
		return 0
	}

	// Source pixels per destination pixel along each axis (loop invariant).
	scaleX := float64(srcW) / float64(width)
	scaleY := float64(srcH) / float64(height)

	// Horizontal pass: srcW -> width, keep srcH rows
	temp := image.NewRGBA(image.Rect(0, 0, width, srcH))
	for y := 0; y < srcH; y++ {
		for dstX := 0; dstX < width; dstX++ {
			// PIL coordinate mapping: center-to-center
			srcXf := (float64(dstX)+0.5)*scaleX - 0.5
			baseX := int(math.Floor(srcXf))

			var sumR, sumG, sumB, sumA, weightSum float64
			for i := -1; i <= 2; i++ {
				// Clamp the tap to the source edge.
				sx := baseX + i
				if sx < 0 {
					sx = 0
				}
				if sx >= srcW {
					sx = srcW - 1
				}

				// The weight uses the unclamped tap position.
				w := cubic(math.Abs(srcXf - float64(baseX+i)))
				c := src.RGBAAt(minX+sx, minY+y)
				sumR += float64(c.R) * w
				sumG += float64(c.G) * w
				sumB += float64(c.B) * w
				sumA += float64(c.A) * w
				weightSum += w
			}

			temp.SetRGBA(dstX, y, color.RGBA{
				clampFloat(sumR, weightSum),
				clampFloat(sumG, weightSum),
				clampFloat(sumB, weightSum),
				clampFloat(sumA, weightSum),
			})
		}
	}

	// Vertical pass: srcH -> height (temp starts at the origin, no offset needed)
	dst := image.NewRGBA(image.Rect(0, 0, width, height))
	for x := 0; x < width; x++ {
		for dstY := 0; dstY < height; dstY++ {
			srcYf := (float64(dstY)+0.5)*scaleY - 0.5
			baseY := int(math.Floor(srcYf))

			var sumR, sumG, sumB, sumA, weightSum float64
			for j := -1; j <= 2; j++ {
				sy := baseY + j
				if sy < 0 {
					sy = 0
				}
				if sy >= srcH {
					sy = srcH - 1
				}

				w := cubic(math.Abs(srcYf - float64(baseY+j)))
				c := temp.RGBAAt(x, sy)
				sumR += float64(c.R) * w
				sumG += float64(c.G) * w
				sumB += float64(c.B) * w
				sumA += float64(c.A) * w
				weightSum += w
			}

			dst.SetRGBA(x, dstY, color.RGBA{
				clampFloat(sumR, weightSum),
				clampFloat(sumG, weightSum),
				clampFloat(sumB, weightSum),
				clampFloat(sumA, weightSum),
			})
		}
	}

	return dst
}

// LoadAndPreprocessMultiple loads every image in imagePaths and preprocesses
// each one for both model paths.
//
// Returns: condImages (for vision encoder), vaeImages (for VAE encoding),
// dims (per-image dimensions); all three slices are indexed like imagePaths.
//
// Processing stops at the first failure. Errors are prefixed with the 1-based
// position of the offending image, matching the numbering of the progress line
// this function prints to stdout for each image. An image with a zero width or
// height is rejected because its aspect ratio is undefined.
func (p *Processor) LoadAndPreprocessMultiple(imagePaths []string) ([]*mlx.Array, []*mlx.Array, []ImageDims, error) {
	const vaeScaleFactor int32 = 8
	const patchSize int32 = 2

	condImages := make([]*mlx.Array, len(imagePaths))
	vaeImages := make([]*mlx.Array, len(imagePaths))
	dims := make([]ImageDims, len(imagePaths))

	for i, imagePath := range imagePaths {
		img, err := loadImageFile(imagePath)
		if err != nil {
			return nil, nil, nil, fmt.Errorf("image %d: %w", i+1, err)
		}

		bounds := img.Bounds()
		origW := int32(bounds.Dx())
		origH := int32(bounds.Dy())
		// A degenerate image would yield a NaN/Inf ratio, which then flows
		// into float-to-int conversions and tensor shapes. Fail early instead.
		if origW <= 0 || origH <= 0 {
			return nil, nil, nil, fmt.Errorf("image %d: invalid image dimensions %dx%d", i+1, origW, origH)
		}
		ratio := float64(origW) / float64(origH)

		// Calculate dimensions for condition image (vision encoder)
		// Python pipeline does TWO resizes:
		// 1. VaeImageProcessor.resize with Lanczos to CONDITION_IMAGE_SIZE (384x384 area)
		// 2. Qwen2VLProcessor's smart_resize with Bicubic to multiple of 28
		intermediateW, intermediateH := calculateDimensions(p.Config.ConditionImageSize, ratio, 32)
		// smartResize takes and returns (height, width), in that order.
		condH, condW := smartResize(intermediateH, intermediateW, 28, 56*56, 28*28*1280)

		// Calculate dimensions for VAE image (1024x1024 area)
		vaeW, vaeH := calculateDimensions(p.Config.VAEImageSize, ratio, 32)

		// Calculate derived dimensions
		latentW := vaeW / vaeScaleFactor
		latentH := vaeH / vaeScaleFactor
		patchW := latentW / patchSize
		patchH := latentH / patchSize

		dims[i] = ImageDims{
			OrigW:   origW,
			OrigH:   origH,
			CondW:   condW,
			CondH:   condH,
			VaeW:    vaeW,
			VaeH:    vaeH,
			LatentW: latentW,
			LatentH: latentH,
			PatchW:  patchW,
			PatchH:  patchH,
		}

		fmt.Printf("  Image %d: orig=%dx%d, cond=%dx%d, vae=%dx%d, latent=%dx%d, patch=%dx%d\n",
			i+1, origW, origH, condW, condH, vaeW, vaeH, latentW, latentH, patchW, patchH)

		// Preprocess for condition (vision encoder) - two-step resize to match Python pipeline
		condImages[i] = p.preprocessImageTwoStep(img, intermediateW, intermediateH, condW, condH)

		// Preprocess for VAE ([-1, 1] range, 5D tensor)
		vaeImages[i] = p.preprocessImageForVAE(img, vaeW, vaeH)
	}

	return condImages, vaeImages, dims, nil
}