llm-gguf-tools/helpers/models/quantisation.py

"""Pydantic models for quantisation operations.

Contains data models specific to the quantisation workflow including
quantisation types, configurations, and results. Uses UK English spelling
conventions throughout (quantisation, not quantization).
"""

from __future__ import annotations

import re
from collections import defaultdict
from enum import StrEnum
from pathlib import Path  # noqa: TC003

from pydantic import BaseModel, ConfigDict, field_validator


class QuantisationType(StrEnum):
    """Available quantisation types for GGUF model conversion.

    Comprehensive set of quantisation strategies from Q2 to Q8, including
    K-quants and legacy formats. Each type represents different trade-offs
    between model size, inference speed, and quality preservation. Custom
    variants (L, XL, XXL) enable tensor-specific precision control for
    embeddings, attention layers, and feed-forward networks.
    """

    # Q2 variants (smallest, lowest quality)
    Q2_K = "Q2_K"
    Q2_K_S = "Q2_K_S"

    # Q3 K-quants
    Q3_K_S = "Q3_K_S"
    Q3_K_M = "Q3_K_M"  # llama.cpp default: Q6_K embeddings, Q4_K output, Q5_K V/FFN-down
    Q3_K_L = "Q3_K_L"  # Bartowski: Upgrades output to Q5_K (from M baseline)
    Q3_K_XL = "Q3_K_XL"  # Bartowski: Q8_0 embeddings + Q5_K output (from M baseline)

    # Q4 K-quants (most popular)
    Q4_K_S = "Q4_K_S"
    Q4_K_M = "Q4_K_M"  # llama.cpp default: Q6_K embeddings, Q6_K V/FFN-down
    Q4_K_L = "Q4_K_L"  # Bartowski: Upgrades embeddings to Q8_0 (from M baseline)

    # Q5 K-quants
    Q5_K_S = "Q5_K_S"
    Q5_K_M = "Q5_K_M"  # llama.cpp default: Q6_K embeddings, Q6_K V/FFN-down
    Q5_K_L = "Q5_K_L"  # Bartowski: Upgrades embeddings to Q8_0 (from M baseline)

    # Q6_K variants
    Q6_K = "Q6_K"
    Q6_K_L = "Q6_K_L"  # Bartowski: Upgrades embeddings to Q8_0 (all else stays Q6_K)

    # Q8_0 (highest common quantisation)
    Q8_0 = "Q8_0"

    # Legacy quantisation formats
    Q4_0 = "Q4_0"
    Q4_1 = "Q4_1"
    Q5_0 = "Q5_0"
    Q5_1 = "Q5_1"


class URLType(StrEnum):
    """Supported URL formats for model source specification.

    Categorises input URL formats to enable appropriate handling strategies. HuggingFace URLs
    require full model download and conversion, whilst Ollama GGUF URLs allow direct GGUF file
    downloads with pattern matching for efficient processing of pre-quantised models.
    """

    HUGGINGFACE = "huggingface"
    OLLAMA_GGUF = "ollama_gguf"


class QuantisationConfig(BaseModel):
    """Configuration for a specific quantisation method.

    Defines quantisation parameters for different model variants. The L and XL variants specify a
    base type with optional embedding and output overrides, leveraging the fact that M variants
    already include strategic enhancements to critical layers (embeddings, attention V, and FFN
    down).
    """

    model_config = ConfigDict(use_enum_values=True)

    name: str
    description: str
    base_precision: int  # Base precision level (2, 3, 4, 5, 6, 8)
    base_type: str  # Base quantisation type for llama-cpp (e.g. "Q3_K_M")
    embedding_type: str | None = None  # Override for embeddings (e.g. "Q8_0")
    output_type: str | None = None  # Override for output layer (e.g. "Q5_K")
    inherent_enhancements: dict[str, str] | None = None  # M variant built-in enhancements

    def get_layer_config(self, configs_dict: dict | None = None) -> dict[str, str]:
        """Get layer configuration for display purposes.

        Returns layer precision specifications based on what the base_type inherently
        does (from inherent_enhancements) plus any L/XL overrides. This is purely
        for documentation and display - the actual quantisation uses base_type with
        tensor-specific overrides applied directly by the quantisation engine.

        Returns:
            Dictionary mapping layer types to quantisation specifications for display.
        """
        # Build base quantisation string from precision
        base = f"Q{self.base_precision}_K" if self.base_precision < 8 else "Q8_0"

        # Get inherent enhancements for display - inherit from base type if this is L/XL variant
        enhancements = self.inherent_enhancements or {}

        # If this config has a base_type and no inherent enhancements, inherit for display
        if self.base_type and self.base_type != base and not enhancements and configs_dict:
            # Look up base type by string matching
            for config in configs_dict.values():
                if config.name == self.base_type:
                    if config.inherent_enhancements:
                        enhancements = config.inherent_enhancements
                    break

        # Start with what the base type inherently does
        embed = enhancements.get("embeddings", base)
        attn_v = enhancements.get("attention_v", base)
        ffn_down = enhancements.get("ffn_down", base)
        output = base  # Default output to base

        # Apply L/XL overrides for display (these take precedence in the display)
        embed = self.embedding_type or embed
        output = self.output_type or output

        # Build QKV string (Q/K always use base, V may be enhanced by base type)
        qkv = f"{base}/{base}/{attn_v}"

        return {
            "embed": embed,
            "output": output,
            "qkv": qkv,
            "gate_up": base,  # Gate and up always use base quantisation
            "down": ffn_down,  # Down uses what base type inherently does
        }

    def get_compact_config(self, configs_dict: dict | None = None) -> str:
        """Get compact configuration string with single-letter abbreviations.

        Creates a compact configuration string using E/O/A/F notation for
        embeddings, output, attention, and FFN layers respectively. This provides
        a concise representation of layer-specific quantisation levels for quick
        comparison and display in user interfaces.

        Returns:
            Formatted configuration string like "Q6:E Q4:O Q3:A Q5:F".
        """
        layers = self.get_layer_config(configs_dict)

        # Parse QKV values
        qkv_parts = layers["qkv"].split("/")
        q_val = qkv_parts[0] if qkv_parts else layers["qkv"]
        k_val = qkv_parts[1] if len(qkv_parts) > 1 else q_val
        v_val = qkv_parts[2] if len(qkv_parts) > 2 else k_val

        # Special case: uniform quantisation
        if (
            layers["embed"]
            == layers["output"]
            == q_val
            == k_val
            == v_val
            == layers["gate_up"]
            == layers["down"]
        ):
            if self.name == "Q6_K":
                return "Q6_K all layers"
            if self.name == "Q8_0":
                return "Q8_0 all layers"
            return f"{layers['embed']} all layers"

        # Build component groups
        quant_components = defaultdict(list)

        def add_component(value: str, component: str) -> None:
            if value:
                # Extract precision from quantisation string
                precision = self._extract_precision(value)
                quant_components[f"Q{precision}"].append(component)

        # Add components
        add_component(layers["embed"], "E")
        add_component(layers["output"], "O")

        # Attention components
        if q_val == k_val == v_val:
            add_component(q_val, "A")
        else:
            if q_val == k_val:
                add_component(q_val, "Aqk")
            else:
                add_component(q_val, "Aq")
                add_component(k_val, "Ak")
            add_component(v_val, "Av")

        # FFN components
        if layers["gate_up"] == layers["down"]:
            add_component(layers["gate_up"], "F")
        else:
            add_component(layers["gate_up"], "Fgu")
            add_component(layers["down"], "Fd")

        # Sort and format
        precision_order = ["Q8", "Q6", "Q5", "Q4", "Q3", "Q2"]
        component_order = ["E", "O", "A", "Av", "Aqk", "Aq", "Ak", "F", "Fd", "Fgu"]

        sorted_quants = sorted(
            quant_components.items(),
            key=lambda x: precision_order.index(x[0])
            if x[0] in precision_order
            else len(precision_order),
        )

        components = []
        for quant_level, parts in sorted_quants:
            sorted_parts = sorted(
                parts,
                key=lambda x: component_order.index(x)
                if x in component_order
                else len(component_order),
            )
            components.append(f"{quant_level}:{'/'.join(sorted_parts)}")

        return " ".join(components)

    def _extract_precision(self, quant_str: str) -> int:
        """Extract precision level from quantisation string.

        Parses quantisation type strings to extract the numerical precision level.
        Handles both K-quant formats (Q3_K, Q4_K_M) and legacy formats (Q8_0, Q5_1)
        by matching the digit following the Q prefix.

        Returns:
            Precision level as integer, defaulting to 4 if parsing fails.
        """
        # Extract the digit from strings like "Q3_K", "Q8_0", "Q6_K"
        match = re.search(r"Q(\d+)", quant_str)
        return int(match.group(1)) if match else 4  # Default to 4 if parsing fails


class ModelSource(BaseModel):
    """Represents a model source with parsed information from URL analysis.

    Contains comprehensive metadata extracted from model URLs including source
    repository details, author information, and GGUF file patterns. Enables
    differentiation between regular HuggingFace repositories requiring conversion
    and GGUF repositories allowing direct file downloads.
    """

    model_config = ConfigDict(use_enum_values=True, protected_namespaces=())

    url: str
    url_type: URLType
    source_model: str
    original_author: str
    model_name: str
    gguf_file_pattern: str | None = None
    is_gguf_repo: bool = False

    @field_validator("url")
    @classmethod
    def validate_url(cls, v: str) -> str:
        """Validate that URL is not empty.

        Ensures the provided URL string is not empty or None,
        as this is required for model source identification.

        Returns:
            The validated URL string.

        Raises:
            ValueError: If URL is empty or None.
        """
        if not v:
            msg = "URL cannot be empty"
            raise ValueError(msg)
        return v


class QuantisationResult(BaseModel):
    """Result of a quantisation operation with comprehensive status tracking.

    Captures the outcome of individual quantisation attempts including success
    status, file paths, sizes, and error details. Supports workflow status
    tracking from planning through processing to completion, enabling real-time
    progress reporting and parallel upload coordination.
    """

    model_config = ConfigDict(use_enum_values=True, arbitrary_types_allowed=True)

    quantisation_type: QuantisationType
    success: bool
    file_path: Path | None = None
    file_size: str | None = None
    method_used: str | None = None
    error_message: str | None = None
    status: str = "pending"  # planned, processing, uploading, completed, failed


class QuantisationContext(BaseModel):
    """Context object containing all parameters needed for quantisation execution.

    Encapsulates quantisation parameters to reduce method argument counts
    and improve code maintainability following parameter object pattern.
    """

    model_config = ConfigDict(frozen=True, protected_namespaces=())

    f16_model_path: Path
    model_source: ModelSource
    config: QuantisationConfig
    models_dir: Path
    imatrix_path: Path | None = None
    base_quant: str = "Q4_K_M"

    def get_output_path(self) -> Path:
        """Generate output path for quantised model.

        Returns:
            Path to the output GGUF file.
        """
        output_filename = (
            f"{self.model_source.original_author}-"
            f"{self.model_source.model_name}-"
            f"{self.config.name}.gguf"
        )
        return self.models_dir / self.model_source.model_name / output_filename