"""Pydantic models for quantisation operations. Contains data models specific to the quantisation workflow including quantisation types, configurations, and results. Uses UK English spelling conventions throughout (quantisation, not quantization). """ from __future__ import annotations import re from collections import defaultdict from enum import StrEnum from pathlib import Path # noqa: TC003 from pydantic import BaseModel, ConfigDict, field_validator class QuantisationType(StrEnum): """Available quantisation types for GGUF model conversion. Comprehensive set of quantisation strategies from Q2 to Q8, including K-quants and legacy formats. Each type represents different trade-offs between model size, inference speed, and quality preservation. Custom variants (L, XL, XXL) enable tensor-specific precision control for embeddings, attention layers, and feed-forward networks. """ # Q2 variants (smallest, lowest quality) Q2_K = "Q2_K" Q2_K_S = "Q2_K_S" # Q3 K-quants Q3_K_S = "Q3_K_S" Q3_K_M = "Q3_K_M" # llama.cpp default: Q6_K embeddings, Q4_K output, Q5_K V/FFN-down Q3_K_L = "Q3_K_L" # Bartowski: Upgrades output to Q5_K (from M baseline) Q3_K_XL = "Q3_K_XL" # Bartowski: Q8_0 embeddings + Q5_K output (from M baseline) # Q4 K-quants (most popular) Q4_K_S = "Q4_K_S" Q4_K_M = "Q4_K_M" # llama.cpp default: Q6_K embeddings, Q6_K V/FFN-down Q4_K_L = "Q4_K_L" # Bartowski: Upgrades embeddings to Q8_0 (from M baseline) # Q5 K-quants Q5_K_S = "Q5_K_S" Q5_K_M = "Q5_K_M" # llama.cpp default: Q6_K embeddings, Q6_K V/FFN-down Q5_K_L = "Q5_K_L" # Bartowski: Upgrades embeddings to Q8_0 (from M baseline) # Q6_K variants Q6_K = "Q6_K" Q6_K_L = "Q6_K_L" # Bartowski: Upgrades embeddings to Q8_0 (all else stays Q6_K) # Q8_0 (highest common quantisation) Q8_0 = "Q8_0" # Legacy quantisation formats Q4_0 = "Q4_0" Q4_1 = "Q4_1" Q5_0 = "Q5_0" Q5_1 = "Q5_1" class URLType(StrEnum): """Supported URL formats for model source specification. Categorises input URL formats to enable appropriate handling strategies. HuggingFace URLs require full model download and conversion, whilst Ollama GGUF URLs allow direct GGUF file downloads with pattern matching for efficient processing of pre-quantised models. """ HUGGINGFACE = "huggingface" OLLAMA_GGUF = "ollama_gguf" class QuantisationConfig(BaseModel): """Configuration for a specific quantisation method. Defines quantisation parameters for different model variants. The L and XL variants specify a base type with optional embedding and output overrides, leveraging the fact that M variants already include strategic enhancements to critical layers (embeddings, attention V, and FFN down). """ model_config = ConfigDict(use_enum_values=True) name: str description: str base_precision: int # Base precision level (2, 3, 4, 5, 6, 8) base_type: str # Base quantisation type for llama-cpp (e.g. "Q3_K_M") embedding_type: str | None = None # Override for embeddings (e.g. "Q8_0") output_type: str | None = None # Override for output layer (e.g. "Q5_K") inherent_enhancements: dict[str, str] | None = None # M variant built-in enhancements def get_layer_config(self, configs_dict: dict | None = None) -> dict[str, str]: """Get layer configuration for display purposes. Returns layer precision specifications based on what the base_type inherently does (from inherent_enhancements) plus any L/XL overrides. This is purely for documentation and display - the actual quantisation uses base_type with tensor-specific overrides applied directly by the quantisation engine. 

        Returns:
            Dictionary mapping layer types to quantisation specifications for display.
        """
        # Build base quantisation string from precision
        base = f"Q{self.base_precision}_K" if self.base_precision < 8 else "Q8_0"

        # Get inherent enhancements for display - inherit from base type if this is L/XL variant
        enhancements = self.inherent_enhancements or {}

        # If this config has a base_type and no inherent enhancements, inherit for display
        if self.base_type and self.base_type != base and not enhancements and configs_dict:
            # Look up base type by string matching
            for config in configs_dict.values():
                if config.name == self.base_type:
                    if config.inherent_enhancements:
                        enhancements = config.inherent_enhancements
                    break

        # Start with what the base type inherently does
        embed = enhancements.get("embeddings", base)
        attn_v = enhancements.get("attention_v", base)
        ffn_down = enhancements.get("ffn_down", base)
        output = base  # Default output to base

        # Apply L/XL overrides for display (these take precedence in the display)
        embed = self.embedding_type or embed
        output = self.output_type or output

        # Build QKV string (Q/K always use base, V may be enhanced by base type)
        qkv = f"{base}/{base}/{attn_v}"

        return {
            "embed": embed,
            "output": output,
            "qkv": qkv,
            "gate_up": base,  # Gate and up always use base quantisation
            "down": ffn_down,  # Down uses what base type inherently does
        }

    def get_compact_config(self, configs_dict: dict | None = None) -> str:
        """Get compact configuration string with single-letter abbreviations.

        Creates a compact configuration string using E/O/A/F notation for
        embeddings, output, attention, and FFN layers respectively. This
        provides a concise representation of layer-specific quantisation levels
        for quick comparison and display in user interfaces.

        Returns:
            Formatted configuration string like "Q6:E Q4:O Q3:A Q5:F".
""" layers = self.get_layer_config(configs_dict) # Parse QKV values qkv_parts = layers["qkv"].split("/") q_val = qkv_parts[0] if qkv_parts else layers["qkv"] k_val = qkv_parts[1] if len(qkv_parts) > 1 else q_val v_val = qkv_parts[2] if len(qkv_parts) > 2 else k_val # Special case: uniform quantisation if ( layers["embed"] == layers["output"] == q_val == k_val == v_val == layers["gate_up"] == layers["down"] ): if self.name == "Q6_K": return "Q6_K all layers" if self.name == "Q8_0": return "Q8_0 all layers" return f"{layers['embed']} all layers" # Build component groups quant_components = defaultdict(list) def add_component(value: str, component: str) -> None: if value: # Extract precision from quantisation string precision = self._extract_precision(value) quant_components[f"Q{precision}"].append(component) # Add components add_component(layers["embed"], "E") add_component(layers["output"], "O") # Attention components if q_val == k_val == v_val: add_component(q_val, "A") else: if q_val == k_val: add_component(q_val, "Aqk") else: add_component(q_val, "Aq") add_component(k_val, "Ak") add_component(v_val, "Av") # FFN components if layers["gate_up"] == layers["down"]: add_component(layers["gate_up"], "F") else: add_component(layers["gate_up"], "Fgu") add_component(layers["down"], "Fd") # Sort and format precision_order = ["Q8", "Q6", "Q5", "Q4", "Q3", "Q2"] component_order = ["E", "O", "A", "Av", "Aqk", "Aq", "Ak", "F", "Fd", "Fgu"] sorted_quants = sorted( quant_components.items(), key=lambda x: precision_order.index(x[0]) if x[0] in precision_order else len(precision_order), ) components = [] for quant_level, parts in sorted_quants: sorted_parts = sorted( parts, key=lambda x: component_order.index(x) if x in component_order else len(component_order), ) components.append(f"{quant_level}:{'/'.join(sorted_parts)}") return " ".join(components) def _extract_precision(self, quant_str: str) -> int: """Extract precision level from quantisation string. Parses quantisation type strings to extract the numerical precision level. Handles both K-quant formats (Q3_K, Q4_K_M) and legacy formats (Q8_0, Q5_1) by matching the digit following the Q prefix. Returns: Precision level as integer, defaulting to 4 if parsing fails. """ # Extract the digit from strings like "Q3_K", "Q8_0", "Q6_K" match = re.search(r"Q(\d+)", quant_str) return int(match.group(1)) if match else 4 # Default to 4 if parsing fails class ModelSource(BaseModel): """Represents a model source with parsed information from URL analysis. Contains comprehensive metadata extracted from model URLs including source repository details, author information, and GGUF file patterns. Enables differentiation between regular HuggingFace repositories requiring conversion and GGUF repositories allowing direct file downloads. """ model_config = ConfigDict(use_enum_values=True, protected_namespaces=()) url: str url_type: URLType source_model: str original_author: str model_name: str gguf_file_pattern: str | None = None is_gguf_repo: bool = False @field_validator("url") @classmethod def validate_url(cls, v: str) -> str: """Validate that URL is not empty. Ensures the provided URL string is not empty or None, as this is required for model source identification. Returns: The validated URL string. Raises: ValueError: If URL is empty or None. """ if not v: msg = "URL cannot be empty" raise ValueError(msg) return v class QuantisationResult(BaseModel): """Result of a quantisation operation with comprehensive status tracking. 

    Captures the outcome of individual quantisation attempts including success
    status, file paths, sizes, and error details. Supports workflow status
    tracking from planning through processing to completion, enabling real-time
    progress reporting and parallel upload coordination.
    """

    model_config = ConfigDict(use_enum_values=True, arbitrary_types_allowed=True)

    quantisation_type: QuantisationType
    success: bool
    file_path: Path | None = None
    file_size: str | None = None
    method_used: str | None = None
    error_message: str | None = None
    status: str = "pending"  # planned, processing, uploading, completed, failed


class QuantisationContext(BaseModel):
    """Context object containing all parameters needed for quantisation execution.

    Encapsulates quantisation parameters to reduce method argument counts and
    improve code maintainability, following the parameter object pattern.
    """

    model_config = ConfigDict(frozen=True, protected_namespaces=())

    f16_model_path: Path
    model_source: ModelSource
    config: QuantisationConfig
    models_dir: Path
    imatrix_path: Path | None = None
    base_quant: str = "Q4_K_M"

    def get_output_path(self) -> Path:
        """Generate output path for quantised model.

        Returns:
            Path to the output GGUF file.
        """
        output_filename = (
            f"{self.model_source.original_author}-"
            f"{self.model_source.model_name}-"
            f"{self.config.name}.gguf"
        )
        return self.models_dir / self.model_source.model_name / output_filename
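

# ---------------------------------------------------------------------------
# Illustrative usage: a minimal sketch showing how these models compose, not
# part of the quantisation workflow itself. The repository URL, author, model
# name, and file paths below are hypothetical placeholders.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # A hypothetical Q4_K_L variant: Q4_K_M base with embeddings upgraded to Q8_0
    example_config = QuantisationConfig(
        name="Q4_K_L",
        description="Q4_K_M base with Q8_0 embeddings",
        base_precision=4,
        base_type="Q4_K_M",
        embedding_type="Q8_0",
    )
    # Compact display string built from the layer config: "Q8:E Q4:O/A/F"
    print(example_config.get_compact_config())

    example_source = ModelSource(
        url="https://huggingface.co/example-org/example-model",
        url_type=URLType.HUGGINGFACE,
        source_model="example-org/example-model",
        original_author="example-org",
        model_name="example-model",
    )
    example_context = QuantisationContext(
        f16_model_path=Path("models/example-model/example-model-F16.gguf"),
        model_source=example_source,
        config=example_config,
        models_dir=Path("models"),
    )
    # Output path follows "<author>-<model>-<config>.gguf" under the model directory,
    # e.g. models/example-model/example-org-example-model-Q4_K_L.gguf
    print(example_context.get_output_path())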