# llm-gguf-tools/helpers/models/quantisation.py
"""Pydantic models for quantisation operations.
Contains data models specific to the quantisation workflow including
quantisation types, configurations, and results. Uses UK English spelling
conventions throughout (quantisation, not quantization).
"""
from __future__ import annotations
import re
from collections import defaultdict
from enum import StrEnum
from pathlib import Path  # noqa: TC003

from pydantic import BaseModel, ConfigDict, field_validator


class QuantisationType(StrEnum):
    """Available quantisation types for GGUF model conversion.

    Comprehensive set of quantisation strategies from Q2 to Q8, including
K-quants and legacy formats. Each type represents different trade-offs
between model size, inference speed, and quality preservation. Custom
variants (L, XL, XXL) enable tensor-specific precision control for
embeddings, attention layers, and feed-forward networks.
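
    Example (StrEnum members compare equal to their raw string values):
        >>> QuantisationType.Q4_K_M == "Q4_K_M"
        True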
"""
# Q2 variants (smallest, lowest quality)
Q2_K = "Q2_K"
Q2_K_S = "Q2_K_S"
# Q3 K-quants
Q3_K_S = "Q3_K_S"
Q3_K_M = "Q3_K_M" # llama.cpp default: Q6_K embeddings, Q4_K output, Q5_K V/FFN-down
Q3_K_L = "Q3_K_L" # Bartowski: Upgrades output to Q5_K (from M baseline)
Q3_K_XL = "Q3_K_XL" # Bartowski: Q8_0 embeddings + Q5_K output (from M baseline)
# Q4 K-quants (most popular)
Q4_K_S = "Q4_K_S"
Q4_K_M = "Q4_K_M" # llama.cpp default: Q6_K embeddings, Q6_K V/FFN-down
Q4_K_L = "Q4_K_L" # Bartowski: Upgrades embeddings to Q8_0 (from M baseline)
# Q5 K-quants
Q5_K_S = "Q5_K_S"
Q5_K_M = "Q5_K_M" # llama.cpp default: Q6_K embeddings, Q6_K V/FFN-down
Q5_K_L = "Q5_K_L" # Bartowski: Upgrades embeddings to Q8_0 (from M baseline)
# Q6_K variants
Q6_K = "Q6_K"
Q6_K_L = "Q6_K_L" # Bartowski: Upgrades embeddings to Q8_0 (all else stays Q6_K)
# Q8_0 (highest common quantisation)
Q8_0 = "Q8_0"
# Legacy quantisation formats
Q4_0 = "Q4_0"
Q4_1 = "Q4_1"
Q5_0 = "Q5_0"
    Q5_1 = "Q5_1"


class URLType(StrEnum):
    """Supported URL formats for model source specification.

    Categorises input URL formats to enable appropriate handling strategies. HuggingFace URLs
require full model download and conversion, whilst Ollama GGUF URLs allow direct GGUF file
downloads with pattern matching for efficient processing of pre-quantised models.
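
    For example, ``https://huggingface.co/<author>/<model>`` would typically map
    to HUGGINGFACE, whilst a reference to a repository of pre-quantised GGUF
    files would map to OLLAMA_GGUF; the actual mapping is performed by the URL
    parser elsewhere in this toolkit, so these patterns are illustrative.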
"""
HUGGINGFACE = "huggingface"
    OLLAMA_GGUF = "ollama_gguf"


class QuantisationConfig(BaseModel):
    """Configuration for a specific quantisation method.

    Defines quantisation parameters for different model variants. The L and XL variants specify a
base type with optional embedding and output overrides, leveraging the fact that M variants
already include strategic enhancements to critical layers (embeddings, attention V, and FFN
down).
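
    Example (an illustrative L-style variant; the field values here are
    assumptions for demonstration, not canonical presets):
        >>> cfg = QuantisationConfig(
        ...     name="Q4_K_L",
        ...     description="Q4_K_M base with Q8_0 embeddings",
        ...     base_precision=4,
        ...     base_type="Q4_K_M",
        ...     embedding_type="Q8_0",
        ... )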
"""
model_config = ConfigDict(use_enum_values=True)
name: str
description: str
base_precision: int # Base precision level (2, 3, 4, 5, 6, 8)
base_type: str # Base quantisation type for llama-cpp (e.g. "Q3_K_M")
embedding_type: str | None = None # Override for embeddings (e.g. "Q8_0")
output_type: str | None = None # Override for output layer (e.g. "Q5_K")
    inherent_enhancements: dict[str, str] | None = None  # M variant built-in enhancements

    def get_layer_config(self, configs_dict: dict | None = None) -> dict[str, str]:
        """Get layer configuration for display purposes.

        Returns layer precision specifications based on what the base_type inherently
does (from inherent_enhancements) plus any L/XL overrides. This is purely
for documentation and display - the actual quantisation uses base_type with
tensor-specific overrides applied directly by the quantisation engine.

        Args:
            configs_dict: Optional mapping of all known quantisation configs,
                used to look up the base type's inherent enhancements when this
                config is an L/XL variant without its own.

        Returns:
            Dictionary mapping layer types to quantisation specifications for display.
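
        Example (a hypothetical Q4_K_L-style config; without a configs_dict the
        base type's inherent enhancements cannot be inherited for display):
            >>> cfg = QuantisationConfig(
            ...     name="Q4_K_L", description="", base_precision=4,
            ...     base_type="Q4_K_M", embedding_type="Q8_0",
            ... )
            >>> cfg.get_layer_config()
            {'embed': 'Q8_0', 'output': 'Q4_K', 'qkv': 'Q4_K/Q4_K/Q4_K', 'gate_up': 'Q4_K', 'down': 'Q4_K'}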
"""
# Build base quantisation string from precision
base = f"Q{self.base_precision}_K" if self.base_precision < 8 else "Q8_0"
# Get inherent enhancements for display - inherit from base type if this is L/XL variant
enhancements = self.inherent_enhancements or {}
# If this config has a base_type and no inherent enhancements, inherit for display
if self.base_type and self.base_type != base and not enhancements and configs_dict:
# Look up base type by string matching
for config in configs_dict.values():
if config.name == self.base_type:
if config.inherent_enhancements:
enhancements = config.inherent_enhancements
break
# Start with what the base type inherently does
embed = enhancements.get("embeddings", base)
attn_v = enhancements.get("attention_v", base)
ffn_down = enhancements.get("ffn_down", base)
output = base # Default output to base
# Apply L/XL overrides for display (these take precedence in the display)
embed = self.embedding_type or embed
output = self.output_type or output
# Build QKV string (Q/K always use base, V may be enhanced by base type)
qkv = f"{base}/{base}/{attn_v}"
return {
"embed": embed,
"output": output,
"qkv": qkv,
"gate_up": base, # Gate and up always use base quantisation
"down": ffn_down, # Down uses what base type inherently does
        }

    def get_compact_config(self, configs_dict: dict | None = None) -> str:
        """Get compact configuration string with single-letter abbreviations.

        Creates a compact configuration string using E/O/A/F notation for
embeddings, output, attention, and FFN layers respectively. This provides
a concise representation of layer-specific quantisation levels for quick
comparison and display in user interfaces.

        Args:
            configs_dict: Optional mapping of all known quantisation configs,
                forwarded to get_layer_config for base-type lookups.

        Returns:
            Formatted configuration string like "Q8:E Q4:O/A/F", with components
            ordered from highest to lowest precision.
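
        Example (same hypothetical Q4_K_L-style config as in get_layer_config):
            >>> cfg = QuantisationConfig(
            ...     name="Q4_K_L", description="", base_precision=4,
            ...     base_type="Q4_K_M", embedding_type="Q8_0",
            ... )
            >>> cfg.get_compact_config()
            'Q8:E Q4:O/A/F'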
"""
layers = self.get_layer_config(configs_dict)
# Parse QKV values
qkv_parts = layers["qkv"].split("/")
q_val = qkv_parts[0] if qkv_parts else layers["qkv"]
k_val = qkv_parts[1] if len(qkv_parts) > 1 else q_val
v_val = qkv_parts[2] if len(qkv_parts) > 2 else k_val
# Special case: uniform quantisation
if (
layers["embed"]
== layers["output"]
== q_val
== k_val
== v_val
== layers["gate_up"]
== layers["down"]
):
if self.name == "Q6_K":
return "Q6_K all layers"
if self.name == "Q8_0":
return "Q8_0 all layers"
return f"{layers['embed']} all layers"
# Build component groups
quant_components = defaultdict(list)
def add_component(value: str, component: str) -> None:
if value:
# Extract precision from quantisation string
precision = self._extract_precision(value)
quant_components[f"Q{precision}"].append(component)
# Add components
add_component(layers["embed"], "E")
add_component(layers["output"], "O")
# Attention components
if q_val == k_val == v_val:
add_component(q_val, "A")
else:
if q_val == k_val:
add_component(q_val, "Aqk")
else:
add_component(q_val, "Aq")
add_component(k_val, "Ak")
add_component(v_val, "Av")
# FFN components
if layers["gate_up"] == layers["down"]:
add_component(layers["gate_up"], "F")
else:
add_component(layers["gate_up"], "Fgu")
add_component(layers["down"], "Fd")
# Sort and format
precision_order = ["Q8", "Q6", "Q5", "Q4", "Q3", "Q2"]
component_order = ["E", "O", "A", "Av", "Aqk", "Aq", "Ak", "F", "Fd", "Fgu"]
sorted_quants = sorted(
quant_components.items(),
key=lambda x: precision_order.index(x[0])
if x[0] in precision_order
else len(precision_order),
)
components = []
for quant_level, parts in sorted_quants:
sorted_parts = sorted(
parts,
key=lambda x: component_order.index(x)
if x in component_order
else len(component_order),
)
components.append(f"{quant_level}:{'/'.join(sorted_parts)}")
return " ".join(components)
def _extract_precision(self, quant_str: str) -> int:
"""Extract precision level from quantisation string.
Parses quantisation type strings to extract the numerical precision level.
Handles both K-quant formats (Q3_K, Q4_K_M) and legacy formats (Q8_0, Q5_1)
by matching the digit following the Q prefix.

        Args:
            quant_str: Quantisation type string such as "Q4_K_M" or "Q8_0".

        Returns:
            Precision level as integer, defaulting to 4 if parsing fails.
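
        Example:
            >>> QuantisationConfig._extract_precision("Q3_K_M")
            3
            >>> QuantisationConfig._extract_precision("F16")
            4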
"""
# Extract the digit from strings like "Q3_K", "Q8_0", "Q6_K"
match = re.search(r"Q(\d+)", quant_str)
        return int(match.group(1)) if match else 4  # Default to 4 if parsing fails


class ModelSource(BaseModel):
    """Represents a model source with parsed information from URL analysis.

    Contains comprehensive metadata extracted from model URLs including source
repository details, author information, and GGUF file patterns. Enables
differentiation between regular HuggingFace repositories requiring conversion
and GGUF repositories allowing direct file downloads.
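
    Example (illustrative values; in practice these fields are populated by the
    URL parser elsewhere in the toolkit):
        >>> source = ModelSource(
        ...     url="https://huggingface.co/meta-llama/Llama-3.1-8B",
        ...     url_type=URLType.HUGGINGFACE,
        ...     source_model="meta-llama/Llama-3.1-8B",
        ...     original_author="meta-llama",
        ...     model_name="Llama-3.1-8B",
        ... )
        >>> source.url_type  # use_enum_values=True stores the plain string
        'huggingface'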
"""
model_config = ConfigDict(use_enum_values=True, protected_namespaces=())
url: str
url_type: URLType
source_model: str
original_author: str
model_name: str
gguf_file_pattern: str | None = None
    is_gguf_repo: bool = False

    @field_validator("url")
@classmethod
def validate_url(cls, v: str) -> str:
"""Validate that URL is not empty.
Ensures the provided URL string is not empty or None,
as this is required for model source identification.
Returns:
The validated URL string.
Raises:
ValueError: If URL is empty or None.
"""
if not v:
msg = "URL cannot be empty"
raise ValueError(msg)
        return v


class QuantisationResult(BaseModel):
    """Result of a quantisation operation with comprehensive status tracking.

    Captures the outcome of individual quantisation attempts including success
status, file paths, sizes, and error details. Supports workflow status
tracking from planning through processing to completion, enabling real-time
progress reporting and parallel upload coordination.
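
    Example (an illustrative failure record; the error text is invented for
    demonstration):
        >>> result = QuantisationResult(
        ...     quantisation_type=QuantisationType.Q4_K_M,
        ...     success=False,
        ...     error_message="quantisation binary not found",
        ...     status="failed",
        ... )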
"""
model_config = ConfigDict(use_enum_values=True, arbitrary_types_allowed=True)
quantisation_type: QuantisationType
success: bool
file_path: Path | None = None
file_size: str | None = None
method_used: str | None = None
error_message: str | None = None
    status: str = "pending"  # pending, planned, processing, uploading, completed, failed


class QuantisationContext(BaseModel):
    """Context object containing all parameters needed for quantisation execution.

    Encapsulates quantisation parameters to reduce method argument counts
and improve code maintainability following parameter object pattern.
"""
model_config = ConfigDict(frozen=True, protected_namespaces=())
f16_model_path: Path
model_source: ModelSource
config: QuantisationConfig
models_dir: Path
imatrix_path: Path | None = None
    base_quant: str = "Q4_K_M"

    def get_output_path(self) -> Path:
        """Generate output path for quantised model.

        Returns:
Path to the output GGUF file.
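
        Example (hypothetical values): with original_author "meta-llama",
        model_name "Llama-3.1-8B", and config name "Q4_K_L", this returns
        ``models/Llama-3.1-8B/meta-llama-Llama-3.1-8B-Q4_K_L.gguf`` for a
        models_dir of ``models``.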
"""
output_filename = (
f"{self.model_source.original_author}-"
f"{self.model_source.model_name}-"
f"{self.config.name}.gguf"
)
return self.models_dir / self.model_source.model_name / output_filename