"""Pydantic models for quantisation operations.
|
|
|
|
Contains data models specific to the quantisation workflow including
|
|
quantisation types, configurations, and results. Uses UK English spelling
|
|
conventions throughout (quantisation, not quantization).
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
from collections import defaultdict
|
|
from enum import StrEnum
|
|
from pathlib import Path # noqa: TC003
|
|
|
|
from pydantic import BaseModel, ConfigDict, field_validator
|
|
|
|
|
|
class QuantisationType(StrEnum):
|
|
"""Available quantisation types for GGUF model conversion.
|
|
|
|
Comprehensive set of quantisation strategies from Q2 to Q8, including
|
|
K-quants and legacy formats. Each type represents different trade-offs
|
|
between model size, inference speed, and quality preservation. Custom
|
|
variants (L, XL, XXL) enable tensor-specific precision control for
|
|
embeddings, attention layers, and feed-forward networks.
|
|
"""
|
|
|
|
# Q2 variants (smallest, lowest quality)
|
|
Q2_K = "Q2_K"
|
|
Q2_K_S = "Q2_K_S"
|
|
|
|
# Q3 K-quants
|
|
Q3_K_S = "Q3_K_S"
|
|
Q3_K_M = "Q3_K_M" # llama.cpp default: Q6_K embeddings, Q4_K output, Q5_K V/FFN-down
|
|
Q3_K_L = "Q3_K_L" # Bartowski: Upgrades output to Q5_K (from M baseline)
|
|
Q3_K_XL = "Q3_K_XL" # Bartowski: Q8_0 embeddings + Q5_K output (from M baseline)
|
|
|
|
# Q4 K-quants (most popular)
|
|
Q4_K_S = "Q4_K_S"
|
|
Q4_K_M = "Q4_K_M" # llama.cpp default: Q6_K embeddings, Q6_K V/FFN-down
|
|
Q4_K_L = "Q4_K_L" # Bartowski: Upgrades embeddings to Q8_0 (from M baseline)
|
|
|
|
# Q5 K-quants
|
|
Q5_K_S = "Q5_K_S"
|
|
Q5_K_M = "Q5_K_M" # llama.cpp default: Q6_K embeddings, Q6_K V/FFN-down
|
|
Q5_K_L = "Q5_K_L" # Bartowski: Upgrades embeddings to Q8_0 (from M baseline)
|
|
|
|
# Q6_K variants
|
|
Q6_K = "Q6_K"
|
|
Q6_K_L = "Q6_K_L" # Bartowski: Upgrades embeddings to Q8_0 (all else stays Q6_K)
|
|
|
|
# Q8_0 (highest common quantisation)
|
|
Q8_0 = "Q8_0"
|
|
|
|
# Legacy quantisation formats
|
|
Q4_0 = "Q4_0"
|
|
Q4_1 = "Q4_1"
|
|
Q5_0 = "Q5_0"
|
|
Q5_1 = "Q5_1"
|
|
|
|
|
|
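
# Illustrative note (not part of the original module): because QuantisationType
# derives from StrEnum, each member compares equal to, and formats as, its raw
# string value, so members can be passed wherever a plain type string is expected:
#
#   >>> QuantisationType.Q4_K_M == "Q4_K_M"
#   True
#   >>> f"Quantising to {QuantisationType.Q4_K_M}"
#   'Quantising to Q4_K_M'
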
class URLType(StrEnum):
    """Supported URL formats for model source specification.

    Categorises input URL formats to enable appropriate handling strategies. HuggingFace URLs
    require full model download and conversion, whilst Ollama GGUF URLs allow direct GGUF file
    downloads with pattern matching for efficient processing of pre-quantised models.
    """

    HUGGINGFACE = "huggingface"
    OLLAMA_GGUF = "ollama_gguf"


class QuantisationConfig(BaseModel):
    """Configuration for a specific quantisation method.

    Defines quantisation parameters for different model variants. The L and XL variants specify a
    base type with optional embedding and output overrides, leveraging the fact that M variants
    already include strategic enhancements to critical layers (embeddings, attention V, and FFN
    down).
    """

    model_config = ConfigDict(use_enum_values=True)

    name: str
    description: str
    base_precision: int  # Base precision level (2, 3, 4, 5, 6, 8)
    base_type: str  # Base quantisation type for llama-cpp (e.g. "Q3_K_M")
    embedding_type: str | None = None  # Override for embeddings (e.g. "Q8_0")
    output_type: str | None = None  # Override for output layer (e.g. "Q5_K")
    inherent_enhancements: dict[str, str] | None = None  # M variant built-in enhancements

    def get_layer_config(self, configs_dict: dict | None = None) -> dict[str, str]:
        """Get layer configuration for display purposes.

        Returns layer precision specifications based on what the base_type inherently
        does (from inherent_enhancements) plus any L/XL overrides. This is purely
        for documentation and display - the actual quantisation uses base_type with
        tensor-specific overrides applied directly by the quantisation engine.

        Returns:
            Dictionary mapping layer types to quantisation specifications for display.
        """
        # Build base quantisation string from precision
        base = f"Q{self.base_precision}_K" if self.base_precision < 8 else "Q8_0"

        # Get inherent enhancements for display - inherit from base type if this is L/XL variant
        enhancements = self.inherent_enhancements or {}

        # If this config has a base_type and no inherent enhancements, inherit for display
        if self.base_type and self.base_type != base and not enhancements and configs_dict:
            # Look up base type by string matching
            for config in configs_dict.values():
                if config.name == self.base_type:
                    if config.inherent_enhancements:
                        enhancements = config.inherent_enhancements
                    break

        # Start with what the base type inherently does
        embed = enhancements.get("embeddings", base)
        attn_v = enhancements.get("attention_v", base)
        ffn_down = enhancements.get("ffn_down", base)
        output = base  # Default output to base

        # Apply L/XL overrides for display (these take precedence in the display)
        embed = self.embedding_type or embed
        output = self.output_type or output

        # Build QKV string (Q/K always use base, V may be enhanced by base type)
        qkv = f"{base}/{base}/{attn_v}"

        return {
            "embed": embed,
            "output": output,
            "qkv": qkv,
            "gate_up": base,  # Gate and up always use base quantisation
            "down": ffn_down,  # Down uses what base type inherently does
        }
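
    # Illustrative sketch (not part of the original module; field values are
    # hypothetical, not taken from the project's config registry). For a
    # Q4_K_L-style config that upgrades only the embeddings:
    #
    #   >>> cfg = QuantisationConfig(
    #   ...     name="Q4_K_L", description="Q4_K_M with Q8_0 embeddings",
    #   ...     base_precision=4, base_type="Q4_K_M", embedding_type="Q8_0",
    #   ... )
    #   >>> cfg.get_layer_config()
    #   {'embed': 'Q8_0', 'output': 'Q4_K', 'qkv': 'Q4_K/Q4_K/Q4_K', 'gate_up': 'Q4_K', 'down': 'Q4_K'}
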
    def get_compact_config(self, configs_dict: dict | None = None) -> str:
        """Get compact configuration string with single-letter abbreviations.

        Creates a compact configuration string using E/O/A/F notation for
        embeddings, output, attention, and FFN layers respectively. This provides
        a concise representation of layer-specific quantisation levels for quick
        comparison and display in user interfaces.

        Returns:
            Formatted configuration string like "Q6:E Q4:O Q3:A Q5:F".
        """
        layers = self.get_layer_config(configs_dict)

        # Parse QKV values
        qkv_parts = layers["qkv"].split("/")
        q_val = qkv_parts[0] if qkv_parts else layers["qkv"]
        k_val = qkv_parts[1] if len(qkv_parts) > 1 else q_val
        v_val = qkv_parts[2] if len(qkv_parts) > 2 else k_val

        # Special case: uniform quantisation
        if (
            layers["embed"]
            == layers["output"]
            == q_val
            == k_val
            == v_val
            == layers["gate_up"]
            == layers["down"]
        ):
            if self.name == "Q6_K":
                return "Q6_K all layers"
            if self.name == "Q8_0":
                return "Q8_0 all layers"
            return f"{layers['embed']} all layers"

        # Build component groups
        quant_components = defaultdict(list)

        def add_component(value: str, component: str) -> None:
            if value:
                # Extract precision from quantisation string
                precision = self._extract_precision(value)
                quant_components[f"Q{precision}"].append(component)

        # Add components
        add_component(layers["embed"], "E")
        add_component(layers["output"], "O")

        # Attention components
        if q_val == k_val == v_val:
            add_component(q_val, "A")
        else:
            if q_val == k_val:
                add_component(q_val, "Aqk")
            else:
                add_component(q_val, "Aq")
                add_component(k_val, "Ak")
            add_component(v_val, "Av")

        # FFN components
        if layers["gate_up"] == layers["down"]:
            add_component(layers["gate_up"], "F")
        else:
            add_component(layers["gate_up"], "Fgu")
            add_component(layers["down"], "Fd")

        # Sort and format
        precision_order = ["Q8", "Q6", "Q5", "Q4", "Q3", "Q2"]
        component_order = ["E", "O", "A", "Av", "Aqk", "Aq", "Ak", "F", "Fd", "Fgu"]

        sorted_quants = sorted(
            quant_components.items(),
            key=lambda x: precision_order.index(x[0])
            if x[0] in precision_order
            else len(precision_order),
        )

        components = []
        for quant_level, parts in sorted_quants:
            sorted_parts = sorted(
                parts,
                key=lambda x: component_order.index(x)
                if x in component_order
                else len(component_order),
            )
            components.append(f"{quant_level}:{'/'.join(sorted_parts)}")

        return " ".join(components)
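
    # Illustrative sketch (not part of the original module), continuing the
    # hypothetical Q4_K_L-style config above: the layer map collapses into the
    # E/O/A/F notation, grouped by precision from highest to lowest:
    #
    #   >>> cfg.get_compact_config()
    #   'Q8:E Q4:O/A/F'
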
    def _extract_precision(self, quant_str: str) -> int:
        """Extract precision level from quantisation string.

        Parses quantisation type strings to extract the numerical precision level.
        Handles both K-quant formats (Q3_K, Q4_K_M) and legacy formats (Q8_0, Q5_1)
        by matching the digit following the Q prefix.

        Returns:
            Precision level as integer, defaulting to 4 if parsing fails.
        """
        # Extract the digit from strings like "Q3_K", "Q8_0", "Q6_K"
        match = re.search(r"Q(\d+)", quant_str)
        return int(match.group(1)) if match else 4  # Default to 4 if parsing fails
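
    # Illustrative sketch (not part of the original module): the regex keys off
    # the digit following "Q", so K-quants and legacy formats resolve alike,
    # while unrecognised strings fall back to 4:
    #
    #   >>> cfg._extract_precision("Q3_K_M")
    #   3
    #   >>> cfg._extract_precision("Q8_0")
    #   8
    #   >>> cfg._extract_precision("F16")
    #   4
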
class ModelSource(BaseModel):
    """Represents a model source with parsed information from URL analysis.

    Contains comprehensive metadata extracted from model URLs including source
    repository details, author information, and GGUF file patterns. Enables
    differentiation between regular HuggingFace repositories requiring conversion
    and GGUF repositories allowing direct file downloads.
    """

    model_config = ConfigDict(use_enum_values=True, protected_namespaces=())

    url: str
    url_type: URLType
    source_model: str
    original_author: str
    model_name: str
    gguf_file_pattern: str | None = None
    is_gguf_repo: bool = False

    @field_validator("url")
    @classmethod
    def validate_url(cls, v: str) -> str:
        """Validate that URL is not empty.

        Ensures the provided URL string is not empty or None,
        as this is required for model source identification.

        Returns:
            The validated URL string.

        Raises:
            ValueError: If URL is empty or None.
        """
        if not v:
            msg = "URL cannot be empty"
            raise ValueError(msg)
        return v


class QuantisationResult(BaseModel):
    """Result of a quantisation operation with comprehensive status tracking.

    Captures the outcome of individual quantisation attempts including success
    status, file paths, sizes, and error details. Supports workflow status
    tracking from planning through processing to completion, enabling real-time
    progress reporting and parallel upload coordination.
    """

    model_config = ConfigDict(use_enum_values=True, arbitrary_types_allowed=True)

    quantisation_type: QuantisationType
    success: bool
    file_path: Path | None = None
    file_size: str | None = None
    method_used: str | None = None
    error_message: str | None = None
    status: str = "pending"  # planned, processing, uploading, completed, failed


class QuantisationContext(BaseModel):
    """Context object containing all parameters needed for quantisation execution.

    Encapsulates quantisation parameters to reduce method argument counts
    and improve code maintainability following the parameter object pattern.
    """

    model_config = ConfigDict(frozen=True, protected_namespaces=())

    f16_model_path: Path
    model_source: ModelSource
    config: QuantisationConfig
    models_dir: Path
    imatrix_path: Path | None = None
    base_quant: str = "Q4_K_M"

    def get_output_path(self) -> Path:
        """Generate output path for quantised model.

        Returns:
            Path to the output GGUF file.
        """
        output_filename = (
            f"{self.model_source.original_author}-"
            f"{self.model_source.model_name}-"
            f"{self.config.name}.gguf"
        )
        return self.models_dir / self.model_source.model_name / output_filename
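
# Illustrative sketch (not part of the original module; names are hypothetical):
# get_output_path() composes "<author>-<model>-<config name>.gguf" inside a
# per-model directory under models_dir:
#
#   >>> source = ModelSource(
#   ...     url="https://huggingface.co/example-org/example-model",
#   ...     url_type=URLType.HUGGINGFACE,
#   ...     source_model="example-org/example-model",
#   ...     original_author="example-org",
#   ...     model_name="example-model",
#   ... )
#   >>> ctx = QuantisationContext(
#   ...     f16_model_path=Path("models/example-model/example-model-f16.gguf"),
#   ...     model_source=source,
#   ...     config=cfg,  # the hypothetical Q4_K_L-style config from above
#   ...     models_dir=Path("models"),
#   ... )
#   >>> str(ctx.get_output_path())
#   'models/example-model/example-org-example-model-Q4_K_L.gguf'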