Compare commits


3 commits

Author SHA256 Message Date
21d8c03aea Refactor modules 2025-08-09 17:16:02 +01:00
de6b853175 Support GGML quants 2025-08-09 12:58:58 +01:00
633efdc305 Use proper binaries 2025-08-09 10:55:42 +01:00
47 changed files with 6335 additions and 3082 deletions

1
.gitignore vendored
View file

@@ -58,3 +58,4 @@ venv.bak/
# Working directories
work/
quantisation_work/
.cache/

View file

@@ -11,6 +11,19 @@ from __future__ import annotations
from helpers.models.quantisation import QuantisationConfig, QuantisationType
QUANTISATION_CONFIGS: dict[QuantisationType, QuantisationConfig] = {
# Basic quantisation profiles
QuantisationType.Q2_0: QuantisationConfig(
name="Q2_0",
description="Basic Q2_0 quantisation (2-bit, smallest)",
base_precision=2,
base_type="Q2_0",
),
QuantisationType.Q3_0: QuantisationConfig(
name="Q3_0",
description="Basic Q3_0 quantisation (3-bit)",
base_precision=3,
base_type="Q3_0",
),
# Standard quantisation profiles
QuantisationType.Q2_K: QuantisationConfig(
name="Q2_K",
@@ -46,15 +59,15 @@ QUANTISATION_CONFIGS: dict[QuantisationType, QuantisationConfig] = {
description="Bartowski Q3_K_L: Q3_K_M base with Q5_K output",
base_type="Q3_K_M",
base_precision=3,
output_type="Q5_K",
output_type="q5_k",
),
QuantisationType.Q3_K_XL: QuantisationConfig(
name="Q3_K_XL",
description="Bartowski Q3_K_XL: Q3_K_M base with Q8_0 embeddings + Q6_K output",
base_type="Q3_K_M",
base_precision=3,
embedding_type="Q8_0",
output_type="Q6_K",
embedding_type="q8_0",
output_type="q6_k",
),
QuantisationType.Q4_K_S: QuantisationConfig(
name="Q4_K_S",
@@ -78,7 +91,7 @@ QUANTISATION_CONFIGS: dict[QuantisationType, QuantisationConfig] = {
description="Bartowski Q4_K_L: Q4_K_M base with Q8_0 embeddings",
base_type="Q4_K_M",
base_precision=4,
embedding_type="Q8_0",
embedding_type="q8_0",
),
# Additional standard quantisation profiles
QuantisationType.Q5_K_S: QuantisationConfig(
@@ -103,7 +116,13 @@ QUANTISATION_CONFIGS: dict[QuantisationType, QuantisationConfig] = {
description="Bartowski Q5_K_L: Q5_K_M base with Q8_0 embeddings",
base_type="Q5_K_M",
base_precision=5,
embedding_type="Q8_0",
embedding_type="q8_0",
),
QuantisationType.Q6_0: QuantisationConfig(
name="Q6_0",
description="Basic Q6_0 quantisation (6-bit)",
base_precision=6,
base_type="Q6_0",
),
QuantisationType.Q6_K: QuantisationConfig(
name="Q6_K",
@@ -121,11 +140,17 @@ QUANTISATION_CONFIGS: dict[QuantisationType, QuantisationConfig] = {
description="Bartowski Q6_K_L: Q6_K base with Q8_0 output",
base_type="Q6_K",
base_precision=6,
output_type="Q8_0",
output_type="q8_0",
),
QuantisationType.Q8_K: QuantisationConfig(
name="Q8_K",
description="Q8_K quantisation (highest quality, largest size)",
base_precision=8,
base_type="Q8_K",
),
QuantisationType.Q8_0: QuantisationConfig(
name="Q8_0",
description="Q8_0 quantisation (highest quality, largest size)",
description="Basic Q8_0 quantisation (8-bit flat)",
base_precision=8,
base_type="Q8_0",
),
@@ -157,46 +182,57 @@ QUANTISATION_CONFIGS: dict[QuantisationType, QuantisationConfig] = {
}
# Default profile set for optimal quality/size balance
DEFAULT_QUANTISATION_TYPES: list[QuantisationType] = [
# Q3 variants (smallest)
QuantisationType.Q3_K_M,
QuantisationType.Q3_K_L,
QuantisationType.Q3_K_XL,
# Q4 variants
QuantisationType.Q4_0, # Basic - always available
QuantisationType.Q4_K_M,
QuantisationType.Q4_K_L,
# Q5 variants
QuantisationType.Q5_0, # Basic - always available
QuantisationType.Q5_K_M,
QuantisationType.Q5_K_L,
# Q6 variants
QuantisationType.Q6_0, # Basic - always available
QuantisationType.Q6_K,
QuantisationType.Q6_K_L,
QuantisationType.Q8_0,
# Q8 variants (largest)
QuantisationType.Q8_0, # Basic - always available
QuantisationType.Q8_K,
]
SUPPORTED_QUANTISATION_TYPES: list[QuantisationType] = [
# Q2 variants
QuantisationType.Q2_0,
QuantisationType.Q2_K,
QuantisationType.Q2_K_S,
# Q3 K-quants
QuantisationType.Q3_0,
QuantisationType.Q3_K_S,
QuantisationType.Q3_K_M,
QuantisationType.Q3_K_L,
QuantisationType.Q3_K_XL,
# Q4 K-quants
QuantisationType.Q4_0,
QuantisationType.Q4_1,
QuantisationType.Q4_K_S,
QuantisationType.Q4_K_M,
QuantisationType.Q4_K_L,
# Q5 K-quants
QuantisationType.Q5_0,
QuantisationType.Q5_1,
QuantisationType.Q5_K_S,
QuantisationType.Q5_K_M,
QuantisationType.Q5_K_L,
# Q6_K
QuantisationType.Q6_0,
QuantisationType.Q6_K,
QuantisationType.Q6_K_L,
# Q8_0
QuantisationType.Q8_0,
# Legacy formats
QuantisationType.Q4_0,
QuantisationType.Q4_1,
QuantisationType.Q5_0,
QuantisationType.Q5_1,
QuantisationType.Q8_K,
]
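For orientation, a minimal usage sketch (not part of this diff) showing how the profile tables above might be consumed. The import path for this configuration module is an assumption, and unset per-profile overrides are read defensively with getattr:

# Illustrative only; the module path below is assumed, as this diff does not show it.
from helpers.config.quantisation_configs import (
    DEFAULT_QUANTISATION_TYPES,
    QUANTISATION_CONFIGS,
)

for qtype in DEFAULT_QUANTISATION_TYPES:
    cfg = QUANTISATION_CONFIGS[qtype]
    print(
        f"{cfg.name}: base={cfg.base_type} ({cfg.base_precision}-bit), "
        f"embeddings={getattr(cfg, 'embedding_type', None)}, "
        f"output={getattr(cfg, 'output_type', None)}"
    )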

View file

@@ -0,0 +1,17 @@
"""Filesystem operations and management.
Provides utilities for file cleanup, workspace management, and
directory operations throughout the quantisation workflow.
"""
from __future__ import annotations
from helpers.filesystem.cleanup import FileCleanup
from helpers.filesystem.operations import FilesystemService
from helpers.filesystem.workspace import WorkspaceManager
__all__ = [
"FileCleanup",
"FilesystemService",
"WorkspaceManager",
]

View file

@@ -0,0 +1,81 @@
"""File cleanup operations for the quantisation workflow.
Manages removal of temporary files, model cleanup after processing,
and disk space recovery during quantisation operations.
"""
from __future__ import annotations
from shutil import rmtree as shutil_rmtree
from typing import TYPE_CHECKING
from helpers.logger import logger
if TYPE_CHECKING:
from pathlib import Path
from helpers.models.quantisation import ModelSource
class FileCleanup:
"""Handles cleanup of temporary and intermediate files.
Provides methods for removing processed model files, temporary
conversions, and other artifacts to manage disk space efficiently
during quantisation workflows.
"""
@staticmethod
def cleanup_files(f16_model_path: Path, model_source: ModelSource, models_dir: Path) -> None:
"""Clean up temporary files after processing.
Removes F16 model and original format files to save disk space
after successful quantisation and upload. Processes both F16
GGUF files and original model formats to maximise storage recovery.
"""
if f16_model_path.exists():
logger.info(f"Removing F16 model {f16_model_path.name} to save disk space...")
f16_model_path.unlink()
if not model_source.is_gguf_repo:
FileCleanup.cleanup_original_model(model_source, models_dir)
@staticmethod
def cleanup_original_model(model_source: ModelSource, models_dir: Path) -> None:
"""Clean up original model files after successful conversion.
Removes SafeTensors files to save disk space whilst preserving
configuration, tokeniser, and metadata files for reference. The
design prioritises space efficiency over re-conversion capability.
"""
model_dir = models_dir / model_source.model_name
safetensor_files = list(model_dir.glob("*.safetensors"))
if safetensor_files:
logger.info(f"Removing {len(safetensor_files)} SafeTensors files...")
for file in safetensor_files:
file.unlink()
logger.info("Keeping config files, tokeniser, and metadata for reference")
@staticmethod
def cleanup_quantisation_file(file_path: Path) -> None:
"""Remove a single quantisation file.
Safely removes the specified file with existence checking and
logging for disk space management during quantisation workflows.
"""
if file_path.exists():
logger.info(f"Removing {file_path.name} to save disk space...")
file_path.unlink()
@staticmethod
def cleanup_temp_directory(temp_dir: Path) -> None:
"""Clean up a temporary directory and all its contents.
Recursively removes the directory and all subdirectories with
error tolerance to handle locked or missing files gracefully.
"""
if temp_dir.exists() and temp_dir.is_dir():
logger.debug(f"Cleaning up temporary directory: {temp_dir}")
shutil_rmtree(temp_dir, ignore_errors=True)
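A brief usage sketch (illustrative, not part of this diff); it sticks to the Path-based helpers because ModelSource construction is defined elsewhere in the repository:

from pathlib import Path

from helpers.filesystem.cleanup import FileCleanup

work_dir = Path("quantisation_work")  # hypothetical layout for illustration
FileCleanup.cleanup_quantisation_file(work_dir / "models" / "example" / "example-Q4_K_M.gguf")
FileCleanup.cleanup_temp_directory(work_dir / "temp_imatrix_abc123")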

View file

@@ -1,8 +1,7 @@
"""Filesystem operations service.
"""Core filesystem operations.
Provides unified filesystem operations including file discovery, size
calculation, and path management. Consolidates common filesystem patterns
used across quantisation and conversion workflows.
calculation, and path management for quantisation workflows.
"""
from __future__ import annotations
@@ -21,8 +20,7 @@ class FilesystemService:
"""Handles filesystem operations with consistent error handling.
Provides methods for file discovery, size formatting, and JSON loading
with proper error handling and logging. Ensures consistent behaviour
across different tools and workflows.
with proper error handling and logging.
"""
@staticmethod
@@ -31,10 +29,10 @@
Attempts to use `du -h` for human-readable output, falling back to
Python calculation if the system command fails. Provides consistent
size formatting across the toolset.
formatting across different platforms and file sizes.
Returns:
Human-readable file size string (e.g., "1.5G", "750M").
Human-readable file size string (e.g. "1.5G", "750M").
"""
try:
result = subprocess.run(
@@ -43,7 +41,6 @@ class FilesystemService:
return result.stdout.split()[0]
except (subprocess.CalledProcessError, FileNotFoundError):
# Fallback to Python calculation
try:
size_bytes: float = float(file_path.stat().st_size)
for unit in ["B", "K", "M", "G", "T"]:
@@ -60,8 +57,7 @@
"""Load and parse JSON configuration file.
Provides consistent JSON loading with proper error handling and
encoding specification. Used for loading model configurations,
tokeniser settings, and other JSON-based metadata.
UTF-8 encoding specification for cross-platform compatibility.
Returns:
Parsed JSON content as dictionary.
@@ -81,9 +77,8 @@
"""Find all SafeTensor files in model directory using priority search.
Searches for tensor files in order of preference: single model.safetensors,
sharded model-*-of-*.safetensors files, then any *.safetensors files. This
approach handles both single-file and multi-shard model distributions whilst
ensuring predictable file ordering for conversion consistency.
sharded model-*-of-*.safetensors files, then any *.safetensors files.
The prioritisation ensures optimal handling of different model formats.
Returns:
List of SafeTensor file paths in priority order.
@@ -116,7 +111,7 @@
Searches for GGUF files with optional pattern matching. Prioritises
multi-part files (00001-of-*) over single files for proper handling
of large models split across multiple files.
of sharded model architectures.
Returns:
List of GGUF file paths, sorted with multi-part files first.
@@ -140,8 +135,8 @@
def ensure_directory(path: Path) -> Path:
"""Ensure directory exists, creating if necessary.
Creates directory and all parent directories if they don't exist.
Returns the path for method chaining convenience.
Creates directory and all parent directories if they don't exist,
using atomic operations to handle concurrent access gracefully.
Returns:
The directory path.
@@ -153,8 +148,8 @@
def cleanup_directory(path: Path, pattern: str = "*") -> int:
"""Remove files matching pattern from directory.
Safely removes files matching the specified glob pattern. Returns
count of files removed for logging purposes.
Safely removes files matching the specified glob pattern with
comprehensive error handling to prevent workflow interruption.
Returns:
Number of files removed.
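A short usage sketch (illustrative, not part of this diff) exercising the methods shown above; the directory layout is hypothetical:

from pathlib import Path

from helpers.filesystem.operations import FilesystemService

fs = FilesystemService()
model_dir = fs.ensure_directory(Path("quantisation_work/models/example"))
for tensor_file in fs.find_safetensor_files(model_dir):
    print(tensor_file.name, fs.get_file_size(tensor_file))
removed = fs.cleanup_directory(model_dir, pattern="*.tmp")
print(f"Removed {removed} temporary files")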

View file

@@ -0,0 +1,146 @@
"""Workspace management for quantisation operations.
Manages working directories, model storage paths, and temporary
file locations throughout the quantisation workflow.
"""
from __future__ import annotations
import tempfile
from pathlib import Path
from shutil import disk_usage as shutil_disk_usage, rmtree as shutil_rmtree
from helpers.logger import logger
class WorkspaceManager:
"""Manages workspace directories and paths.
Provides centralised management of working directories, model
storage, and temporary file locations with automatic directory
creation and validation.
"""
def __init__(self, work_dir: Path | None = None) -> None:
"""Initialise workspace manager.
Sets up base working directory structure with models and temporary
file directories. Defaults to quantisation_work in current directory
if no path is specified.
"""
self.work_dir = work_dir or Path.cwd() / "quantisation_work"
self.models_dir = self.work_dir / "models"
self._setup_directories()
def _setup_directories(self) -> None:
"""Create necessary workspace directories."""
self.work_dir.mkdir(parents=True, exist_ok=True)
self.models_dir.mkdir(parents=True, exist_ok=True)
logger.debug(f"Workspace initialised at: {self.work_dir}")
def get_model_dir(self, model_name: str) -> Path:
"""Get directory path for a specific model.
Creates the model directory if it doesn't exist and returns the path
for storing model files and quantisation outputs.
Returns:
Path to model directory.
"""
model_dir = self.models_dir / model_name
model_dir.mkdir(parents=True, exist_ok=True)
return model_dir
def get_temp_dir(self, prefix: str = "temp") -> Path:
"""Get a temporary directory path within workspace.
Creates a unique temporary directory with specified prefix within
the workspace for intermediate processing files.
Returns:
Path to temporary directory.
"""
return Path(tempfile.mkdtemp(prefix=f"{prefix}_", dir=self.work_dir))
def get_imatrix_dir(self, model_name: str) -> Path:
"""Get directory for importance matrix files.
Creates and returns the path to the imatrix directory for storing
importance matrices used in advanced quantisation methods.
Returns:
Path to imatrix directory.
"""
imatrix_dir = self.models_dir / model_name / "imatrix"
imatrix_dir.mkdir(parents=True, exist_ok=True)
return imatrix_dir
def get_quantisation_output_path(
self,
model_name: str,
author: str,
quant_type: str,
) -> Path:
"""Get output path for a quantised model.
Constructs standardised filename and path for quantised model output
using author-model-quantisation format for consistent naming.
Returns:
Path for quantised model output.
"""
model_dir = self.get_model_dir(model_name)
filename = f"{author}-{model_name}-{quant_type}.gguf"
return model_dir / filename
def cleanup_workspace(self) -> None:
"""Clean up entire workspace directory."""
if self.work_dir.exists():
logger.info(f"Cleaning up workspace: {self.work_dir}")
shutil_rmtree(self.work_dir, ignore_errors=True)
@property
def disk_usage(self) -> dict[str, float]:
"""Get disk usage statistics for workspace.
Returns:
Dictionary with size in GB for work_dir and models_dir.
"""
def get_dir_size(path: Path) -> float:
"""Calculate total size of directory in GB.
Recursively traverses directory tree to calculate total file
sizes with GB conversion for human-readable output.
Returns:
Total size of directory in GB.
"""
total = 0
if path.exists():
for item in path.rglob("*"):
if item.is_file():
total += item.stat().st_size
return total / (1024**3) # Convert to GB
return {
"work_dir": get_dir_size(self.work_dir),
"models_dir": get_dir_size(self.models_dir),
}
def validate_space(self, required_gb: float = 50.0) -> bool:
"""Check if sufficient disk space is available.
Validates available disk space against required threshold, logging
warnings when space is insufficient for quantisation operations.
Returns:
True if sufficient space available.
"""
stat = shutil_disk_usage(self.work_dir)
free_gb = stat.free / (1024**3)
if free_gb < required_gb:
logger.warning(f"Low disk space: {free_gb:.1f}GB free, {required_gb:.1f}GB recommended")
return False
return True
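A usage sketch (illustrative, not part of this diff) tying the workspace helpers together; the model and author names are placeholders:

from helpers.filesystem.workspace import WorkspaceManager

workspace = WorkspaceManager()  # defaults to ./quantisation_work
if workspace.validate_space(required_gb=50.0):
    output_path = workspace.get_quantisation_output_path(
        model_name="example-model",
        author="example-author",
        quant_type="Q4_K_M",
    )
    print(f"Quantised model will be written to {output_path}")
print(f"Disk usage (GB): {workspace.disk_usage}")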

11
helpers/ggml/__init__.py Normal file
View file

@@ -0,0 +1,11 @@
"""GGML quantisation operations.
Provides numpy-based GGML block quantisation for architectures
not supported by llama.cpp.
"""
from __future__ import annotations
from helpers.ggml.quantiser import GGMLQuantiser
__all__ = ["GGMLQuantiser"]

574
helpers/ggml/quantiser.py Normal file
View file

@@ -0,0 +1,574 @@
"""GGML block quantisation for unsupported architectures.
Implements GGML-style block quantisation formats (Q4_0, Q5_0, Q6_0, Q8_0) using numpy,
following the ggml block layouts (with simplified packing for some formats). This allows
quantisation of models with architectures not yet supported by llama.cpp.
"""
from __future__ import annotations
import struct
import traceback
from typing import TYPE_CHECKING, Any
import gguf
import numpy as np
from helpers.filesystem import FilesystemService
from helpers.logger import logger
if TYPE_CHECKING:
from pathlib import Path
# GGML block sizes for different quantisation types
QK4_0 = 32 # Block size for Q4_0
QK5_0 = 32 # Block size for Q5_0
QK5_1 = 32 # Block size for Q5_1
QK8_0 = 32 # Block size for Q8_0
class GGMLQuantiser:
"""Implements GGML quantisation formats for architecture-agnostic models.
Provides proper GGML block quantisation using numpy, following the exact
format specifications. This enables Q4_0, Q5_0, and Q8_0 quantisation
for models with unsupported architectures.
"""
def __init__(self) -> None:
"""Initialise GGML quantiser."""
self.fs = FilesystemService()
def get_supported_types(self) -> list[str]:
"""Get supported basic quantisation types.
Returns:
List of supported quantisation type strings.
"""
return ["Q4_0", "Q5_0", "Q6_0", "Q8_0"]
def _extract_architecture_string(self, arch_field: Any) -> str:
"""Extract architecture string from GGUF field data.
Handles various formats of architecture field storage in GGUF files.
Returns:
Architecture string or 'unknown' if extraction fails.
"""
if not arch_field:
return "unknown"
if hasattr(arch_field, "parts") and arch_field.parts:
return self._extract_from_parts_array(arch_field)
if hasattr(arch_field, "data"):
return self._extract_from_data_field(arch_field.data)
return "unknown"
def _extract_from_parts_array(self, arch_field: Any) -> str:
"""Extract architecture from GGUF parts array format.
Returns:
Architecture string or 'unknown' if extraction fails.
"""
if len(arch_field.data) == 0:
return "unknown"
idx = arch_field.data[0] if isinstance(arch_field.data, (list, tuple)) else arch_field.data
if idx >= len(arch_field.parts):
return "unknown"
return self._decode_part(arch_field.parts[idx])
def _decode_part(self, arch_part: Any) -> str:
"""Decode architecture part to string.
Returns:
Decoded string representation.
"""
if isinstance(arch_part, bytes):
return arch_part.decode("utf-8")
if isinstance(arch_part, str):
return arch_part
if isinstance(arch_part, (list, tuple)) and len(arch_part) > 0:
# Handle nested format
if isinstance(arch_part[0], bytes):
return arch_part[0].decode("utf-8")
return str(arch_part[0])
return str(arch_part)
def _extract_from_data_field(self, data: Any) -> str:
"""Extract architecture from GGUF data field.
Returns:
Architecture string or 'unknown' if extraction fails.
"""
if isinstance(data, np.ndarray):
# It's a numpy array of bytes - convert to string
try:
return bytes(data).decode("utf-8")
except (UnicodeDecodeError, ValueError):
# If that fails, try converting as ASCII values
return "".join(chr(c) for c in data if c < 128)
elif isinstance(data, bytes):
return data.decode("utf-8")
elif isinstance(data, str):
return data
else:
return str(data)
def _copy_metadata_fields(self, reader: gguf.GGUFReader, writer: gguf.GGUFWriter) -> None:
"""Copy metadata fields from reader to writer, excluding file type."""
logger.info("📋 Copying metadata...")
for key, field in reader.fields.items():
# Skip the file type field - we'll set our own
if key == "general.file_type":
continue
# Handle different field types
if field.types:
field_type = field.types[0]
field_data = field.parts[field.data[0]] if field.parts else field.data
self._copy_field_by_type(writer, key, field_type, field_data, field)
def _copy_field_by_type(
self,
writer: gguf.GGUFWriter,
key: str,
field_type: gguf.GGUFValueType,
field_data: Any,
field: Any,
) -> None:
"""Copy a single field based on its type."""
if field_type == gguf.GGUFValueType.STRING:
# Handle both bytes and string types
string_val = field_data[0]
if isinstance(string_val, bytes):
string_val = string_val.decode("utf-8")
elif isinstance(string_val, int):
string_val = str(string_val)
writer.add_string(key, string_val)
elif field_type == gguf.GGUFValueType.UINT32:
writer.add_uint32(key, int(field.data[0]))
elif field_type == gguf.GGUFValueType.FLOAT32:
writer.add_float32(key, float(field.data[0]))
elif field_type == gguf.GGUFValueType.BOOL:
writer.add_bool(key, bool(field.data[0]))
elif field_type == gguf.GGUFValueType.ARRAY:
writer.add_array(key, field.data)
else:
# Skip unsupported field types for now
# Future enhancement: Handle additional GGUF field types as needed
pass
def _get_file_type_mapping(self) -> dict[str, gguf.GGMLQuantizationType]:
"""Get mapping from quantisation type strings to GGML enums.
Returns:
Mapping from quantisation type strings to GGML enums.
"""
return {
"Q4_0": gguf.GGMLQuantizationType.Q4_0,
"Q5_0": gguf.GGMLQuantizationType.Q5_0,
"Q6_0": gguf.GGMLQuantizationType.Q6_K, # Q6_0 uses Q6_K enum
"Q8_0": gguf.GGMLQuantizationType.Q8_0,
}
def _process_tensor_list(
self, reader: gguf.GGUFReader, writer: gguf.GGUFWriter, quant_type: str
) -> None:
"""Process all tensors for quantisation."""
logger.info(f"🔄 Quantising {len(reader.tensors)} tensors to {quant_type}...")
for i, tensor in enumerate(reader.tensors):
if i % 50 == 0:
logger.info(f" Processing tensor {i}/{len(reader.tensors)}...")
self._process_single_tensor(tensor, writer, quant_type)
def _process_single_tensor(self, tensor: Any, writer: gguf.GGUFWriter, quant_type: str) -> None:
"""Process a single tensor for quantisation or preserve as-is."""
# Get tensor info
name = tensor.name
shape = list(tensor.shape)
data = tensor.data
# Determine if this tensor should be quantised
should_quantise = self._should_quantise_tensor(name)
if not should_quantise:
# Keep original format
writer.add_tensor(name, data, raw_shape=shape, raw_dtype=tensor.tensor_type)
else:
# Quantise the tensor
try:
quantised_data, quant_dtype = self._quantise_tensor(
data, tensor.tensor_type, shape, quant_type
)
writer.add_tensor(name, quantised_data, raw_shape=shape, raw_dtype=quant_dtype)
except ValueError as e:
# If quantisation fails due to shape issues, keep original
logger.warning(f" ⚠️ Cannot quantise {name}: {e}")
logger.warning(" Keeping in original format")
writer.add_tensor(name, data, raw_shape=shape, raw_dtype=tensor.tensor_type)
def _write_output_file(self, writer: gguf.GGUFWriter, output_path: Path) -> bool:
"""Write the final GGUF file and verify creation.
Returns:
True if successful, False otherwise
"""
logger.info(f"💾 Writing {output_path.name}...")
writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.write_tensors_to_file()
writer.close()
if output_path.exists():
file_size = self.fs.get_file_size(output_path)
logger.info(f"✅ GGML quantisation complete: {file_size}")
return True
logger.error("❌ Output file was not created")
return False
def quantise_basic(
self,
input_path: Path,
output_path: Path,
quant_type: str,
) -> bool:
"""Perform GGML block quantisation on a GGUF file.
Reads a GGUF file, quantises all tensors using the specified
quantisation type, and writes a new GGUF file. Implements proper
GGML block formats for architecture-agnostic quantisation.
Returns:
True if successful, False otherwise
"""
if quant_type not in self.get_supported_types():
logger.error(f"Unsupported quantisation type: {quant_type}")
return False
logger.info(f"🔧 Starting GGML {quant_type} quantisation...")
logger.info("📝 This uses numpy-based block quantisation")
try:
# Read input GGUF
logger.info(f"📖 Reading {input_path.name}...")
reader = gguf.GGUFReader(str(input_path))
# Create output writer with same architecture
arch_field = reader.fields.get("general.architecture")
arch_str = self._extract_architecture_string(arch_field)
logger.info(f"📝 Architecture: {arch_str}")
writer = gguf.GGUFWriter(str(output_path), arch_str)
# Copy all metadata
self._copy_metadata_fields(reader, writer)
# Set file type based on quantisation
file_type_map = self._get_file_type_mapping()
writer.add_file_type(file_type_map[quant_type])
# Process tensors
self._process_tensor_list(reader, writer, quant_type)
# Write the output file
return self._write_output_file(writer, output_path)
except Exception as e:
logger.error(f"❌ GGML quantisation failed: {e}\n{traceback.format_exc()}")
return False
def _should_quantise_tensor(self, tensor_name: str) -> bool:
"""Determine if a tensor should be quantised.
Some tensors like token embeddings should typically remain in
higher precision for quality.
Returns:
True if the tensor should be quantised, False otherwise
"""
# Keep token embeddings and output layers in original precision
# These patterns cover most architectures
keep_original = [
"token_embd",
"output.weight",
"lm_head",
"embed_tokens",
"word_embeddings",
]
for pattern in keep_original:
if pattern in tensor_name:
logger.debug(f" Keeping {tensor_name} in original format")
return False
return True
def _quantise_tensor(
self,
data: np.ndarray,
dtype: gguf.GGMLQuantizationType,
shape: list[int],
quant_type: str,
) -> tuple[np.ndarray, gguf.GGMLQuantizationType]:
"""Quantise a tensor using GGML block quantisation.
Returns:
Tuple of (quantised_data, new_dtype)
"""
# Work directly with numpy array - convert to float32 if needed
if dtype in {gguf.GGMLQuantizationType.F32, gguf.GGMLQuantizationType.F16}:
arr = data.astype(np.float32)
else:
# Already quantised or unknown type - return as-is
return data, dtype
# Reshape to original shape
arr = arr.reshape(shape)
# Flatten for processing
arr_flat = arr.flatten()
# Apply quantisation
if quant_type == "Q8_0":
quantised = self._quantise_q8_0(arr_flat)
new_dtype = gguf.GGMLQuantizationType.Q8_0
elif quant_type == "Q6_0":
quantised = self._quantise_q6_0(arr_flat)
new_dtype = gguf.GGMLQuantizationType.Q6_K # Q6_0 uses Q6_K enum
elif quant_type == "Q5_0":
quantised = self._quantise_q5_0(arr_flat)
new_dtype = gguf.GGMLQuantizationType.Q5_0
elif quant_type == "Q4_0":
quantised = self._quantise_q4_0(arr_flat)
new_dtype = gguf.GGMLQuantizationType.Q4_0
else:
# Unsupported - return original
return data, dtype
# Convert bytes back to numpy array for gguf writer
return np.frombuffer(quantised, dtype=np.uint8), new_dtype
def _quantise_q8_0(self, arr: np.ndarray) -> bytes:
"""Quantise to Q8_0 format.
Q8_0: Blocks of 32 values, each block has:
- 1 float16 scale factor (2 bytes)
- 32 int8 values (32 bytes)
Total: 34 bytes per 32 values
Returns:
Bytes of the quantised data
"""
n = len(arr)
nb = (n + QK8_0 - 1) // QK8_0 # Number of blocks
output = bytearray()
for i in range(nb):
# Get block of values
start = i * QK8_0
end = min(start + QK8_0, n)
block = arr[start:end]
# Pad if needed
if len(block) < QK8_0:
block = np.pad(block, (0, QK8_0 - len(block)), mode="constant")
# Calculate scale
amax = np.abs(block).max()
scale = amax / 127.0 if amax > 0 else 1.0
# Quantise
quantised = np.round(block / scale).astype(np.int8)
quantised = np.clip(quantised, -128, 127)
output.extend(struct.pack("e", scale)) # 'e' is float16
output.extend(quantised.tobytes())
return bytes(output)
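For clarity, a minimal decoding sketch (illustrative only, not part of this commit) that reverses the 34-byte block layout produced above, reusing the module's struct/numpy imports and the QK8_0 constant:

def _dequantise_q8_0_sketch(blob: bytes, n_values: int) -> np.ndarray:
    """Decode Q8_0 blocks as written by _quantise_q8_0 (sketch only)."""
    block_bytes = 2 + QK8_0  # float16 scale followed by 32 int8 values
    nb = (n_values + QK8_0 - 1) // QK8_0
    out = np.empty(nb * QK8_0, dtype=np.float32)
    for i in range(nb):
        start = i * block_bytes
        (scale,) = struct.unpack("e", blob[start : start + 2])
        quants = np.frombuffer(blob[start + 2 : start + block_bytes], dtype=np.int8)
        out[i * QK8_0 : (i + 1) * QK8_0] = quants.astype(np.float32) * scale
    return out[:n_values]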
def _quantise_q6_0(self, arr: np.ndarray) -> bytes:
"""Quantise to Q6_0 format.
Q6_0: Blocks of 32 values with 6-bit quantisation
- 1 float16 scale (2 bytes)
- 1 float16 min value (2 bytes)
- 24 bytes of packed 6-bit values (32 values * 6 bits = 192 bits = 24 bytes)
Total: 28 bytes per 32 values
Returns:
Bytes of the quantised data
"""
n = len(arr)
nb = (n + QK8_0 - 1) // QK8_0 # Use same block size as Q8_0
output = bytearray()
for i in range(nb):
# Get block
start = i * QK8_0
end = min(start + QK8_0, n)
block = arr[start:end]
# Pad if needed
if len(block) < QK8_0:
block = np.pad(block, (0, QK8_0 - len(block)), mode="constant")
# Calculate scale and min
vmin = block.min()
vmax = block.max()
scale = (vmax - vmin) / 63.0 if vmax > vmin else 1.0
# Quantise to 6-bit (0-63)
quantised = np.round((block - vmin) / scale).astype(np.uint8)
quantised = np.clip(quantised, 0, 63)
# Pack scale and min
output.extend(struct.pack("e", scale))
output.extend(struct.pack("e", vmin))
# Pack 6-bit values (simplified - using 1 byte per value)
# Proper implementation would pack 4 values into 3 bytes
for q in quantised:
output.append(q)
# Pad to expected size
while len(output) % 28 != 0:
output.append(0)
return bytes(output)
def _quantise_q5_0(self, arr: np.ndarray) -> bytes:
"""Quantise to Q5_0 format.
Q5_0: Blocks of 32 values with 5-bit quantisation
- 1 float16 scale (2 bytes)
- 1 float16 min value (2 bytes)
- 20 bytes of packed 5-bit values (32 values * 5 bits = 160 bits = 20 bytes)
Total: 24 bytes per 32 values
Returns:
Bytes of the quantised data
"""
n = len(arr)
nb = (n + QK5_0 - 1) // QK5_0
output = bytearray()
for i in range(nb):
# Get block
start = i * QK5_0
end = min(start + QK5_0, n)
block = arr[start:end]
# Pad if needed
if len(block) < QK5_0:
block = np.pad(block, (0, QK5_0 - len(block)), mode="constant")
# Calculate scale and min
vmin = block.min()
vmax = block.max()
scale = (vmax - vmin) / 31.0 if vmax > vmin else 1.0
# Quantise to 5-bit (0-31)
quantised = np.round((block - vmin) / scale).astype(np.uint8)
quantised = np.clip(quantised, 0, 31)
# Pack scale and min
output.extend(struct.pack("e", scale))
output.extend(struct.pack("e", vmin))
# Pack 5-bit values (simplified packing - not optimal but functional)
# For simplicity, use 1 byte per value (wasting 3 bits each)
# Proper implementation would pack 8 values into 5 bytes
for q in quantised:
output.append(q)
# Pad to expected size
while len(output) % 24 != 0:
output.append(0)
return bytes(output)
def _quantise_q4_0(self, arr: np.ndarray) -> bytes:
"""Quantise to Q4_0 format.
Q4_0: Blocks of 32 values with 4-bit quantisation
- 1 float16 scale (2 bytes)
- 1 float16 min value (2 bytes)
- 16 bytes of packed 4-bit values (32 values * 4 bits = 128 bits = 16 bytes)
Total: 20 bytes per 32 values
Returns:
Bytes of the quantised data
"""
n = len(arr)
nb = (n + QK4_0 - 1) // QK4_0
output = bytearray()
for i in range(nb):
# Get block
start = i * QK4_0
end = min(start + QK4_0, n)
block = arr[start:end]
# Pad if needed
if len(block) < QK4_0:
block = np.pad(block, (0, QK4_0 - len(block)), mode="constant")
# Calculate scale and min
vmin = block.min()
vmax = block.max()
scale = (vmax - vmin) / 15.0 if vmax > vmin else 1.0
# Quantise to 4-bit (0-15)
quantised = np.round((block - vmin) / scale).astype(np.uint8)
quantised = np.clip(quantised, 0, 15)
# Pack scale and min
output.extend(struct.pack("e", scale))
output.extend(struct.pack("e", vmin))
# Pack 4-bit values - 2 values per byte
for j in range(0, 32, 2):
packed = (quantised[j] & 0xF) | ((quantised[j + 1] & 0xF) << 4)
output.append(packed)
return bytes(output)
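Similarly, a decoding sketch (illustrative, not part of this commit) for the 20-byte blocks written above. Note that this layout stores a scale and a minimum, so it is closer to ggml's Q4_1 than to the reference scale-only Q4_0:

def _dequantise_q4_0_sketch(blob: bytes, n_values: int) -> np.ndarray:
    """Decode the simplified Q4_0 blocks written by _quantise_q4_0 (sketch only)."""
    block_bytes = 2 + 2 + QK4_0 // 2  # scale, min, then 16 bytes of packed nibbles
    nb = (n_values + QK4_0 - 1) // QK4_0
    out = np.empty(nb * QK4_0, dtype=np.float32)
    for i in range(nb):
        start = i * block_bytes
        scale, vmin = struct.unpack("ee", blob[start : start + 4])
        packed = blob[start + 4 : start + block_bytes]
        for j, byte in enumerate(packed):
            out[i * QK4_0 + 2 * j] = (byte & 0xF) * scale + vmin
            out[i * QK4_0 + 2 * j + 1] = ((byte >> 4) & 0xF) * scale + vmin
    return out[:n_values]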
def try_alternative_quantisation(
self,
input_path: Path,
output_path: Path,
target_type: str,
) -> bool:
"""Try basic quantisation for unsupported architectures.
For architectures not supported by llama.cpp, uses GGML implementation
to provide basic quantisation formats as fallback. Handles only basic
types that can be generated with numpy-based GGML quantisation.
Returns:
True if successful, False otherwise
"""
# Only handle basic types that we can generate with GGML
basic_types = ["Q4_0", "Q5_0", "Q6_0", "Q8_0"]
if target_type in basic_types:
logger.info(f"📝 Using GGML numpy implementation for {target_type}")
return self.quantise_basic(input_path, output_path, target_type)
# For K-quants on unsupported architectures, we can't provide a direct equivalent
logger.error(f"❌ Cannot quantise {target_type} for unsupported architecture")
logger.info("💡 Consider using Q4_0, Q5_0, Q6_0, or Q8_0 instead")
return False
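End-to-end usage sketch (illustrative, not part of this commit); the file paths are placeholders:

from pathlib import Path

from helpers.ggml import GGMLQuantiser

quantiser = GGMLQuantiser()
f16_path = Path("quantisation_work/models/example/example-F16.gguf")
q8_path = f16_path.with_name("example-Q8_0.gguf")
if quantiser.try_alternative_quantisation(f16_path, q8_path, "Q8_0"):
    print(f"Wrote {q8_path}")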

12
helpers/gguf/__init__.py Normal file
View file

@@ -0,0 +1,12 @@
"""GGUF file operations.
Provides reading, writing, and conversion utilities for GGUF format files.
"""
from __future__ import annotations
from helpers.gguf.converter import GGUFConverter
from helpers.gguf.reader import GGUFReader
from helpers.gguf.writer import GGUFWriter
__all__ = ["GGUFConverter", "GGUFReader", "GGUFWriter"]

216
helpers/gguf/converter.py Normal file
View file

@@ -0,0 +1,216 @@
"""SafeTensors to GGUF conversion.
Handles conversion of SafeTensors models to GGUF format with proper
metadata and tensor mapping.
"""
from __future__ import annotations
import gc
import json
import traceback
from pathlib import Path
from typing import TYPE_CHECKING, Any
import torch
from safetensors import safe_open
from helpers.filesystem import FilesystemService
from helpers.gguf.writer import GGUFWriter
from helpers.logger import logger
if TYPE_CHECKING:
from helpers.models.conversion import ModelConfig
from helpers.utils.tensor_mapping import TensorMapper
class GGUFConverter:
"""High-level GGUF conversion orchestrator.
Coordinates the complete conversion workflow from source models to GGUF
format, managing metadata extraction, tensor mapping, and file writing.
"""
@staticmethod
def convert_safetensors(
model_path: Path,
output_path: Path,
model_config: ModelConfig,
architecture: str,
tensor_mapper: TensorMapper,
) -> bool:
"""Convert SafeTensors model to GGUF format.
Orchestrates the conversion process including metadata setup, tensor
loading with BFloat16 support, name mapping, and tokeniser integration.
Returns:
True if conversion successful, False otherwise.
"""
logger.info(f"Converting {model_path.name} to GGUF...")
# Create writer
writer_wrapper = GGUFWriter(output_path, architecture)
# Add metadata
writer_wrapper.add_metadata(model_config, model_path.name)
# Add vision metadata if present
if model_config.vision_config:
writer_wrapper.add_vision_metadata(model_config.vision_config)
# Load and add tensors
fs = FilesystemService()
tensor_files = fs.find_safetensor_files(model_path)
logger.info(f"Found {len(tensor_files)} tensor file(s)")
tensor_count = 0
for tensor_file in tensor_files:
logger.info(f"Loading {tensor_file.name}...")
with safe_open(tensor_file, framework="pt") as f:
for tensor_name in f.keys(): # noqa: SIM118
tensor_data = f.get_tensor(tensor_name)
# Convert BFloat16 to Float32
if hasattr(tensor_data, "numpy"):
if torch and tensor_data.dtype == torch.bfloat16:
tensor_data = tensor_data.float()
numpy_data = tensor_data.numpy()
else:
# Already numpy
numpy_data = tensor_data
# Map tensor name
gguf_name = tensor_mapper.map_tensor_name(tensor_name)
if not gguf_name:
logger.debug(f"Skipping unmapped tensor: {tensor_name}")
continue
logger.debug(f" {tensor_name} -> {gguf_name}")
writer_wrapper.add_tensor(gguf_name, numpy_data)
tensor_count += 1
# Clean up memory after each file
gc.collect()
if torch and torch.cuda.is_available():
torch.cuda.empty_cache()
logger.info(f"Added {tensor_count} tensors")
# Add tokeniser
tokeniser_config = GGUFConverter.load_tokeniser_config(model_path)
if tokeniser_config:
writer_wrapper.add_tokeniser(tokeniser_config)
writer_wrapper.add_tokeniser_vocabulary(model_path)
# Finalise and write
writer_wrapper.write()
# Clean up
del writer_wrapper
gc.collect()
return output_path.exists()
@staticmethod
def convert_pytorch(
model_path: Path,
output_path: Path,
model_config: ModelConfig,
architecture: str,
tensor_mapper: TensorMapper,
) -> bool:
"""Convert PyTorch model to GGUF format.
Handles PyTorch bin file conversion with sharded model support,
BFloat16 compatibility, and proper memory management.
Returns:
True if conversion successful, False otherwise.
"""
logger.info(f"Converting {model_path.name} to GGUF...")
# Create writer
writer_wrapper = GGUFWriter(output_path, architecture)
# Add metadata
writer_wrapper.add_metadata(model_config, model_path.name)
# Load and add tensors
fs = FilesystemService()
model_files = fs.find_safetensor_files(model_path)
logger.info(f"Found {len(model_files)} model file(s)")
tensor_count = 0
for model_file in model_files:
logger.info(f"Loading {model_file.name}...")
try:
checkpoint = torch.load(model_file, map_location="cpu", weights_only=True)
for tensor_name, tensor_data in checkpoint.items():
# Convert to numpy
if hasattr(tensor_data, "numpy"):
if tensor_data.dtype == torch.bfloat16:
converted_tensor = tensor_data.float()
else:
converted_tensor = tensor_data
numpy_data = converted_tensor.numpy()
else:
numpy_data = tensor_data
# Map tensor name
gguf_name = tensor_mapper.map_tensor_name(tensor_name)
if not gguf_name:
logger.debug(f"Skipping unmapped tensor: {tensor_name}")
continue
logger.debug(f" {tensor_name} -> {gguf_name}")
writer_wrapper.add_tensor(gguf_name, numpy_data)
tensor_count += 1
# Clean up checkpoint
del checkpoint
gc.collect()
if torch.cuda.is_available():
torch.cuda.empty_cache()
except Exception as e:
logger.error(f"Failed to load {model_file.name}: {e}")
logger.error(traceback.format_exc())
return False
logger.info(f"Added {tensor_count} tensors")
# Add tokeniser
tokeniser_config = GGUFConverter.load_tokeniser_config(model_path)
if tokeniser_config:
writer_wrapper.add_tokeniser(tokeniser_config)
writer_wrapper.add_tokeniser_vocabulary(model_path)
# Finalise and write
writer_wrapper.write()
# Clean up
del writer_wrapper
gc.collect()
return output_path.exists()
@staticmethod
def load_tokeniser_config(model_path: Path) -> dict[str, Any] | None:
"""Load tokeniser configuration from model directory.
Returns:
Tokeniser configuration dictionary or None if not found.
"""
config_path = model_path / "tokenizer_config.json"
if not config_path.exists():
logger.warning("tokenizer_config.json not found")
return None
try:
with Path(config_path).open(encoding="utf-8") as f:
return json.load(f)
except Exception as e:
logger.error(f"Failed to load tokeniser config: {e}")
return None
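A high-level usage sketch (illustrative, not part of this diff). ModelConfig and TensorMapper are defined elsewhere in the repository, so their construction is assumed to happen upstream:

from __future__ import annotations

from pathlib import Path
from typing import TYPE_CHECKING

from helpers.gguf import GGUFConverter

if TYPE_CHECKING:
    from helpers.models.conversion import ModelConfig
    from helpers.utils.tensor_mapping import TensorMapper


def convert_to_gguf(
    model_dir: Path,
    out_file: Path,
    config: ModelConfig,
    architecture: str,
    mapper: TensorMapper,
) -> bool:
    """Prefer SafeTensors conversion, falling back to PyTorch .bin files."""
    if list(model_dir.glob("*.safetensors")):
        return GGUFConverter.convert_safetensors(model_dir, out_file, config, architecture, mapper)
    return GGUFConverter.convert_pytorch(model_dir, out_file, config, architecture, mapper)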

231
helpers/gguf/reader.py Normal file
View file

@@ -0,0 +1,231 @@
"""GGUF file reading operations.
Provides utilities for reading and extracting information from GGUF files.
"""
from __future__ import annotations
from typing import TYPE_CHECKING, Any
import gguf
import numpy as np
from helpers.logger import logger
if TYPE_CHECKING:
from pathlib import Path
class GGUFReader:
"""Reads and extracts information from GGUF files.
Provides methods to read metadata, architecture information, and tensors
from existing GGUF files for inspection or re-quantisation.
"""
def __init__(self, file_path: Path) -> None:
"""Initialise GGUF reader with file path.
Sets up the internal GGUF reader instance for subsequent metadata
and tensor extraction operations on the specified file.
"""
self.file_path = file_path
self.reader = gguf.GGUFReader(str(file_path))
def get_architecture(self) -> str:
"""Extract architecture string from GGUF file.
Returns:
Architecture string or "unknown" if not found.
"""
arch = self.reader.fields.get("general.architecture")
if not arch:
return "unknown"
# Try extracting from parts array format
if hasattr(arch, "parts") and arch.parts:
return self._extract_from_parts(arch)
# Try extracting from data field directly
if hasattr(arch, "data"):
return self._extract_from_data(arch.data)
return "unknown"
def _extract_from_parts(self, arch: Any) -> str:
"""Extract architecture from parts array.
Returns:
Architecture string or "unknown".
"""
if len(arch.data) == 0:
return "unknown"
# Get index and validate
idx = arch.data[0] if isinstance(arch.data, (list, tuple)) else arch.data
if idx >= len(arch.parts):
return "unknown"
return self._decode_arch_part(arch.parts[idx])
def _decode_arch_part(self, arch_part: Any) -> str:
"""Decode architecture part to string.
Returns:
Decoded architecture string.
"""
if isinstance(arch_part, bytes):
return arch_part.decode("utf-8")
if isinstance(arch_part, str):
return arch_part
if isinstance(arch_part, (list, tuple)) and len(arch_part) > 0:
# Handle nested format
if isinstance(arch_part[0], bytes):
return arch_part[0].decode("utf-8")
return str(arch_part[0])
return str(arch_part)
def _extract_from_data(self, data: Any) -> str:
"""Extract architecture from data field.
Returns:
Architecture string or "unknown".
"""
if isinstance(data, np.ndarray):
# Convert numpy array of bytes to string
try:
return bytes(data).decode("utf-8")
except (UnicodeDecodeError, ValueError):
# Fallback to ASCII conversion
return "".join(chr(c) for c in data if c < 128)
if isinstance(data, bytes):
return data.decode("utf-8")
if isinstance(data, str):
return data
return str(data)
def get_metadata(self) -> dict[str, Any]:
"""Extract all metadata from GGUF file.
Returns:
Dictionary of metadata fields and values.
"""
metadata: dict[str, Any] = {}
for key, field in self.reader.fields.items():
if field.types and field.data:
field_type = field.types[0]
field_data = field.parts[field.data[0]] if field.parts else field.data
# Convert data based on type
if field_type == gguf.GGUFValueType.STRING:
if isinstance(field_data, (list, tuple)) and field_data:
string_value = field_data[0]
if isinstance(string_value, bytes):
string_value = string_value.decode("utf-8")
metadata[key] = string_value
else:
metadata[key] = str(field_data)
elif field_type in {
gguf.GGUFValueType.UINT32,
gguf.GGUFValueType.INT32,
gguf.GGUFValueType.FLOAT32,
gguf.GGUFValueType.BOOL,
}:
metadata[key] = (
field.data[0] if isinstance(field.data, (list, tuple)) else field.data
)
elif field_type == gguf.GGUFValueType.ARRAY:
metadata[key] = list(field.data)
return metadata
def get_tensor_info(self) -> list[dict[str, Any]]:
"""Get information about all tensors in the file.
Returns:
List of tensor info dictionaries with name, shape, and type.
"""
tensor_info = []
for tensor in self.reader.tensors:
info = {
"name": tensor.name,
"shape": list(tensor.shape),
"type": tensor.tensor_type.name
if hasattr(tensor.tensor_type, "name")
else str(tensor.tensor_type),
"size_bytes": tensor.data.nbytes
if hasattr(tensor.data, "nbytes")
else len(tensor.data),
}
tensor_info.append(info)
return tensor_info
def get_quantisation_type(self) -> str | None:
"""Get the quantisation type of the GGUF file.
Returns:
Quantisation type string or None if not found.
"""
file_type = self.reader.fields.get("general.file_type")
if file_type and hasattr(file_type, "data"):
# Map numeric file type to string
file_type_value = (
file_type.data[0] if isinstance(file_type.data, (list, tuple)) else file_type.data
)
# Common file type mappings
file_type_map = {
0: "F32",
1: "F16",
2: "Q4_0",
3: "Q4_1",
7: "Q8_0",
8: "Q5_0",
9: "Q5_1",
10: "Q2_K",
11: "Q3_K_S",
12: "Q3_K_M",
13: "Q3_K_L",
14: "Q4_K_S",
15: "Q4_K_M",
16: "Q5_K_S",
17: "Q5_K_M",
18: "Q6_K",
}
return file_type_map.get(int(file_type_value), f"Unknown ({file_type_value})")
return None
def validate(self) -> bool:
"""Validate that the GGUF file is properly formatted.
Returns:
True if file is valid, False otherwise.
"""
try:
# Check basic structure
if not self.reader.fields:
logger.error("No metadata fields found")
return False
# Check for required fields
required_fields = ["general.architecture"]
for field in required_fields:
if field not in self.reader.fields:
logger.error(f"Missing required field: {field}")
return False
# Check tensors
if not self.reader.tensors:
logger.warning("No tensors found in file")
except Exception as e:
logger.error(f"Validation failed: {e}")
return False
else:
return True
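An inspection sketch (illustrative, not part of this diff); the GGUF path is a placeholder:

from pathlib import Path

from helpers.gguf import GGUFReader

reader = GGUFReader(Path("quantisation_work/models/example/example-F16.gguf"))
if reader.validate():
    print("architecture:", reader.get_architecture())
    print("quantisation:", reader.get_quantisation_type())
    for info in reader.get_tensor_info()[:5]:
        print(info["name"], info["shape"], info["type"])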

374
helpers/gguf/writer.py Normal file
View file

@@ -0,0 +1,374 @@
"""GGUF file writing operations.
Provides high-level interface for creating GGUF files with metadata,
tensors, and tokeniser information.
"""
from __future__ import annotations
import json
import operator
import traceback
from pathlib import Path
from typing import TYPE_CHECKING, Any, Protocol
import gguf
from helpers.logger import logger
if TYPE_CHECKING:
import numpy as np
from helpers.models.conversion import ModelConfig
class VisionConfig(Protocol):
"""Protocol for vision model configuration."""
hidden_size: int
num_hidden_layers: int
num_attention_heads: int
intermediate_size: int
patch_size: int
spatial_merge_size: int
class GGUFWriter:
"""Manages GGUF file creation and metadata writing.
Provides high-level interface for GGUF file operations including metadata
configuration, tensor addition, and tokeniser integration. Encapsulates
low-level GGUF library interactions for consistent error handling.
"""
def __init__(self, output_path: Path, architecture: str) -> None:
"""Initialise GGUF writer with output path and architecture.
Creates the underlying GGUF writer instance and prepares for metadata
and tensor addition. Sets up the file structure for the specified
model architecture.
"""
self.output_path = output_path
self.architecture = architecture
self.writer = gguf.GGUFWriter(str(output_path), architecture)
logger.info(f"Created GGUF writer for {architecture} architecture")
def add_metadata(self, model_config: ModelConfig, model_name: str) -> None:
"""Add comprehensive metadata from model configuration.
Writes general model information, architectural parameters, and
quantisation settings to the GGUF file header. Handles both standard
and vision model configurations with appropriate parameter mapping.
"""
# General metadata
self.writer.add_name(model_name)
self.writer.add_description(f"Converted from {model_config.architectures[0]}")
self.writer.add_file_type(gguf.LlamaFileType.ALL_F32)
# Log architecture being used
logger.info(f"Setting GGUF architecture: {self.architecture}")
if self.architecture not in {"llama", "qwen2", "gemma", "phi3", "falcon", "gpt2"}:
logger.warning(f"Architecture '{self.architecture}' may not be supported by llama.cpp")
# Model parameters from config
params = model_config.to_gguf_params()
self.writer.add_context_length(params.context_length)
self.writer.add_embedding_length(params.embedding_length)
self.writer.add_block_count(params.block_count)
self.writer.add_feed_forward_length(params.feed_forward_length)
self.writer.add_head_count(params.attention_head_count)
self.writer.add_head_count_kv(params.attention_head_count_kv)
self.writer.add_layer_norm_rms_eps(params.attention_layer_norm_rms_epsilon)
self.writer.add_rope_freq_base(params.rope_freq_base)
self.writer.add_rope_dimension_count(params.rope_dimension_count)
logger.info(f"Added metadata: {params.block_count} layers, {params.context_length} context")
def add_vision_metadata(self, vision_config: VisionConfig | None) -> None:
"""Add vision model parameters to GGUF metadata.
Configures vision-specific parameters for multimodal models including
embedding dimensions, attention heads, and spatial processing settings.
"""
if not vision_config:
return
logger.info("Adding vision model parameters...")
self.writer.add_vision_embedding_length(vision_config.hidden_size)
self.writer.add_vision_block_count(vision_config.num_hidden_layers)
self.writer.add_vision_head_count(vision_config.num_attention_heads)
self.writer.add_vision_feed_forward_length(vision_config.intermediate_size)
self.writer.add_vision_patch_size(vision_config.patch_size)
self.writer.add_vision_spatial_merge_size(vision_config.spatial_merge_size)
if hasattr(vision_config, "rms_norm_eps") and vision_config.rms_norm_eps:
self.writer.add_vision_attention_layernorm_eps(vision_config.rms_norm_eps)
def add_tokeniser(self, tokeniser_config: dict[str, Any]) -> None:
"""Add tokeniser metadata to GGUF file.
Writes special token IDs and tokeniser model type to enable proper
text processing during inference. Uses sensible defaults for missing
configuration values.
"""
self.writer.add_bos_token_id(tokeniser_config.get("bos_token_id", 1))
self.writer.add_eos_token_id(tokeniser_config.get("eos_token_id", 2))
self.writer.add_unk_token_id(tokeniser_config.get("unk_token_id", 0))
self.writer.add_pad_token_id(tokeniser_config.get("pad_token_id", 0))
# Add BOS/EOS token addition flags if available
if "add_bos_token" in tokeniser_config:
self.writer.add_add_bos_token(tokeniser_config["add_bos_token"])
if "add_eos_token" in tokeniser_config:
self.writer.add_add_eos_token(tokeniser_config["add_eos_token"])
# Note: tokenizer_model is set by add_tokeniser_vocabulary based on actual tokenizer type
logger.info("Added tokeniser configuration")
def add_tokeniser_vocabulary(self, model_path: Path) -> None:
"""Add full tokeniser vocabulary to GGUF file.
Loads and embeds the complete tokeniser vocabulary including tokens,
merges, and scores to enable standalone model usage without external
tokeniser files. Supports BPE, Unigram, and WordPiece tokenizers.
"""
tokenizer_path = model_path / "tokenizer.json"
if not tokenizer_path.exists():
logger.warning("tokenizer.json not found, skipping vocabulary embedding")
return
try:
with Path(tokenizer_path).open(encoding="utf-8") as f:
tokenizer_data = json.load(f)
model_data = tokenizer_data.get("model", {})
model_type = model_data.get("type", "")
# Get pre-tokenizer information
pre_tokenizer = tokenizer_data.get("pre_tokenizer", {})
pre_tokenizer_type = self._get_pre_tokenizer_type(pre_tokenizer)
# Get added tokens
added_tokens = tokenizer_data.get("added_tokens", [])
if model_type == "BPE":
self._add_bpe_tokenizer(model_data, added_tokens, pre_tokenizer_type)
elif model_type == "Unigram":
self._add_unigram_tokenizer(model_data, added_tokens)
elif model_type == "WordPiece":
self._add_wordpiece_tokenizer(model_data, added_tokens)
else:
logger.warning(f"Unsupported tokenizer type: {model_type}")
# Try to add as generic tokenizer
self._add_generic_tokenizer(model_data, tokenizer_data)
except Exception as e:
logger.error(f"Failed to load tokeniser vocabulary: {e}")
logger.error(traceback.format_exc())
def _get_pre_tokenizer_type(self, pre_tokenizer: dict[str, Any]) -> str:
"""Determine pre-tokenizer type from configuration.
Returns:
Pre-tokenizer type.
"""
if not pre_tokenizer:
return "default"
# Check for various pre-tokenizer types
pre_type = pre_tokenizer.get("type", "")
if "ByteLevel" in str(pre_type):
return "llama3"
if "Metaspace" in str(pre_type):
return "default"
return "default"
def _add_bpe_tokenizer(
self,
model_data: dict[str, Any],
added_tokens: list[dict[str, Any]],
pre_tokenizer_type: str,
) -> None:
"""Add BPE tokenizer to GGUF file."""
vocab = model_data.get("vocab", {})
merges = model_data.get("merges", [])
# Set tokenizer model based on pre-tokenizer type
if pre_tokenizer_type == "llama3":
self.writer.add_tokenizer_model("gpt2")
self.writer.add_tokenizer_pre("llama3")
else:
self.writer.add_tokenizer_model("gpt2")
# Create token list with scores
tokens = []
scores = []
toktypes = []
# Add vocabulary tokens
for token_str, _token_id in sorted(vocab.items(), key=operator.itemgetter(1)):
tokens.append(token_str)
scores.append(0.0) # BPE doesn't use scores
# Determine token type
is_added = any(t.get("content") == token_str for t in added_tokens)
if is_added:
toktypes.append(gguf.TokenType.USER_DEFINED)
else:
toktypes.append(gguf.TokenType.NORMAL)
# Add to writer
self.writer.add_token_list(tokens)
self.writer.add_token_scores(scores)
self.writer.add_token_types(toktypes)
# Add merges
if merges:
self.writer.add_token_merges(merges)
logger.info(f"Added BPE tokenizer: {len(tokens)} tokens, {len(merges)} merges")
def _add_unigram_tokenizer(
self,
model_data: dict[str, Any],
added_tokens: list[dict[str, Any]],
) -> None:
"""Add Unigram tokenizer to GGUF file."""
vocab = model_data.get("vocab", [])
self.writer.add_tokenizer_model("unigram")
# Create token list with scores
tokens = []
scores = []
toktypes = []
# Add vocabulary tokens
for token_data in vocab:
if isinstance(token_data, list) and len(token_data) >= 2:
token_str, score = token_data[0], token_data[1]
else:
continue
tokens.append(token_str)
scores.append(float(score))
# Determine token type
is_added = any(t.get("content") == token_str for t in added_tokens)
if is_added:
toktypes.append(gguf.TokenType.USER_DEFINED)
else:
toktypes.append(gguf.TokenType.NORMAL)
# Add to writer
self.writer.add_token_list(tokens)
self.writer.add_token_scores(scores)
self.writer.add_token_types(toktypes)
logger.info(f"Added Unigram tokenizer: {len(tokens)} tokens")
def _add_wordpiece_tokenizer(
self,
model_data: dict[str, Any],
added_tokens: list[dict[str, Any]],
) -> None:
"""Add WordPiece tokenizer to GGUF file."""
vocab = model_data.get("vocab", {})
self.writer.add_tokenizer_model("bert")
# Create token list
tokens = []
scores = []
toktypes = []
# Add vocabulary tokens
for token_str, _token_id in sorted(vocab.items(), key=operator.itemgetter(1)):
tokens.append(token_str)
scores.append(0.0) # WordPiece doesn't use scores
# Determine token type
is_added = any(t.get("content") == token_str for t in added_tokens)
if is_added:
toktypes.append(gguf.TokenType.USER_DEFINED)
else:
toktypes.append(gguf.TokenType.NORMAL)
# Add to writer
self.writer.add_token_list(tokens)
self.writer.add_token_scores(scores)
self.writer.add_token_types(toktypes)
logger.info(f"Added WordPiece tokenizer: {len(tokens)} tokens")
def _add_generic_tokenizer(
self,
model_data: dict[str, Any],
tokenizer_data: dict[str, Any],
) -> None:
"""Add generic tokenizer as fallback."""
logger.warning("Using generic tokenizer fallback")
# Try to extract vocabulary from various possible locations
vocab = model_data.get("vocab", tokenizer_data.get("vocab", {}))
if not vocab:
logger.error("No vocabulary found in tokenizer")
return
self.writer.add_tokenizer_model("gpt2") # Default to GPT-2 style
# Create basic token list
tokens = []
scores = []
toktypes = []
if isinstance(vocab, dict):
# Dict-style vocab
for token_str, _token_id in sorted(vocab.items(), key=operator.itemgetter(1)):
tokens.append(token_str)
scores.append(0.0)
toktypes.append(gguf.TokenType.NORMAL)
elif isinstance(vocab, list):
# List-style vocab
for item in vocab:
if isinstance(item, str):
tokens.append(item)
scores.append(0.0)
toktypes.append(gguf.TokenType.NORMAL)
elif isinstance(item, list) and len(item) >= 1:
tokens.append(str(item[0]))
scores.append(float(item[1]) if len(item) > 1 else 0.0)
toktypes.append(gguf.TokenType.NORMAL)
if tokens:
self.writer.add_token_list(tokens)
self.writer.add_token_scores(scores)
self.writer.add_token_types(toktypes)
logger.info(f"Added generic tokenizer: {len(tokens)} tokens")
else:
logger.error("Failed to extract tokens from vocabulary")
def add_tensor(self, name: str, data: np.ndarray) -> None:
"""Add tensor to GGUF file.
Accepts a tensor name following GGUF naming conventions and its
corresponding numpy array data. The tensor is stored for writing
when the file is finalised.
"""
self.writer.add_tensor(name, data)
def write(self) -> None:
"""Finalise and write GGUF file to disk.
Writes header, key-value data, and tensors to the output file,
completing the GGUF creation process.
"""
logger.info(f"Writing GGUF file to {self.output_path}...")
self.writer.write_header_to_file()
self.writer.write_kv_data_to_file()
self.writer.write_tensors_to_file()
self.writer.close()
logger.info("✅ GGUF file written successfully")

View file

@@ -0,0 +1,19 @@
"""HuggingFace operations and integrations.
Provides client operations, repository management, and file upload
capabilities for HuggingFace repositories.
"""
from __future__ import annotations
from helpers.huggingface.client import HuggingFaceClient
from helpers.huggingface.repository import RepositoryManager
from helpers.huggingface.uploader import FileUploader
from helpers.huggingface.wrapper import HuggingFaceUploader
__all__ = [
"FileUploader",
"HuggingFaceClient",
"HuggingFaceUploader",
"RepositoryManager",
]

View file

@@ -0,0 +1,124 @@
"""HuggingFace API client operations.
Provides basic HuggingFace API operations including authentication,
model downloads, and user information retrieval.
"""
from __future__ import annotations
import subprocess
from typing import TYPE_CHECKING
from helpers.logger import logger
if TYPE_CHECKING:
from pathlib import Path
class HuggingFaceClient:
"""Manages basic HuggingFace API operations.
Provides methods for authentication verification, model downloads,
and user information retrieval using the HuggingFace CLI.
"""
@staticmethod
def get_username() -> str:
"""Get authenticated HuggingFace username.
Retrieves the current user's HuggingFace username using the CLI.
Requires prior authentication via `huggingface-cli login`.
Returns:
HuggingFace username.
Raises:
RuntimeError: If not authenticated or CLI not available.
"""
try:
result = subprocess.run(
["huggingface-cli", "whoami"],
capture_output=True,
text=True,
check=True,
)
return result.stdout.strip()
except (subprocess.CalledProcessError, FileNotFoundError) as err:
msg = "Please log in to HuggingFace first: huggingface-cli login"
raise RuntimeError(msg) from err
@staticmethod
def download_model(
model_name: str,
output_dir: Path,
include_pattern: str | None = None,
) -> None:
"""Download model from HuggingFace.
Downloads a complete model or specific files matching a pattern.
Creates the output directory if it doesn't exist. Supports filtered
downloads for efficient bandwidth usage when only certain files are needed.
The model identifier follows HuggingFace naming conventions (e.g. "meta-llama/Llama-2-7b").
"""
logger.info(f"Downloading {model_name} to {output_dir}")
cmd = [
"huggingface-cli",
"download",
model_name,
"--local-dir",
str(output_dir),
]
if include_pattern:
cmd.extend(["--include", include_pattern])
subprocess.run(cmd, check=True, capture_output=True, text=True)
logger.info("Download complete")
@staticmethod
def check_authentication() -> bool:
"""Check if user is authenticated with HuggingFace.
Returns:
True if authenticated, False otherwise.
"""
try:
result = subprocess.run(
["huggingface-cli", "whoami"],
capture_output=True,
text=True,
check=False,
)
except FileNotFoundError:
logger.error(
"huggingface-cli not found. Please install with: pip install huggingface-hub"
)
return False
else:
return result.returncode == 0
@staticmethod
def get_model_info(model_id: str) -> dict | None:
"""Get model information from HuggingFace.
Retrieves metadata about a model from the HuggingFace Hub using the
CLI interface. Returns the model information as a dictionary if found.
Returns:
Model information dictionary or None if not found.
"""
try:
# Use huggingface-cli to get model info
result = subprocess.run(
["huggingface-cli", "model-info", model_id],
capture_output=True,
text=True,
check=True,
)
except subprocess.CalledProcessError:
logger.warning(f"Could not get info for model: {model_id}")
return None
else:
# Parse the output (this is simplified - actual implementation would parse JSON)
return {"output": result.stdout}

View file

@ -0,0 +1,167 @@
"""HuggingFace repository management.
Handles repository creation, configuration, and management operations.
"""
from __future__ import annotations
import subprocess
import time
from helpers.logger import logger
class RepositoryManager:
"""Manages HuggingFace repository operations.
Provides methods for creating repositories, checking existence,
and managing repository configuration.
"""
@staticmethod
def create_repository(
repo_id: str,
private: bool = False,
repo_type: str = "model",
) -> bool:
"""Create a new HuggingFace repository.
Creates a repository with the specified identifier and settings. Repository
identifiers follow the format "username/repo-name". Supports model, dataset,
and space repository types with configurable visibility.
Returns:
True if repository was created, False if it already exists.
"""
logger.info(f"Creating repository: {repo_id}")
cmd = [
"huggingface-cli",
"repo",
"create",
repo_id,
"--type",
repo_type,
]
if private:
cmd.append("--private")
try:
result = subprocess.run(
cmd,
capture_output=True,
text=True,
check=False,
)
if result.returncode == 0:
logger.info(f"Created repository: {repo_id}")
return True
if "already exists" in result.stderr.lower():
logger.info(f"Repository already exists: {repo_id}")
return False
logger.error(f"Failed to create repository: {result.stderr}")
except Exception as e:
logger.error(f"Error creating repository: {e}")
return False
@staticmethod
def ensure_repository_exists(repo_id: str) -> None:
"""Ensure repository exists, creating if necessary.
Attempts to create the repository if it doesn't exist, then waits
briefly to ensure the repository is ready for operations.
"""
# Try to create the repository
RepositoryManager.create_repository(repo_id)
# Small delay to ensure repository is ready
time.sleep(2)
@staticmethod
def check_repository_exists(repo_id: str) -> bool:
"""Check if a repository exists.
Queries the HuggingFace Hub to determine if a repository with the
given identifier exists and is accessible.
Returns:
True if repository exists, False otherwise.
"""
try:
result = subprocess.run(
["huggingface-cli", "repo", "ls-files", repo_id],
capture_output=True,
text=True,
check=False,
)
except Exception:
return False
else:
return result.returncode == 0
@staticmethod
def delete_repository(repo_id: str) -> bool:
"""Delete a HuggingFace repository.
Permanently removes a repository from the HuggingFace Hub. This operation
cannot be undone and requires appropriate permissions.
Returns:
True if deleted successfully, False otherwise.
"""
logger.warning(f"Deleting repository: {repo_id}")
try:
result = subprocess.run(
["huggingface-cli", "repo", "delete", repo_id, "--yes"],
capture_output=True,
text=True,
check=False,
)
if result.returncode == 0:
logger.info(f"Deleted repository: {repo_id}")
return True
logger.error(f"Failed to delete repository: {result.stderr}")
except Exception as e:
logger.error(f"Error deleting repository: {e}")
return False
else:
return False
@staticmethod
def get_repository_url(repo_id: str) -> str:
"""Get the full URL for a repository.
Constructs the complete HuggingFace Hub URL for accessing the repository
through a web browser.
Returns:
Full HuggingFace URL for the repository.
"""
return f"https://huggingface.co/{repo_id}"
@staticmethod
def set_repository_visibility(repo_id: str, private: bool) -> bool:
"""Set repository visibility (public/private).
Changes the visibility setting of an existing repository. Private repositories
require appropriate permissions and may have usage limitations.
Returns:
True if visibility changed successfully.
"""
visibility = "private" if private else "public"
logger.info(f"Setting {repo_id} visibility to {visibility}")
try:
# Note: This would require using the HuggingFace API directly
# as the CLI doesn't support changing visibility
logger.warning("Changing repository visibility requires API access")
except Exception as e:
logger.error(f"Error changing visibility: {e}")
return False
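A brief usage sketch of the manager; the repository id is a placeholder.
from helpers.huggingface.repository import RepositoryManager

# Illustrative only: the repository id is a placeholder.
repo_id = "example-user/example-model-GGUF"
RepositoryManager.ensure_repository_exists(repo_id)
if RepositoryManager.check_repository_exists(repo_id):
    print(RepositoryManager.get_repository_url(repo_id))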

View file

@ -0,0 +1,330 @@
"""HuggingFace file upload operations.
Handles uploading files to HuggingFace repositories with retry logic
and error handling.
"""
from __future__ import annotations
import shutil
import subprocess
import tempfile
import time
from pathlib import Path
from helpers.huggingface.repository import RepositoryManager
from helpers.logger import logger
class FileUploader:
"""Manages file uploads to HuggingFace repositories.
Provides methods for uploading models, READMEs, and other files
with proper error handling, retry logic, and git-based fallbacks.
"""
@staticmethod
def upload_file(
repo_id: str,
local_path: Path,
repo_path: str | None = None,
create_repo: bool = False,
) -> None:
"""Upload a file to HuggingFace repository.
Uploads a single file to the specified repository path. Can create
the repository if it doesn't exist. Uses git directly when possible
to avoid automatic PR creation. Repository identifiers follow the format
"username/repo-name". Files are uploaded to the main branch by default.
Raises:
CalledProcessError: If upload fails.
"""
repo_path = repo_path or local_path.name
logger.info(f"Uploading {local_path.name} to {repo_id}/{repo_path}")
# Try git-based upload first to avoid PR creation
if FileUploader._try_git_upload(repo_id, local_path, repo_path, create_repo=create_repo):
logger.info(f"Uploaded {repo_path} via git")
return
# Fallback to huggingface-cli
logger.info("Git upload failed, trying huggingface-cli...")
cmd = [
"huggingface-cli",
"upload",
repo_id,
str(local_path),
repo_path,
"--revision",
"main", # Explicitly push to main branch
"--commit-message",
f"Add {repo_path}",
]
if create_repo:
cmd.append("--create")
try:
subprocess.run(cmd, check=True, capture_output=True)
logger.info(f"Uploaded {repo_path}")
except subprocess.CalledProcessError:
if create_repo:
# Repository might already exist, retry without --create
cmd = cmd[:-1] # Remove --create flag
subprocess.run(cmd, check=True, capture_output=True, text=True)
logger.info(f"Updated {repo_path}")
else:
raise
@staticmethod
def _try_git_upload(
repo_id: str,
local_path: Path,
repo_path: str,
*,
create_repo: bool = False,
) -> bool:
"""Try to upload file using git directly to avoid PR creation.
Returns:
bool: True if upload successful, False if should fallback to CLI.
"""
try:
with tempfile.TemporaryDirectory() as temp_dir:
temp_path = Path(temp_dir)
repo_url = f"https://huggingface.co/{repo_id}"
# Clone repository
logger.info(f"Cloning {repo_url}...")
result = subprocess.run(
["git", "clone", repo_url, str(temp_path / "repo")],
check=False,
capture_output=True,
text=True,
)
if result.returncode != 0:
if create_repo:
# Repository doesn't exist, let huggingface-cli handle creation
return False
logger.warning(f"Clone failed: {result.stderr}")
return False
repo_dir = temp_path / "repo"
target_file = repo_dir / repo_path
# Ensure target directory exists
target_file.parent.mkdir(parents=True, exist_ok=True)
# Copy file
shutil.copy2(local_path, target_file)
# Check if there are any changes
status_result = subprocess.run(
["git", "status", "--porcelain"],
cwd=repo_dir,
capture_output=True,
text=True,
check=True,
)
if not status_result.stdout.strip():
logger.info(f"No changes detected for {repo_path}, file already up-to-date")
return True # File is already up-to-date, no need to push
# Git add, commit, push
subprocess.run(
["git", "add", repo_path],
cwd=repo_dir,
check=True,
capture_output=True,
text=True,
)
subprocess.run(
["git", "commit", "-m", f"Update {repo_path}"],
cwd=repo_dir,
check=True,
capture_output=True,
text=True,
)
subprocess.run(
["git", "push"],
cwd=repo_dir,
check=True,
capture_output=True,
text=True,
)
return True
except subprocess.CalledProcessError as e:
logger.warning(f"Git upload failed: {e}")
return False
except Exception as e:
logger.warning(f"Git upload error: {e}")
return False
@staticmethod
def upload_readme(
repo_id: str,
readme_path: Path,
ensure_repo: bool = True,
) -> None:
"""Upload or update README file to repository.
Creates repository if needed, handles existing repository updates.
The README is uploaded as README.md in the repository root and will
replace any existing README file.
Raises:
RuntimeError: If the README upload fails.
"""
logger.info("Uploading README...")
# Add delay to prevent rate limiting
time.sleep(2)
# First ensure the repository exists if requested
if ensure_repo:
RepositoryManager.ensure_repository_exists(repo_id)
# Upload without --create flag to avoid PR creation
try:
logger.debug(f"DEBUG: Uploading README to {repo_id}")
subprocess.run(
[
"huggingface-cli",
"upload",
repo_id,
str(readme_path),
"README.md",
"--commit-message",
"Update README.md",
],
check=True,
capture_output=True,
text=True,
)
logger.info("README uploaded successfully")
except subprocess.CalledProcessError as e:
# Retry with delay in case of rate limiting
if "429" in str(e.stderr):
logger.warning("Rate limited, waiting 30 seconds...")
time.sleep(30)
subprocess.run(
[
"huggingface-cli",
"upload",
repo_id,
str(readme_path),
"README.md",
"--commit-message",
"Update README.md",
],
check=True,
capture_output=True,
text=True,
)
logger.info("README uploaded successfully (after retry)")
else:
msg = f"Failed to upload README: {e.stderr}"
raise RuntimeError(msg) from e
@staticmethod
def upload_model_file(
repo_id: str,
model_path: Path,
repo_filename: str | None = None,
) -> None:
"""Upload a model file to repository.
Optimised for large model file uploads with progress tracking.
The model file is uploaded to the repository root by default or
to the specified filename if provided.
Raises:
subprocess.CalledProcessError: If the upload fails.
"""
repo_filename = repo_filename or model_path.name
logger.info(
f"Uploading model file {model_path.name} "
f"({model_path.stat().st_size / (1024**3):.1f}GB)..."
)
cmd = [
"huggingface-cli",
"upload",
repo_id,
str(model_path),
repo_filename,
"--commit-message",
f"Add {repo_filename}",
]
try:
# Run with output streaming for large files
process = subprocess.Popen(
cmd,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
text=True,
bufsize=1,
universal_newlines=True,
)
# Stream output
if process.stdout:
for line in iter(process.stdout.readline, ""):
if line and "upload" in line.lower():
logger.debug(line.strip())
process.wait()
if process.returncode != 0:
raise subprocess.CalledProcessError(process.returncode, cmd)
logger.info(f"Successfully uploaded {repo_filename}")
except subprocess.CalledProcessError as e:
logger.error(f"Failed to upload model file: {e}")
raise
@staticmethod
def upload_folder(
repo_id: str,
folder_path: Path,
path_in_repo: str = ".",
ignore_patterns: list[str] | None = None,
) -> None:
"""Upload an entire folder to repository.
Recursively uploads all files from a local folder to the repository,
preserving the directory structure. Supports ignore patterns for
selective uploads.
Raises:
subprocess.CalledProcessError: If the upload fails.
"""
logger.info(f"Uploading folder {folder_path} to {repo_id}/{path_in_repo}")
cmd = [
"huggingface-cli",
"upload",
repo_id,
str(folder_path),
path_in_repo,
"--commit-message",
f"Upload {folder_path.name}",
]
if ignore_patterns:
for pattern in ignore_patterns:
cmd.extend(["--exclude", pattern])
try:
subprocess.run(cmd, check=True, capture_output=True, text=True)
logger.info(f"Successfully uploaded folder {folder_path.name}")
except subprocess.CalledProcessError as e:
logger.error(f"Failed to upload folder: {e}")
raise
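A usage sketch covering the main upload paths; the repository id and local paths are placeholders.
from pathlib import Path

from helpers.huggingface.uploader import FileUploader

# Illustrative only: repository id and local paths are placeholders.
repo_id = "example-user/example-model-GGUF"
FileUploader.upload_readme(repo_id, Path("work/README.md"), ensure_repo=True)
FileUploader.upload_model_file(repo_id, Path("work/example-model-Q4_K_M.gguf"))
FileUploader.upload_folder(
    repo_id,
    Path("work/extras"),
    path_in_repo="extras",
    ignore_patterns=["*.tmp"],
)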

View file

@ -0,0 +1,57 @@
"""Compatibility wrapper for HuggingFace operations.
Provides a compatible interface matching the old HuggingFaceUploader
class for backward compatibility during refactoring.
"""
from __future__ import annotations
from typing import TYPE_CHECKING
from helpers.huggingface.client import HuggingFaceClient
from helpers.huggingface.repository import RepositoryManager
from helpers.huggingface.uploader import FileUploader
if TYPE_CHECKING:
from pathlib import Path
class HuggingFaceUploader:
"""Compatibility wrapper for HuggingFace operations.
Maintains the same interface as the old HuggingFaceUploader class
while using the new modular components internally.
"""
@staticmethod
def get_username() -> str:
"""Get authenticated HuggingFace username.
Returns:
HuggingFace username from CLI authentication.
"""
return HuggingFaceClient.get_username()
def upload_readme(self, output_repo: str, readme_path: Path) -> None:
"""Upload or update README file to repository.
Creates repository if needed, handles existing repository updates.
The README is uploaded to the repository root as README.md.
"""
FileUploader.upload_readme(output_repo, readme_path, ensure_repo=True)
def upload_model_file(self, output_repo: str, model_path: Path) -> None:
"""Upload model file to repository.
Uploads GGUF model file to specified repository path. The file
is uploaded with progress tracking suitable for large model files.
"""
FileUploader.upload_model_file(output_repo, model_path)
def _ensure_repo_exists(self, repo_id: str) -> None:
"""Ensure the repository exists, creating it if necessary.
Creates the repository if it doesn't exist and waits briefly
to ensure it's ready for subsequent operations.
"""
RepositoryManager.ensure_repository_exists(repo_id)
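A sketch showing that existing call sites keep working through the wrapper; the paths and repository name are placeholders.
from pathlib import Path

from helpers.huggingface.wrapper import HuggingFaceUploader

# Illustrative only: paths and repository name are placeholders.
uploader = HuggingFaceUploader()
output_repo = f"{HuggingFaceUploader.get_username()}/example-model-GGUF"
uploader.upload_readme(output_repo, Path("work/README.md"))
uploader.upload_model_file(output_repo, Path("work/example-model-Q4_K_M.gguf"))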

View file

@ -0,0 +1,20 @@
"""llama.cpp operations and binary management.
Provides interfaces to llama.cpp binaries for quantisation and
importance matrix generation.
"""
from __future__ import annotations
from helpers.llama_cpp.architecture import ArchitectureDetector
from helpers.llama_cpp.binary_manager import BinaryManager
from helpers.llama_cpp.imatrix import IMatrixGenerator, IMatrixHandler
from helpers.llama_cpp.quantiser import QuantisationExecutor
__all__ = [
"ArchitectureDetector",
"BinaryManager",
"IMatrixGenerator",
"IMatrixHandler",
"QuantisationExecutor",
]

View file

@ -0,0 +1,235 @@
"""Architecture detection and support checking.
Determines whether model architectures are supported by llama.cpp
and provides fallback strategies for unsupported architectures.
"""
from __future__ import annotations
import subprocess
from typing import TYPE_CHECKING
from helpers.logger import logger
if TYPE_CHECKING:
from pathlib import Path
class ArchitectureDetector:
"""Detects and validates model architecture support.
Checks whether model architectures are supported by llama.cpp
for K-quant generation and determines appropriate quantisation
strategies for unsupported architectures.
"""
@staticmethod
def check_architecture_support(f16_model_path: Path) -> bool:
"""Check if the model architecture is supported by llama.cpp.
Tests the model's compatibility by attempting a quantisation with
llama.cpp. Returns true if the architecture is unsupported, indicating
that K-quants should be skipped.
Returns:
True if architecture is NOT supported (K-quants should be skipped)
"""
try:
            # Try a simple quantisation with llama.cpp to check support
result = subprocess.run(
[
".cache/llm-gguf-tools/binaries/llama-quantize",
str(f16_model_path),
"/dev/null",
"Q4_K_M",
],
check=False,
capture_output=True,
text=True,
timeout=5,
)
# Check if it failed due to unknown architecture
return bool(result.stderr and "unknown model architecture" in result.stderr.lower())
except Exception:
# If we can't determine, assume it might work
return False
@staticmethod
def get_supported_architectures() -> list[str]:
"""Get list of architectures known to be supported by llama.cpp.
Returns:
List of supported architecture names.
"""
return [
"llama",
"llama2",
"llama3",
"mistral",
"mixtral",
"qwen",
"qwen2",
"gemma",
"gemma2",
"phi",
"phi2",
"phi3",
"falcon",
"gpt2",
"gptj",
"gptneox",
"mpt",
"starcoder",
"starcoder2",
"baichuan",
"bert",
"bloom",
"deepseek",
"deepseek2",
"chatglm",
"orion",
"internlm2",
"minicpm",
"stablelm",
"cohere",
"dbrx",
"olmo",
"arctic",
"rwkv",
]
@staticmethod
def map_architecture(model_type: str, arch_name: str) -> str:
"""Map model architecture to GGUF architecture string.
Translates model type and architecture names from HuggingFace config
to GGUF-compatible architecture identifiers. Handles special cases like
"gpt-oss" to "gptoss" conversion and provides fallback mapping.
Returns:
GGUF architecture string to use.
"""
# Direct mappings from model_type
type_mappings = {
"llama": "llama",
"mistral": "llama", # Mistral uses llama architecture
"mixtral": "llama",
"qwen": "qwen",
"qwen2": "qwen2",
"gemma": "gemma",
"gemma2": "gemma2",
"phi": "phi2",
"phi3": "phi3",
"phi-msft": "phi2",
"falcon": "falcon",
"gpt2": "gpt2",
"gptj": "gptj",
"gpt_neox": "gptneox",
"gpt-oss": "gptoss",
"mpt": "mpt",
"starcoder": "starcoder",
"starcoder2": "starcoder2",
"baichuan": "baichuan",
"bloom": "bloom",
"chatglm": "chatglm",
"deepseek": "llama", # DeepSeek uses llama architecture
"stablelm": "stablelm",
"cohere": "cohere",
"dbrx": "dbrx",
"olmo": "olmo",
"arctic": "arctic",
}
# Check model_type first
if model_type in type_mappings:
return type_mappings[model_type]
# Architecture name mappings as fallback
arch_mappings = {
"LlamaForCausalLM": "llama",
"MistralForCausalLM": "llama",
"MixtralForCausalLM": "llama",
"Qwen2ForCausalLM": "qwen2",
"QwenForCausalLM": "qwen",
"GemmaForCausalLM": "gemma",
"Gemma2ForCausalLM": "gemma2",
"GptOssForCausalLM": "gptoss",
"PhiForCausalLM": "phi2",
"Phi3ForCausalLM": "phi3",
"FalconForCausalLM": "falcon",
"GPT2LMHeadModel": "gpt2",
"GPTJForCausalLM": "gptj",
"GPTNeoXForCausalLM": "gptneox",
"MPTForCausalLM": "mpt",
"BloomForCausalLM": "bloom",
"ChatGLMForCausalLM": "chatglm",
"StableLmForCausalLM": "stablelm",
"CohereForCausalLM": "cohere",
}
if arch_name in arch_mappings:
return arch_mappings[arch_name]
# Default fallback
logger.warning(f"Unknown architecture: {arch_name} (type: {model_type})")
logger.warning("Defaulting to 'llama' architecture - may not work correctly")
return "llama"
@staticmethod
def get_quantisation_support(architecture: str) -> dict[str, bool]:
"""Determine which quantisation types are supported for an architecture.
Evaluates architecture compatibility with different quantisation methods.
Basic quantisations are always supported via GGML, while K-quants and
imatrix require specific llama.cpp support.
Returns:
Dictionary mapping quantisation type categories to support status.
"""
# Known unsupported architectures for K-quants
unsupported_kquants = [
"bert",
"dotsocr", # Custom/unknown architectures
]
is_supported = architecture not in unsupported_kquants
return {
"basic": True, # Q4_0, Q5_0, Q6_0, Q8_0 always supported via GGML
"k_quants": is_supported, # K-quants require llama.cpp support
"imatrix": is_supported, # imatrix requires llama.cpp support
}
@staticmethod
def filter_quantisation_types(
architecture: str,
requested_types: list[str],
) -> tuple[list[str], list[str]]:
"""Filter quantisation types based on architecture support.
Separates requested quantisation types into supported and unsupported
based on the model's architecture capabilities. Basic types are always
supported, while K-quants depend on architecture compatibility.
Returns:
Tuple of (supported_types, skipped_types).
"""
support = ArchitectureDetector.get_quantisation_support(architecture)
basic_types = {"Q4_0", "Q5_0", "Q6_0", "Q8_0"}
supported = []
skipped = []
for quant_type in requested_types:
if quant_type in basic_types:
# Basic types always supported
supported.append(quant_type)
elif support["k_quants"]:
# K-quants supported for this architecture
supported.append(quant_type)
else:
# K-quants not supported
skipped.append(quant_type)
return supported, skipped
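A sketch of the filter in action; the architecture string and requested types are illustrative, and the split follows directly from the basic-type set above.
from helpers.llama_cpp.architecture import ArchitectureDetector

# Illustrative only: architecture and requested types are placeholders.
requested = ["Q4_0", "Q4_K_M", "Q5_K_M", "Q8_0"]
supported, skipped = ArchitectureDetector.filter_quantisation_types("bert", requested)
# "bert" has no K-quant support, so this splits into:
#   supported -> ["Q4_0", "Q8_0"]
#   skipped   -> ["Q4_K_M", "Q5_K_M"]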

View file

@ -0,0 +1,494 @@
"""Binary manager for llama.cpp releases.
Downloads and manages llama.cpp binary releases from GitHub, handling
platform detection, version checking, and caching.
"""
from __future__ import annotations
import json
import os
import platform
import shutil
import subprocess
import tarfile
import time
import zipfile
from pathlib import Path
from typing import TYPE_CHECKING, ClassVar
from urllib.request import urlopen, urlretrieve
from helpers.logger import logger
if TYPE_CHECKING:
from typing import Any
class BinaryManager:
"""Manages llama.cpp binary downloads and updates.
Automatically downloads appropriate llama.cpp releases based on platform,
caches binaries locally, and checks for updates from GitHub releases.
"""
GITHUB_API = "https://api.github.com/repos/ggml-org/llama.cpp/releases/latest"
# Use local .cache directory in project
BINARY_DIR = Path(".cache") / "llm-gguf-tools" / "binaries"
# Platform mappings to release asset patterns
PLATFORM_PATTERNS: ClassVar[dict[tuple[str, str], list[str]]] = {
("Linux", "x86_64"): ["linux-x64", "ubuntu-x64", "linux-amd64"],
("Linux", "aarch64"): ["linux-arm64", "linux-aarch64"],
("Darwin", "x86_64"): ["macos-x64", "darwin-x64", "macos-amd64"],
("Darwin", "arm64"): ["macos-arm64", "darwin-arm64", "macos-aarch64"],
("Windows", "AMD64"): ["win-x64", "windows-x64", "win64"],
}
def __init__(self) -> None:
"""Initialise binary manager."""
self.BINARY_DIR.mkdir(parents=True, exist_ok=True)
self.version_file = self.BINARY_DIR / "version.json"
self.quantize_binary_path = self._get_binary_path("llama-quantize")
self.imatrix_binary_path = self._get_binary_path("llama-imatrix")
def _get_binary_path(self, base_name: str) -> Path:
"""Get path to binary.
Constructs the full path to a binary executable based on the base
name, automatically adding the appropriate file extension for the
current operating system platform.
Returns:
Path where binary should be located.
"""
binary_name = f"{base_name}.exe" if platform.system() == "Windows" else base_name
return self.BINARY_DIR / binary_name
def get_quantise_binary(self) -> Path | None:
"""Get llama-quantize binary, downloading if necessary.
Returns:
Path to binary if available, None if download fails.
"""
return self._get_binary("llama-quantize", self.quantize_binary_path)
def get_imatrix_binary(self) -> Path | None:
"""Get llama-imatrix binary, downloading if necessary.
Returns:
Path to binary if available, None if download fails.
"""
return self._get_binary("llama-imatrix", self.imatrix_binary_path)
def _get_binary(self, name: str, binary_path: Path) -> Path | None:
"""Get a specific binary, downloading if necessary.
Checks for existing binaries and downloads the latest release if
updates are needed. Falls back to existing binaries if download
fails, ensuring robust binary availability for quantisation tasks.
Returns:
Path to binary if available, None if download fails.
"""
# Check if we have a binary and if it needs updating
if self._should_update():
logger.info("🔄 Checking for llama.cpp updates...")
if not self._download_latest():
logger.warning("Failed to download latest llama.cpp release")
# Fall back to existing binary if available
if binary_path.exists():
logger.info(f"Using existing {name} binary")
return binary_path
return None
if binary_path.exists():
return binary_path
logger.info("📥 Downloading llama.cpp binaries...")
if self._download_latest():
return binary_path
return None
def _should_update(self) -> bool:
"""Check if binary needs updating.
Returns:
True if update needed, False otherwise.
"""
# If no binaries exist, we need to download
if not self.quantize_binary_path.exists() or not self.imatrix_binary_path.exists():
return True
# Check version file
if not self.version_file.exists():
return True
try:
with Path(self.version_file).open(encoding="utf-8") as f:
cached_version = json.load(f)
# Check if cached version is older than 7 days
if time.time() - cached_version.get("timestamp", 0) > 7 * 24 * 3600:
return True
except Exception:
return True
return False
def _download_latest(self) -> bool:
"""Download latest llama.cpp release.
Returns:
True if successful, False otherwise.
"""
try:
# Get latest release info
release_info = self._get_latest_release()
if not release_info:
return False
# Find appropriate asset for platform
asset_url = self._find_platform_asset(release_info["assets"])
if not asset_url:
logger.warning("No suitable binary found for this platform")
return False
# Download and extract
logger.info(f"📥 Downloading from: {asset_url}")
if not self._download_and_extract(asset_url):
return False
# Save version info
self._save_version_info(release_info)
logger.info("✅ Successfully downloaded llama.cpp binary")
except Exception as e:
logger.error(f"Failed to download llama.cpp: {e}")
return False
else:
return True
def _get_latest_release(self) -> dict[str, Any] | None:
"""Get latest release info from GitHub API.
Returns:
Release info dict or None if failed.
"""
try:
with urlopen(self.GITHUB_API) as response: # noqa: S310
return json.loads(response.read())
except Exception as e:
logger.error(f"Failed to fetch release info: {e}")
return None
def _find_platform_asset(self, assets: list[dict[str, Any]]) -> str | None:
"""Find appropriate asset for current platform.
Returns:
Download URL for appropriate asset or None.
"""
patterns = self._get_platform_patterns()
if not patterns:
return None
return self._select_best_asset(assets, patterns)
def _get_platform_patterns(self) -> list[str]:
"""Get platform patterns for current system.
Returns:
List of patterns to match in asset names.
"""
system = platform.system()
machine = platform.machine()
# Get specific patterns for this platform
patterns = self.PLATFORM_PATTERNS.get((system, machine), [])
if patterns:
return patterns
# Fall back to generic patterns
generic_patterns = {
"Linux": ["linux", "ubuntu"],
"Darwin": ["macos", "darwin"],
"Windows": ["win", "windows"],
}
return generic_patterns.get(system, [])
def _select_best_asset(self, assets: list[dict[str, Any]], patterns: list[str]) -> str | None:
"""Select the best asset from available options.
Returns:
Download URL for best matching asset or None.
"""
avoid_patterns = ["cuda", "rocm", "hip", "metal", "sycl"]
prefer_patterns = ["cpu", "vulkan", "avx2", "avx"]
best_asset = None
best_score = -1
for asset in assets:
name = asset["name"].lower()
# Skip GPU-specific builds
if any(pattern in name for pattern in avoid_patterns):
continue
# Check platform match
if not any(pattern in name for pattern in patterns):
continue
score = self._score_asset(name, patterns, prefer_patterns)
if score > best_score:
best_score = score
best_asset = asset
return best_asset["browser_download_url"] if best_asset else None
def _score_asset(self, name: str, patterns: list[str], prefer_patterns: list[str]) -> int:
"""Score an asset based on platform and preference matching.
Returns:
Numeric score for asset quality (higher is better).
"""
score = 0
# Platform match bonus
if any(pattern in name for pattern in patterns):
score += 10
# Preference bonuses
for pattern in prefer_patterns:
if pattern in name:
score += 5
# Archive format preference
system = platform.system()
if (system == "Windows" and name.endswith(".zip")) or (
system != "Windows" and name.endswith(".tar.gz")
):
score += 2
return score
def _download_and_extract(self, url: str) -> bool:
"""Download and extract binary archive.
Downloads the binary archive from the specified URL and extracts
the necessary binaries and shared libraries. Handles both ZIP and
TAR.GZ formats with appropriate platform-specific permissions.
Returns:
True if successful, False otherwise.
"""
try:
# Download to temp file
temp_file = self.BINARY_DIR / "temp_download"
logger.info("⬇️ Downloading archive...")
urlretrieve(url, temp_file) # noqa: S310
# Extract based on file type
if url.endswith(".zip"):
with zipfile.ZipFile(temp_file, "r") as zf:
self._extract_binary_from_archive(zf)
elif url.endswith((".tar.gz", ".tgz")):
with tarfile.open(temp_file, "r:gz") as tf:
self._extract_binary_from_archive(tf)
else:
logger.error(f"Unknown archive format: {url}")
return False
# Clean up temp file
temp_file.unlink()
# Make binaries executable on Unix
if platform.system() != "Windows":
self.quantize_binary_path.chmod(0o755)
self.imatrix_binary_path.chmod(0o755)
except Exception as e:
logger.error(f"Failed to download and extract: {e}")
return False
else:
return True
def _extract_binary_from_archive(self, archive: Any) -> None:
"""Extract llama binaries and their dependencies from archive."""
target_binaries = {
"llama-quantize": ["llama-quantize", "llama-quantize.exe", "quantize", "quantize.exe"],
"llama-imatrix": ["llama-imatrix", "llama-imatrix.exe", "imatrix", "imatrix.exe"],
}
# Also extract shared libraries
shared_libs = [
"libllama.so",
"libggml-base.so",
"libggml.so",
"libllama.dll",
"libggml.dll",
]
members = self._get_archive_members(archive)
extracted = self._extract_matching_binaries(archive, members, target_binaries)
self._extract_shared_libraries(archive, members, shared_libs)
self._cleanup_extracted_directories()
self._report_missing_binaries(extracted)
def _get_archive_members(self, archive: Any) -> list[str]:
"""Get list of members from archive.
Returns:
List of member names in the archive.
"""
if isinstance(archive, zipfile.ZipFile):
return archive.namelist()
return [m.name for m in archive.getmembers()]
def _extract_matching_binaries(
self,
archive: Any,
members: list[str],
target_binaries: dict[str, list[str]],
) -> set[str]:
"""Extract binaries that match target patterns.
Returns:
Set of successfully extracted binary types.
"""
extracted = set()
for member in members:
base_name = Path(member).name
for binary_type, possible_names in target_binaries.items():
if base_name in possible_names:
self._extract_single_binary(archive, member, binary_type)
extracted.add(binary_type)
break
return extracted
def _extract_single_binary(self, archive: Any, member: str, binary_type: str) -> None:
"""Extract a single binary from archive."""
logger.info(f"📦 Extracting {Path(member).name} as {binary_type}...")
target_path = self._get_binary_path(binary_type)
if isinstance(archive, zipfile.ZipFile):
self._extract_from_zip(archive, member, target_path)
else: # tarfile
self._extract_from_tar(archive, member, target_path)
def _extract_from_zip(self, archive: zipfile.ZipFile, member: str, target_path: Path) -> None:
"""Extract binary from zip archive."""
temp_path = self.BINARY_DIR / "temp_binary"
with archive.open(member) as source, temp_path.open("wb") as target:
shutil.copyfileobj(source, target)
shutil.move(str(temp_path), str(target_path))
def _extract_from_tar(self, archive: tarfile.TarFile, member: str, target_path: Path) -> None:
"""Extract binary from tar archive."""
archive.extract(member, self.BINARY_DIR)
extracted_path = self.BINARY_DIR / member
if extracted_path != target_path:
shutil.move(str(extracted_path), str(target_path))
def _cleanup_extracted_directories(self) -> None:
"""Clean up any extracted directories."""
for item in self.BINARY_DIR.iterdir():
if item.is_dir() and item.name != "binaries":
shutil.rmtree(item)
def _extract_shared_libraries(
self, archive: Any, members: list[str], lib_patterns: list[str]
) -> None:
"""Extract shared libraries needed by the binaries.
Searches through archive members to find shared libraries matching
the specified patterns and extracts them to ensure proper binary
functionality. Sets appropriate permissions on Unix systems.
"""
for member in members:
base_name = Path(member).name
if any(lib in base_name for lib in lib_patterns):
logger.info(f"📚 Extracting library: {base_name}")
target_path = self.BINARY_DIR / base_name
if isinstance(archive, zipfile.ZipFile):
temp_path = self.BINARY_DIR / "temp_lib"
with archive.open(member) as source, temp_path.open("wb") as target:
shutil.copyfileobj(source, target)
shutil.move(str(temp_path), str(target_path))
else: # tarfile
archive.extract(member, self.BINARY_DIR)
extracted_path = self.BINARY_DIR / member
if extracted_path != target_path:
shutil.move(str(extracted_path), str(target_path))
# Make libraries executable on Unix
if platform.system() != "Windows":
target_path.chmod(0o755)
def _report_missing_binaries(self, extracted: set[str]) -> None:
"""Report any missing binaries."""
if "llama-quantize" not in extracted:
logger.warning("llama-quantize binary not found in archive")
if "llama-imatrix" not in extracted:
logger.warning("llama-imatrix binary not found in archive")
def _save_version_info(self, release_info: dict[str, Any]) -> None:
"""Save version information to cache.
Stores release version, timestamp, and URL information to the local
cache to enable version checking and update determination for
future binary manager operations.
"""
version_data = {
"version": release_info.get("tag_name", "unknown"),
"timestamp": time.time(),
"url": release_info.get("html_url", ""),
}
with Path(self.version_file).open("w", encoding="utf-8") as f:
json.dump(version_data, f, indent=2)
logger.info(f"📌 Cached version: {version_data['version']}")
def check_binary_works(self, binary_path: Path | None = None) -> bool:
"""Check if the binary actually works.
Validates that the specified binary can execute properly by running
a help command with appropriate environment variables set for shared
library loading. Defaults to checking the quantise binary if no path provided.
Returns:
True if binary executes successfully, False otherwise.
"""
if binary_path is None:
binary_path = self.quantize_binary_path
if not binary_path.exists():
return False
try:
# Set LD_LIBRARY_PATH to include binary directory for shared libraries
env = os.environ.copy()
if platform.system() != "Windows":
lib_path = str(self.BINARY_DIR)
if "LD_LIBRARY_PATH" in env:
env["LD_LIBRARY_PATH"] = f"{lib_path}:{env['LD_LIBRARY_PATH']}"
else:
env["LD_LIBRARY_PATH"] = lib_path
result = subprocess.run(
[str(binary_path), "--help"],
check=False,
capture_output=True,
text=True,
timeout=5,
env=env,
)
except Exception:
return False
else:
# llama-quantize returns 1 for --help but shows usage, which means it works
return result.returncode in {0, 1} and "usage:" in result.stdout.lower()
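A short sketch of the fetch-then-validate pattern callers use; the printed messages are illustrative.
from helpers.llama_cpp.binary_manager import BinaryManager

# Illustrative only: demonstrates fetching and validating the quantise binary.
manager = BinaryManager()
quantize_binary = manager.get_quantise_binary()
if quantize_binary and manager.check_binary_works(quantize_binary):
    print(f"llama-quantize ready at {quantize_binary}")
else:
    print("llama-quantize unavailable; download it from the llama.cpp releases page")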

View file

@ -0,0 +1,322 @@
"""Importance matrix operations for llama.cpp.
Handles importance matrix generation and management for improved
quantisation quality.
"""
from __future__ import annotations
import os
import platform
import subprocess
from pathlib import Path
from typing import TYPE_CHECKING
from helpers.filesystem import FilesystemService
from helpers.llama_cpp.binary_manager import BinaryManager
from helpers.logger import logger
if TYPE_CHECKING:
from helpers.models.quantisation import ModelSource
class IMatrixHandler:
"""Handles importance matrix file management.
Manages detection and use of existing importance matrix files for
quantisation guidance.
"""
def __init__(self) -> None:
"""Initialise IMatrixHandler."""
self.fs = FilesystemService()
def find_imatrix(self, model_dir: Path) -> Path | None:
"""Find existing imatrix file in model directory.
Returns:
Path to imatrix file if found, None otherwise.
"""
imatrix_path = model_dir / "imatrix.dat"
if imatrix_path.exists():
file_size = self.fs.get_file_size(imatrix_path)
logger.info(f"Found existing imatrix: {imatrix_path.name} ({file_size})")
return imatrix_path
return None
def prompt_for_user_imatrix(self, model_dir: Path) -> Path | None:
"""Prompt user for existing imatrix file.
Returns:
Path to user-provided imatrix, or None if not available.
"""
imatrix_path = model_dir / "imatrix.dat"
logger.info(f"Model directory: {model_dir}")
logger.info(f"Looking for imatrix file at: {imatrix_path}")
logger.info(
"Tip: You can download pre-computed imatrix files from Bartowski's repositories!"
)
logger.info(
" Example: https://huggingface.co/bartowski/MODEL-NAME-GGUF/resolve/main/MODEL-NAME.imatrix"
)
response = (
input("\n❓ Do you have an imatrix file to place in the model directory? (y/N): ")
.strip()
.lower()
)
if response != "y":
return None
logger.info(f"Please place your imatrix.dat file in: {model_dir}")
input("⏳ Press Enter when you've placed the imatrix.dat file (or Ctrl+C to cancel)...")
if imatrix_path.exists():
file_size = self.fs.get_file_size(imatrix_path)
logger.info(f"Found imatrix file! ({file_size})")
return imatrix_path
logger.warning("No imatrix.dat file found - continuing without imatrix")
return None
class IMatrixGenerator:
"""Generates importance matrices for quantisation guidance.
Uses llama-imatrix binary to compute importance matrices from
calibration data, which helps preserve model quality during
quantisation by identifying critical weights.
"""
# Default calibration data location
CALIBRATION_DATA = Path("resources") / "imatrix_data.txt"
def __init__(self) -> None:
"""Initialise imatrix generator."""
self.binary_manager = BinaryManager()
self.imatrix_binary = self._get_imatrix_binary()
def _get_imatrix_binary(self) -> Path | None:
"""Get llama-imatrix binary, downloading if necessary.
Returns:
Path to binary if found, None otherwise.
"""
# First check local directory for manual placement
local_binary = Path("./llama-imatrix")
if local_binary.exists():
logger.info(f"Using local llama-imatrix binary: {local_binary}")
return local_binary
# Download from GitHub releases
binary_path = self.binary_manager.get_imatrix_binary()
if binary_path and self.binary_manager.check_binary_works(binary_path):
logger.info(f"Using llama-imatrix binary: {binary_path}")
return binary_path
logger.warning("llama-imatrix binary not available")
return None
def can_generate(self) -> bool:
"""Check if imatrix generation is available.
Returns:
True if binary and calibration data are available.
"""
return self.imatrix_binary is not None and self.CALIBRATION_DATA.exists()
def generate_imatrix(
self,
f16_model_path: Path,
output_path: Path,
calibration_data: Path | None = None,
) -> bool:
"""Generate importance matrix for a model.
Returns:
True if generation successful, False otherwise.
"""
validation_error = self._validate_generation_inputs(f16_model_path, calibration_data)
if validation_error:
logger.error(validation_error)
return False
cal_data = calibration_data or self.CALIBRATION_DATA
cmd = self._build_imatrix_command(f16_model_path, cal_data, output_path)
self._log_generation_start(f16_model_path, cal_data, output_path)
return self._execute_imatrix_generation(cmd, output_path)
def _validate_generation_inputs(
self,
f16_model_path: Path,
calibration_data: Path | None,
) -> str | None:
"""Validate inputs for imatrix generation.
Returns:
Error message if validation fails, None if valid.
"""
if not self.imatrix_binary:
return "llama-imatrix binary not available"
if not f16_model_path.exists():
return f"Model file not found: {f16_model_path}"
cal_data = calibration_data or self.CALIBRATION_DATA
if not cal_data.exists():
return f"Calibration data not found: {cal_data}"
return None
def _build_imatrix_command(
self,
f16_model_path: Path,
cal_data: Path,
output_path: Path,
) -> list[str]:
"""Build command for imatrix generation.
Returns:
Command list ready for subprocess execution.
"""
return [
str(self.imatrix_binary),
"-m",
str(f16_model_path),
"-f",
str(cal_data),
"-o",
str(output_path),
"--chunks",
"128", # Process in chunks for stability
]
def _log_generation_start(
self,
f16_model_path: Path,
cal_data: Path,
output_path: Path,
) -> None:
"""Log the start of imatrix generation."""
logger.info("🧮 Generating importance matrix...")
logger.info(f"📊 Model: {f16_model_path.name}")
logger.info(f"📝 Calibration data: {cal_data.name}")
logger.info(f"💾 Output: {output_path.name}")
def _execute_imatrix_generation(self, cmd: list[str], output_path: Path) -> bool:
"""Execute the imatrix generation process.
Returns:
True if generation completed successfully, False otherwise.
"""
# Set LD_LIBRARY_PATH for shared libraries
env = os.environ.copy()
if platform.system() != "Windows":
lib_path = str(self.binary_manager.BINARY_DIR)
if "LD_LIBRARY_PATH" in env:
env["LD_LIBRARY_PATH"] = f"{lib_path}:{env['LD_LIBRARY_PATH']}"
else:
env["LD_LIBRARY_PATH"] = lib_path
try:
process = subprocess.Popen(
cmd,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
universal_newlines=True,
bufsize=1,
env=env,
)
self._stream_process_output(process)
return self._handle_process_completion(process, output_path)
except Exception as e:
logger.error(f"❌ Imatrix generation failed: {e}")
return False
def _stream_process_output(self, process: subprocess.Popen[str]) -> None:
"""Stream output from the running process."""
while True:
if process.stdout is not None:
output = process.stdout.readline()
else:
break
if not output and process.poll() is not None:
break
if output:
# Filter progress updates for cleaner output
line = output.strip()
if line and not line.startswith("["):
logger.info(f" {line}")
def _handle_process_completion(self, process: subprocess.Popen[str], output_path: Path) -> bool:
"""Handle completion of the imatrix generation process.
Returns:
True if process completed successfully and output exists, False otherwise.
"""
return_code = process.poll()
if return_code != 0:
logger.error(f"❌ Imatrix generation failed with return code {return_code}")
return False
if not output_path.exists():
logger.error("Generation completed but output file not found")
return False
size_mb = output_path.stat().st_size / (1024 * 1024)
logger.info(f"✅ Generated imatrix: {output_path.name} ({size_mb:.1f} MB)")
return True
def prompt_for_generation(
self,
model_source: ModelSource,
model_dir: Path,
f16_model_path: Path,
) -> Path | None:
"""Prompt user to generate imatrix.
Interactively prompts the user to generate an importance matrix
for enhanced quantisation quality using the model source information,
directory, and F16 model path. Checks binary availability before prompting.
Returns:
Path to generated imatrix or None if skipped.
"""
if not self.can_generate():
logger.info("⚠️ Imatrix generation not available (missing binary or calibration data)")
return None
logger.info("\n" + "=" * 70)
logger.info("📊 Importance Matrix Generation")
logger.info("=" * 70)
logger.info(
"\nImportance matrices improve quantisation quality by identifying"
"\ncritical weights in the model. This process takes 5-10 minutes"
"\nbut significantly improves the quality of smaller quantisations."
)
logger.info(f"\nModel: {model_source.model_name}")
logger.info(f"Calibration data: {self.CALIBRATION_DATA.name}")
response = input("\n❓ Generate importance matrix? (Y/n): ").strip().lower()
if response == "n":
logger.info("Skipping imatrix generation")
return None
# Generate imatrix
output_path = model_dir / "imatrix.dat"
logger.info("\n⏳ Generating importance matrix (this may take 5-10 minutes)...")
if self.generate_imatrix(f16_model_path, output_path):
return output_path
logger.warning("Failed to generate imatrix, continuing without it")
return None
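A sketch of combining the handler and generator; the model paths are placeholders and calibration data is expected at the default location above.
from pathlib import Path

from helpers.llama_cpp.imatrix import IMatrixGenerator, IMatrixHandler

# Illustrative only: model paths are placeholders.
model_dir = Path("work/example-model")
f16_path = model_dir / "example-model-F16.gguf"

imatrix_path = IMatrixHandler().find_imatrix(model_dir)
if imatrix_path is None:
    generator = IMatrixGenerator()
    if generator.can_generate():
        generator.generate_imatrix(f16_path, model_dir / "imatrix.dat")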

View file

@ -0,0 +1,219 @@
"""Direct llama.cpp quantisation execution.
Provides direct execution of llama.cpp quantisation binary with proper
tensor-specific override support for L and XL variants.
"""
from __future__ import annotations
import os
import platform
import subprocess
from pathlib import Path
from typing import TYPE_CHECKING
from helpers.filesystem import FilesystemService
from helpers.llama_cpp.binary_manager import BinaryManager
from helpers.logger import logger
if TYPE_CHECKING:
from helpers.models.quantisation import QuantisationConfig
class QuantisationExecutor:
"""Executes llama.cpp quantisation with tensor overrides.
Provides direct binary execution with proper command-line flags for
tensor-specific overrides, supporting Bartowski-style L and XL variants.
"""
def __init__(self) -> None:
"""Initialise quantisation executor."""
self.fs = FilesystemService()
self.binary_manager = BinaryManager()
self.quantise_binary = self._get_quantise_binary()
self.last_error: str | None = None # Track last error type
def _get_quantise_binary(self) -> Path | None:
"""Get llama-quantize binary, downloading if necessary.
Returns:
Path to binary if found, None otherwise.
"""
# First check local directory for manual placement
local_binary = Path("./llama-quantize")
if local_binary.exists():
logger.info(f"Using local llama-quantize binary: {local_binary}")
return local_binary
# Download from GitHub releases
binary_path = self.binary_manager.get_quantise_binary()
if binary_path and self.binary_manager.check_binary_works(binary_path):
logger.info(f"Using llama-quantize binary: {binary_path}")
return binary_path
logger.error("Failed to obtain llama-quantize binary")
logger.info(
"You can manually download it from: https://github.com/ggml-org/llama.cpp/releases"
)
return None
def execute_quantisation(
self,
input_path: Path,
output_path: Path,
config: QuantisationConfig,
imatrix_path: Path | None = None,
) -> bool:
"""Execute quantisation using llama.cpp binary.
Builds and executes llama-quantize command with proper tensor override
flags for L and XL variants.
Returns:
True if quantisation successful, False otherwise.
"""
if not self.quantise_binary:
logger.error("llama-quantize binary not available")
return False
# Build command
cmd = self._build_quantisation_command(input_path, output_path, config, imatrix_path)
# Execute with real-time output
return self._execute_command(cmd)
def _build_quantisation_command(
self,
input_path: Path,
output_path: Path,
config: QuantisationConfig,
imatrix_path: Path | None,
) -> list[str]:
"""Build llama-quantize command with tensor overrides.
Returns:
Command arguments as list.
"""
cmd = [str(self.quantise_binary)]
# Add imatrix if available
if imatrix_path:
cmd.extend(["--imatrix", str(imatrix_path)])
# Add tensor overrides for L and XL variants
if config.output_type:
cmd.extend(["--output-tensor-type", config.output_type])
if config.embedding_type:
cmd.extend(["--token-embedding-type", config.embedding_type])
# Add input, output, and quantisation type
cmd.extend([str(input_path), str(output_path), config.base_type])
return cmd
def _setup_environment(self) -> dict[str, str]:
"""Set up environment variables for quantisation command.
Returns:
Environment dictionary with necessary library paths.
"""
env = os.environ.copy()
if platform.system() != "Windows":
lib_path = str(self.binary_manager.BINARY_DIR)
if "LD_LIBRARY_PATH" in env:
env["LD_LIBRARY_PATH"] = f"{lib_path}:{env['LD_LIBRARY_PATH']}"
else:
env["LD_LIBRARY_PATH"] = lib_path
return env
    def _process_output_stream(self, process: subprocess.Popen[str]) -> tuple[list[str], bool]:
"""Process subprocess output stream and detect errors.
Returns:
Tuple of (output_lines, architecture_error_detected).
"""
output_lines = []
architecture_error = False
if process.stdout:
for line in iter(process.stdout.readline, ""):
if line:
cleaned_line = line.rstrip()
output_lines.append(cleaned_line)
logger.info(f" {cleaned_line}")
# Check for architecture errors
if any(
error_text in cleaned_line.lower()
for error_text in [
"unknown model architecture",
"unsupported architecture",
"unknown architecture",
"architecture not supported",
"model architecture",
"llama_model_load: error loading model",
]
):
architecture_error = True
return output_lines, architecture_error
def _handle_architecture_error(self, output_lines: list[str]) -> bool:
"""Handle architecture-related errors by checking output.
Returns:
True if architecture error was detected and handled.
"""
# Look for architecture info in recent output
for line in output_lines[-10:]: # Check last 10 lines
if "architecture" in line.lower():
logger.error("❌ Architecture not supported by llama.cpp")
logger.error(" so cannot be quantised with current llama.cpp but")
logger.error(" F16 GGUF file can be used for inference if supported")
# Store this for the orchestrator to detect
self.last_error = "unsupported_architecture"
return True
return False
def _execute_command(self, cmd: list[str]) -> bool:
"""Execute command with real-time output streaming.
Returns:
True if successful, False otherwise.
"""
try:
logger.info(f"🔧 Executing: {' '.join(cmd)}")
env = self._setup_environment()
# Execute with real-time output
process = subprocess.Popen(
cmd,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
text=True,
bufsize=1,
universal_newlines=True,
env=env,
)
output_lines, architecture_error = self._process_output_stream(process)
return_code = process.poll()
if return_code == 0:
logger.info("✅ Quantisation successful!")
return True
# Check if this was an architecture error
if (architecture_error or return_code == 1) and self._handle_architecture_error(
output_lines
):
return False
logger.error(f"❌ Quantisation failed with return code {return_code}")
except Exception as e:
logger.error(f"❌ Quantisation failed with exception: {e}")
return False
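A sketch of driving the executor for an L-variant; the paths are placeholders and the config comes from the shared registry imported elsewhere in this changeset.
from pathlib import Path

from helpers.config.quantisation_configs import QUANTISATION_CONFIGS
from helpers.llama_cpp.quantiser import QuantisationExecutor
from helpers.models.quantisation import QuantisationType

# Illustrative only: input and output paths are placeholders.
executor = QuantisationExecutor()
config = QUANTISATION_CONFIGS[QuantisationType.Q4_K_L]
executor.execute_quantisation(
    Path("work/example-model-F16.gguf"),
    Path("work/example-model-Q4_K_L.gguf"),
    config,
    imatrix_path=Path("work/example-model/imatrix.dat"),
)
# For L/XL variants the built command carries the tensor-override flags
# (--token-embedding-type and/or --output-tensor-type) ahead of the usual
# <input> <output> <base type> arguments.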

View file

@ -25,38 +25,37 @@ class QuantisationType(StrEnum):
embeddings, attention layers, and feed-forward networks.
"""
# Q2 variants (smallest, lowest quality)
# Q2 variants
Q2_0 = "Q2_0" # Basic 2-bit quantisation (flat, no K-quant optimisations)
Q2_K = "Q2_K"
Q2_K_S = "Q2_K_S"
# Q3 K-quants
# Q3 variants
Q3_0 = "Q3_0" # Basic 3-bit quantisation (flat, no K-quant optimisations)
Q3_K_S = "Q3_K_S"
Q3_K_M = "Q3_K_M" # llama.cpp default: Q6_K embeddings, Q4_K output, Q5_K V/FFN-down
Q3_K_L = "Q3_K_L" # Bartowski: Upgrades output to Q5_K (from M baseline)
Q3_K_XL = "Q3_K_XL" # Bartowski: Q8_0 embeddings + Q5_K output (from M baseline)
# Q4 K-quants (most popular)
# Q4 variants
Q4_0 = "Q4_0" # Basic 4-bit quantisation (flat, no K-quant optimisations)
Q4_1 = "Q4_1"
Q4_K_S = "Q4_K_S"
Q4_K_M = "Q4_K_M" # llama.cpp default: Q6_K embeddings, Q6_K V/FFN-down
Q4_K_L = "Q4_K_L" # Bartowski: Upgrades embeddings to Q8_0 (from M baseline)
# Q5 K-quants
# Q5 variants
Q5_0 = "Q5_0" # Basic 5-bit quantisation (flat, no K-quant optimisations)
Q5_1 = "Q5_1"
Q5_K_S = "Q5_K_S"
Q5_K_M = "Q5_K_M" # llama.cpp default: Q6_K embeddings, Q6_K V/FFN-down
Q5_K_L = "Q5_K_L" # Bartowski: Upgrades embeddings to Q8_0 (from M baseline)
# Q6_K variants
# Q6 variants
Q6_0 = "Q6_0" # Basic 6-bit quantisation (flat, no K-quant optimisations)
Q6_K = "Q6_K"
Q6_K_L = "Q6_K_L" # Bartowski: Upgrades embeddings to Q8_0 (all else stays Q6_K)
# Q8_0 (highest common quantisation)
Q8_0 = "Q8_0"
# Legacy quantisation formats
Q4_0 = "Q4_0"
Q4_1 = "Q4_1"
Q5_0 = "Q5_0"
Q5_1 = "Q5_1"
# Q8 variants
Q8_0 = "Q8_0" # Basic 8-bit quantisation (flat, no K-quant optimisations)
Q8_K = "Q8_K" # K-quant 8-bit (optimised by llama.cpp)
# F16 variants
F16 = "F16" # F16 quantisation
class URLType(StrEnum):
@ -102,7 +101,12 @@ class QuantisationConfig(BaseModel):
Dictionary mapping layer types to quantisation specifications for display.
"""
# Build base quantisation string from precision
base = f"Q{self.base_precision}_K" if self.base_precision < 8 else "Q8_0"
# For basic types (Q4_0, Q5_0, Q6_0, Q8_0), use the actual base_type
# For K-quants, build from precision
if self.base_type in {"Q4_0", "Q5_0", "Q6_0", "Q8_0"}:
base = self.base_type
else:
base = f"Q{self.base_precision}_K" if self.base_precision < 8 else "Q8_0"
# Get inherent enhancements for display - inherit from base type if this is L/XL variant
enhancements = self.inherent_enhancements or {}
@ -166,10 +170,9 @@ class QuantisationConfig(BaseModel):
== layers["gate_up"]
== layers["down"]
):
if self.name == "Q6_K":
return "Q6_K all layers"
if self.name == "Q8_0":
return "Q8_0 all layers"
# For basic types and uniform K-quants, use the actual name
if self.name in {"Q4_0", "Q5_0", "Q6_0", "Q8_0", "Q6_K", "Q8_K"}:
return f"{self.name} all layers"
return f"{layers['embed']} all layers"
# Build component groups

View file

@ -0,0 +1,23 @@
"""Quantisation orchestration and workflow management.
Provides high-level orchestration of the quantisation workflow,
including execution, progress tracking, and profile management.
"""
from __future__ import annotations
from helpers.quantisation.engine import QuantisationEngine
from helpers.quantisation.executor import QuantisationExecutor
from helpers.quantisation.model_manager import ModelManager
from helpers.quantisation.orchestrator import QuantisationOrchestrator
from helpers.quantisation.profile_manager import ProfileManager
from helpers.quantisation.progress import ProgressReporter
__all__ = [
"ModelManager",
"ProfileManager",
"ProgressReporter",
"QuantisationEngine",
"QuantisationExecutor",
"QuantisationOrchestrator",
]

View file

@ -0,0 +1,141 @@
"""Quantisation engine for model processing.
Handles the actual quantisation process with configurable methods,
supporting multiple quantisation backends and fallback strategies.
"""
from __future__ import annotations
import traceback
from typing import TYPE_CHECKING
from helpers.filesystem import FilesystemService
from helpers.ggml import GGMLQuantiser
from helpers.llama_cpp import QuantisationExecutor
from helpers.logger import logger
from helpers.models.quantisation import QuantisationResult, QuantisationType
if TYPE_CHECKING:
from pathlib import Path
from helpers.models.quantisation import (
QuantisationContext,
)
class QuantisationEngine:
"""Handles the actual quantisation process with configurable methods.
Provides flexible quantisation execution supporting multiple tensor
precision configurations, importance matrices, and fallback strategies.
Uses direct llama.cpp binary execution with proper tensor overrides.
"""
def __init__(self) -> None:
"""Initialise quantisation engine."""
self.fs = FilesystemService()
self.executor = QuantisationExecutor()
self.ggml_quantiser = GGMLQuantiser()
def quantise(self, context: QuantisationContext) -> QuantisationResult:
"""Perform quantisation using the specified configuration.
Executes quantisation using direct llama.cpp binary with proper
tensor override flags for L and XL variants. Falls back to GGML
for basic types when architecture is unsupported. Processes the
quantisation context containing all required parameters and settings.
Returns:
QuantisationResult with success status and file information.
"""
logger.info(
f"⚙️ Creating {context.config.name} quantisation ({context.config.description})..."
)
output_path = context.get_output_path()
# Check input file exists and is readable
if not context.f16_model_path.exists():
error_msg = f"Input model file does not exist: {context.f16_model_path}"
logger.error(f"{error_msg}")
return QuantisationResult(
quantisation_type=QuantisationType(context.config.name),
success=False,
error_message=error_msg,
)
logger.info(f"🎯 Attempting {context.config.name} quantisation...")
logger.info(f"📝 Source: {context.f16_model_path}")
logger.info(f"📝 Target: {output_path}")
# Determine if this is a basic type that can use GGML
basic_types = ["Q4_0", "Q5_0", "Q6_0", "Q8_0"]
is_basic_type = context.config.name in basic_types
try:
# Try llama.cpp first for all types
logger.info("🔧 Using llama.cpp binary for quantisation...")
success = self.executor.execute_quantisation(
context.f16_model_path, output_path, context.config, context.imatrix_path
)
if success:
return self._create_success_result(context.config.name, output_path, "llama.cpp")
# Check if this was an architecture error and we can use GGML fallback
if (
hasattr(self.executor, "last_error")
and self.executor.last_error == "unsupported_architecture"
and is_basic_type
):
logger.info("🔄 Architecture unsupported - using GGML implementation...")
success = self.ggml_quantiser.try_alternative_quantisation(
context.f16_model_path, output_path, context.config.name
)
if success:
return self._create_success_result(
context.config.name, output_path, "GGML numpy"
)
logger.error(f"{context.config.name} quantisation failed")
return QuantisationResult(
quantisation_type=QuantisationType(context.config.name),
success=False,
error_message="Quantisation failed via Python API",
)
except Exception as e:
logger.error(f"❌ Exception during {context.config.name} quantisation: {e}")
logger.error("Exception traceback:")
for line in traceback.format_exc().splitlines():
logger.error(f" {line}")
return QuantisationResult(
quantisation_type=QuantisationType(context.config.name),
success=False,
error_message=f"Exception during quantisation: {e!s}",
)
def _create_success_result(
self, quant_type: str, output_path: Path, method_used: str
) -> QuantisationResult:
"""Create successful quantisation result with file metadata.
Constructs a successful quantisation result containing file size
information and method details. Uses the quantisation type, output
path, and method information to generate comprehensive result metadata.
Returns:
QuantisationResult with file path and size information.
"""
file_size = self.fs.get_file_size(output_path)
return QuantisationResult(
quantisation_type=QuantisationType(quant_type),
success=True,
file_path=output_path,
file_size=file_size,
method_used=method_used,
)
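
For context, a minimal usage sketch of the engine on its own, outside the orchestrator. It assumes the `helpers` package is on the import path; the URL and file paths are placeholders, not values from this change:

```python
from pathlib import Path

from helpers.config.quantisation_configs import QUANTISATION_CONFIGS
from helpers.models.quantisation import QuantisationContext, QuantisationType
from helpers.quantisation.engine import QuantisationEngine
from helpers.utils.tensor_mapping import URLParser

# Placeholder source; in the pipeline this comes from the orchestrator's URL parsing.
model_source = URLParser().parse("https://huggingface.co/acme/demo-model")

engine = QuantisationEngine()
context = QuantisationContext(
    f16_model_path=Path("work/models/demo-model/acme-demo-model-f16.gguf"),  # placeholder path
    model_source=model_source,
    config=QUANTISATION_CONFIGS[QuantisationType.Q4_K_M],
    models_dir=Path("work/models"),
    imatrix_path=None,  # importance matrix is optional
)
result = engine.quantise(context)
if result.success:
    print(f"{result.quantisation_type}: {result.file_size} via {result.method_used}")
else:
    print(result.error_message)
```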

View file

@ -0,0 +1,457 @@
"""Quantisation execution management.
Handles the execution of quantisation operations including parallel
uploads, status tracking, and error handling.
"""
from __future__ import annotations
import gc
import traceback
from concurrent.futures import ThreadPoolExecutor
from typing import TYPE_CHECKING, Any
from helpers.config.quantisation_configs import QUANTISATION_CONFIGS
from helpers.logger import logger
from helpers.models.quantisation import (
QuantisationContext,
QuantisationResult,
QuantisationType,
)
from helpers.quantisation.progress import ProgressReporter
from helpers.utils.rate_limiter import ReadmeRateLimiter
if TYPE_CHECKING:
from pathlib import Path
from helpers.filesystem import FileCleanup
from helpers.huggingface import HuggingFaceUploader
from helpers.models.quantisation import ModelSource
from helpers.quantisation.engine import QuantisationEngine
from helpers.readme import ReadmeGenerator
class QuantisationExecutor:
"""Executes quantisation operations with parallel upload support.
Manages the execution of multiple quantisations with background
uploads, status tracking, and proper error handling.
"""
def __init__(
self,
quantisation_engine: QuantisationEngine,
uploader: HuggingFaceUploader,
readme_generator: ReadmeGenerator,
file_cleanup: FileCleanup,
no_upload: bool = False,
) -> None:
"""Initialise quantisation executor.
Sets up the quantisation executor with all required service dependencies
for performing quantisations, uploading results, generating documentation,
and cleaning up temporary files. Configures upload behaviour based on settings.
"""
self.quantisation_engine = quantisation_engine
self.uploader = uploader
self.readme_generator = readme_generator
self.file_cleanup = file_cleanup
self.no_upload = no_upload
self.readme_limiter = ReadmeRateLimiter(cooldown_seconds=30.0)
self.progress_reporter = ProgressReporter()
def execute_quantisations(
self,
model_source: ModelSource,
f16_model_path: Path,
imatrix_path: Path | None,
output_repo: str,
quantisation_types: list[QuantisationType],
models_dir: Path,
) -> dict[QuantisationType, QuantisationResult]:
"""Execute all quantisation types with parallel uploads.
Orchestrates the complete quantisation workflow including F16 processing,
multiple quantisation type execution, parallel upload management, and
README generation. Handles all aspects of the quantisation pipeline
from initial setup through final documentation.
Returns:
Dictionary of quantisation results by type.
"""
results: dict[QuantisationType, QuantisationResult] = {}
# Track F16 in results if we converted from SafeTensors
if not model_source.is_gguf_repo:
results[QuantisationType.F16] = self._create_f16_result(f16_model_path)
# Process with parallel uploads
upload_futures: list[Any] = []
with ThreadPoolExecutor(max_workers=2, thread_name_prefix="upload") as upload_executor:
# Start F16 upload if applicable
if (
not model_source.is_gguf_repo
and not self.no_upload
and QuantisationType.F16 in results
):
self._start_f16_upload(
results,
model_source,
output_repo,
f16_model_path,
upload_executor,
upload_futures,
)
# Process each quantisation
for i, quant_type in enumerate(quantisation_types, 1):
# Skip if already marked as failed
if quant_type in results and results[quant_type].status == "failed":
logger.info(
f"Skipping {quant_type.value} - {results[quant_type].error_message}"
)
continue
self.progress_reporter.print_quantisation_start(
i, len(quantisation_types), quant_type.value
)
try:
result = self._process_single_quantisation(
quant_type,
model_source,
f16_model_path,
imatrix_path,
output_repo,
results,
models_dir,
upload_executor,
upload_futures,
)
results[quant_type] = result
# Force cleanup between quantisations
gc.collect()
except Exception as e:
logger.error(f"❌ Critical error processing {quant_type.value}: {e}")
logger.error("Exception traceback:")
for line in traceback.format_exc().splitlines():
logger.error(f" {line}")
results[quant_type] = QuantisationResult(
quantisation_type=quant_type,
success=False,
status="failed",
error_message=str(e),
)
# Force cleanup after error
gc.collect()
# Wait for all uploads to complete
self._wait_for_uploads(upload_futures)
# Final README update
if not self.no_upload and upload_futures:
self._final_readme_update(model_source, results, models_dir, output_repo)
return results
def _process_single_quantisation(
self,
quant_type: QuantisationType,
model_source: ModelSource,
f16_model_path: Path,
imatrix_path: Path | None,
output_repo: str,
results: dict[QuantisationType, QuantisationResult],
models_dir: Path,
upload_executor: ThreadPoolExecutor,
upload_futures: list,
) -> QuantisationResult:
"""Process a single quantisation type.
Returns:
QuantisationResult for the processed type.
"""
try:
logger.info(f"Starting {quant_type.value} quantisation...")
config = QUANTISATION_CONFIGS[quant_type]
# Create initial result and update status
result = QuantisationResult(quantisation_type=quant_type, success=False)
result.status = "processing"
results[quant_type] = result
self._update_readme_status(model_source, results, models_dir, output_repo)
# Perform quantisation
context = QuantisationContext(
f16_model_path=f16_model_path,
model_source=model_source,
config=config,
models_dir=models_dir,
imatrix_path=imatrix_path,
)
result = self.quantisation_engine.quantise(context)
# Handle result
if result.success and result.file_path:
self._start_parallel_upload(
result,
quant_type,
output_repo,
model_source,
results,
models_dir,
upload_executor,
upload_futures,
)
else:
result.status = "failed"
self._update_readme_status(model_source, results, models_dir, output_repo)
except Exception as e:
logger.error(f"Error processing {quant_type.value}: {e}")
result = QuantisationResult(quantisation_type=quant_type, success=False)
result.status = "failed"
result.error_message = str(e)
try:
self._update_readme_status(model_source, results, models_dir, output_repo)
except Exception as readme_error:
logger.error(f"Failed to update README after error: {readme_error}")
return result
def _start_parallel_upload(
self,
result: QuantisationResult,
quant_type: QuantisationType,
output_repo: str,
model_source: ModelSource,
results: dict[QuantisationType, QuantisationResult],
models_dir: Path,
upload_executor: ThreadPoolExecutor,
upload_futures: list,
) -> None:
"""Start parallel upload of quantisation result."""
if self.no_upload or not result.file_path:
return
quant_str = getattr(result.quantisation_type, "value", result.quantisation_type)
logger.info(f"Starting parallel upload of {quant_str}...")
upload_future = upload_executor.submit(
self._upload_and_cleanup,
output_repo,
result.file_path,
quant_type,
model_source,
results,
models_dir,
)
upload_futures.append(upload_future)
result.file_path = None # Mark as being uploaded
result.status = "uploading"
self._update_readme_status(model_source, results, models_dir, output_repo)
def _upload_and_cleanup(
self,
output_repo: str,
file_path: Path,
quant_type: QuantisationType,
model_source: ModelSource,
results: dict[QuantisationType, QuantisationResult],
models_dir: Path,
) -> None:
"""Upload file and clean up (runs in background thread)."""
try:
logger.info(f"[PARALLEL] Starting upload of {quant_type.value} ({file_path.name})")
self.uploader.upload_model_file(output_repo, file_path)
logger.info(f"[PARALLEL] Upload of {quant_type.value} completed successfully")
self.file_cleanup.cleanup_quantisation_file(file_path)
results[quant_type].status = "completed"
updated_readme_path = self.readme_generator.generate(
model_source, results, models_dir, output_repo
)
self.uploader.upload_readme(output_repo, updated_readme_path)
logger.info(f"[PARALLEL] {quant_type} upload and cleanup complete")
except Exception as e:
logger.error(f"[PARALLEL] Failed to upload {quant_type}: {e}")
results[quant_type].status = "failed"
results[quant_type].error_message = str(e)
try:
updated_readme_path = self.readme_generator.generate(
model_source, results, models_dir, output_repo
)
self.uploader.upload_readme(output_repo, updated_readme_path)
except Exception as readme_error:
logger.error(
f"[PARALLEL] Failed to update README after upload error: {readme_error}"
)
def _start_f16_upload(
self,
results: dict[QuantisationType, QuantisationResult],
model_source: ModelSource,
output_repo: str,
f16_model_path: Path,
upload_executor: ThreadPoolExecutor,
upload_futures: list,
) -> None:
"""Start F16 upload in background."""
f16_result = results[QuantisationType.F16]
if f16_result.file_path and f16_result.file_path.exists():
logger.info("Starting parallel upload of F16 GGUF...")
f16_result.status = "uploading"
self._update_readme_status(
model_source, results, f16_model_path.parent.parent, output_repo
)
upload_future = upload_executor.submit(
self._upload_f16_and_cleanup,
output_repo,
f16_result.file_path,
model_source,
results,
f16_model_path.parent.parent,
)
upload_futures.append(upload_future)
def _upload_f16_and_cleanup(
self,
output_repo: str,
file_path: Path,
model_source: ModelSource,
results: dict[QuantisationType, QuantisationResult],
models_dir: Path,
) -> None:
"""Upload F16 file and update status (runs in background thread)."""
try:
logger.info(f"[PARALLEL] Starting upload of F16 GGUF ({file_path.name})")
self.uploader.upload_model_file(output_repo, file_path)
logger.info("[PARALLEL] Upload of F16 GGUF completed successfully")
# Don't delete F16 yet - still needed for quantisations
results[QuantisationType.F16].status = "completed"
updated_readme_path = self.readme_generator.generate(
model_source, results, models_dir, output_repo
)
self.uploader.upload_readme(output_repo, updated_readme_path)
logger.info("[PARALLEL] F16 upload complete")
except Exception as e:
logger.error(f"[PARALLEL] Failed to upload F16: {e}")
results[QuantisationType.F16].status = "failed"
results[QuantisationType.F16].error_message = str(e)
try:
updated_readme_path = self.readme_generator.generate(
model_source, results, models_dir, output_repo
)
self.uploader.upload_readme(output_repo, updated_readme_path)
except Exception as readme_error:
logger.error(
f"[PARALLEL] Failed to update README after F16 upload error: {readme_error}"
)
def _create_f16_result(self, f16_model_path: Path) -> QuantisationResult:
"""Create a result object for F16 tracking.
Returns:
QuantisationResult object for F16 tracking.
"""
f16_size = "-"
if f16_model_path.exists():
size_bytes = f16_model_path.stat().st_size
size_gb = size_bytes / (1024**3)
f16_size = f"{size_gb:.1f}GB"
# Create a simple result object for F16 tracking
return type(
"F16Result",
(),
{
"quantisation_type": "F16",
"success": True,
"status": "planned",
"file_path": f16_model_path,
"file_size": f16_size,
},
)()
def _wait_for_uploads(self, upload_futures: list) -> None:
"""Wait for all parallel uploads to complete."""
if not upload_futures:
return
logger.info(f"Waiting for {len(upload_futures)} uploads to complete...")
completed = 0
failed = 0
for future in upload_futures:
try:
future.result(timeout=300) # 5 minute timeout per upload
completed += 1
logger.info(f"Upload progress: {completed}/{len(upload_futures)} completed")
except Exception as e:
failed += 1
logger.warning(f"Upload error ({completed + failed}/{len(upload_futures)}): {e}")
self.progress_reporter.print_upload_summary(completed, failed)
def _update_readme_status(
self,
model_source: ModelSource,
results: dict[QuantisationType, QuantisationResult],
models_dir: Path,
output_repo: str,
) -> None:
"""Update README with current quantisation status using rate limiting."""
if not self.no_upload:
# Use rate limiter to batch updates
self.readme_limiter.request_update(
self._do_readme_update,
model_source,
results,
models_dir,
output_repo,
)
def _do_readme_update(
self,
model_source: ModelSource,
results: dict[QuantisationType, QuantisationResult],
models_dir: Path,
output_repo: str,
) -> None:
"""Actually perform the README update (called by rate limiter)."""
updated_readme_path = self.readme_generator.generate(
model_source, results, models_dir, output_repo
)
self.uploader.upload_readme(output_repo, updated_readme_path)
def _final_readme_update(
self,
model_source: ModelSource,
results: dict[QuantisationType, QuantisationResult],
models_dir: Path,
output_repo: str,
) -> None:
"""Perform final README update after all operations."""
logger.info("Updating README with final status...")
final_readme = self.readme_generator.generate(
model_source, results, models_dir, output_repo
)
self.uploader.upload_readme(output_repo, final_readme)
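
The concurrency model above is deliberately simple: quantisation stays on the main thread, uploads go to a two-worker pool, and the main thread only blocks when it finally collects the futures. A stripped-down, self-contained sketch of that pattern (illustrative only, not the class itself):

```python
from concurrent.futures import ThreadPoolExecutor


def upload(artefact: str) -> str:
    # Stand-in for uploader.upload_model_file(); the real call pushes to HuggingFace.
    return f"uploaded {artefact}"


quantisations = ["Q4_K_M", "Q5_K_M", "Q6_K"]
futures = []
with ThreadPoolExecutor(max_workers=2, thread_name_prefix="upload") as pool:
    for quant in quantisations:
        artefact = f"model-{quant}.gguf"  # produced by the (slow) quantisation step
        futures.append(pool.submit(upload, artefact))  # hand off to a background worker
    # The main thread is free to start the next quantisation; block only at the end.
    for future in futures:
        print(future.result(timeout=300))  # mirrors the 5-minute per-upload timeout
```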

View file

@ -0,0 +1,422 @@
"""Model acquisition and preparation management.
Handles model downloading from HuggingFace and preparation for quantisation,
including format detection and conversion.
"""
from __future__ import annotations
import shutil
import subprocess
import traceback
from typing import TYPE_CHECKING
from helpers.filesystem import FilesystemService
from helpers.gguf import GGUFConverter
from helpers.logger import logger
from helpers.models.quantisation import ModelSource
from helpers.utils.config_parser import ConfigParser
from helpers.utils.tensor_mapping import TensorMapper
if TYPE_CHECKING:
from pathlib import Path
class ModelManager:
"""Handles model downloading and preparation for quantisation.
Manages both GGUF repository downloads and HuggingFace model conversions,
providing unified interface for model acquisition and preparation.
"""
def __init__(self, models_dir: Path) -> None:
"""Initialise model manager with storage configuration.
Creates a new model manager instance that will handle model downloading,
format detection, and preparation for quantisation workflows using the
specified directory as the base storage location.
"""
self.models_dir = models_dir
self.fs = FilesystemService()
def prepare_model(self, model_source: ModelSource) -> Path:
"""Prepare model for quantisation and return F16 model path.
Handles both GGUF repository downloads and regular HuggingFace model
conversion workflows with automatic format detection. Processes the
provided model source information to determine the optimal acquisition
strategy and ensures the model is in F16 GGUF format.
Returns:
Path to F16 GGUF model ready for quantisation.
"""
model_dir = self.models_dir / model_source.model_name
if model_source.is_gguf_repo:
return self._handle_gguf_repo(model_source, model_dir)
return self._handle_regular_repo(model_source, model_dir)
def _handle_gguf_repo(self, model_source: ModelSource, model_dir: Path) -> Path:
"""Handle GGUF repository download with pattern matching.
Downloads GGUF files matching specified patterns, prioritising
multi-part files and F16 variants. Uses the model source information
and target directory to efficiently locate and download appropriate
GGUF files from HuggingFace repositories.
Returns:
Path to downloaded or existing GGUF file.
"""
logger.info(f"⬇️ Downloading GGUF file from repository: {model_source.source_model}")
logger.info(f"🔍 Looking for file pattern: *{model_source.gguf_file_pattern}*")
f16_model = model_dir / f"{model_source.original_author}-{model_source.model_name}-f16.gguf"
if f16_model.exists():
logger.info(f"✅ Found existing F16 file: {f16_model.name}")
return f16_model
# Check for existing GGUF files
model_dir.mkdir(parents=True, exist_ok=True)
existing_gguf = self.fs.find_gguf_files(model_dir)
if existing_gguf:
logger.info(f"✅ Found existing GGUF file: {existing_gguf[0].name}")
return existing_gguf[0]
# Download with patterns
downloaded_file = self._download_gguf_with_patterns(
model_source.source_model, model_source.gguf_file_pattern, model_dir
)
if downloaded_file:
# Handle multi-part files
if "00001-of-" in downloaded_file.name:
return downloaded_file
if "-00002-of-" in downloaded_file.name or "-00003-of-" in downloaded_file.name:
base_name = downloaded_file.name.replace("-00002-of-", "-00001-of-").replace(
"-00003-of-", "-00001-of-"
)
first_part = downloaded_file.parent / base_name
if first_part.exists():
logger.info(f"🔄 Using first part: {first_part.name}")
return first_part
# Rename single file to standard name
downloaded_file.rename(f16_model)
return f16_model
# Fallback to regular conversion
logger.info("💡 Falling back to downloading full repository and converting...")
return self._handle_regular_repo(
ModelSource(**{**model_source.dict(), "is_gguf_repo": False}),
model_dir,
)
def _download_gguf_with_patterns(
self, source_model: str, pattern: str | None, model_dir: Path
) -> Path | None:
"""Download GGUF file using various pattern strategies.
Tries multiple pattern variations to find and download appropriate
GGUF files, handling timeouts and temporary directories. Uses the
HuggingFace model identifier with an optional pattern to search for
specific files and downloads them to the target directory.
Returns:
Path to downloaded file, or None if all patterns fail.
"""
if pattern:
patterns = [
f"*{pattern}*",
f"*{pattern.lower()}*",
f"*{pattern.upper()}*",
"*f16*",
"*F16*",
"*fp16*",
]
else:
patterns = ["*f16*", "*F16*", "*fp16*"]
temp_dir = model_dir / "gguf_temp"
for search_pattern in patterns:
logger.info(f"🔍 Trying pattern: {search_pattern}")
temp_dir.mkdir(exist_ok=True)
try:
logger.debug(
f"DEBUG: Running huggingface-cli download for pattern {search_pattern}"
)
result = subprocess.run(
[
"timeout",
"300",
"huggingface-cli",
"download",
source_model,
"--include",
search_pattern,
"--local-dir",
str(temp_dir),
],
check=True,
capture_output=True,
text=True,
)
logger.debug(
f"DEBUG: Download command completed with return code {result.returncode}"
)
# Find downloaded GGUF files
gguf_files = self.fs.find_gguf_files(temp_dir, pattern)
if gguf_files:
found_file = gguf_files[0]
logger.info(f"✅ Found GGUF file: {found_file.name}")
# Move to parent directory
final_path = model_dir / found_file.name
shutil.move(str(found_file), str(final_path))
shutil.rmtree(temp_dir)
return final_path
except subprocess.CalledProcessError as e:
logger.debug(
f"DEBUG: Pattern {search_pattern} failed with return code {e.returncode}"
)
if e.stderr:
logger.debug(f"DEBUG: stderr: {e.stderr}")
if e.stdout:
logger.debug(f"DEBUG: stdout: {e.stdout}")
logger.info(f"⚠️ Pattern {search_pattern} failed or timed out")
continue
except Exception as e:
logger.error(f"❌ Unexpected error during download: {e}")
logger.error("Exception traceback:")
for line in traceback.format_exc().splitlines():
logger.error(f" {line}")
continue
finally:
if temp_dir.exists():
shutil.rmtree(temp_dir, ignore_errors=True)
return None
def _handle_regular_repo(
self,
model_source: ModelSource,
model_dir: Path,
) -> Path:
"""Handle regular HuggingFace repository conversion.
Downloads full model repository and converts to F16 GGUF format
using our native Python-based GGUFConverter for SafeTensors models.
Processes the model source information and uses the local directory
for storage during the download and conversion workflow.
Returns:
Path to converted F16 GGUF model.
"""
logger.info(f"⬇️ Downloading source model: {model_source.source_model}")
# Download model if needed
if not model_dir.exists():
self._download_repository(model_source.source_model, model_dir)
else:
logger.info("✅ Model already downloaded")
# Convert to GGUF
return self._convert_to_gguf(model_source, model_dir)
def _setup_download_directories(self, model_dir: Path) -> None:
"""Set up directories for model download.
Creates the necessary directory structure for model downloads,
including the base model directory and HuggingFace metadata
directory to ensure proper organisation of downloaded assets.
"""
model_dir.mkdir(parents=True, exist_ok=True)
huggingface_dir = model_dir / ".huggingface"
huggingface_dir.mkdir(parents=True, exist_ok=True)
def _create_download_process(self, source_model: str, model_dir: Path) -> subprocess.Popen:
"""Create subprocess for downloading repository.
Initiates a HuggingFace CLI download process for the specified model
identifier, configuring it to download to the local directory whilst
excluding existing GGUF files to avoid conflicts.
Returns:
Subprocess for downloading.
"""
return subprocess.Popen(
[
"huggingface-cli",
"download",
source_model,
"--local-dir",
str(model_dir),
"--exclude",
"*.gguf",
],
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
text=True,
bufsize=1, # Line buffered
universal_newlines=True,
)
def _stream_download_output(self, process: subprocess.Popen) -> None:
"""Stream download process output with appropriate logging levels.
Monitors the download subprocess output and routes progress information
to appropriate log levels, providing real-time feedback on download
progress whilst filtering debug information appropriately.
"""
if process.stdout:
for line in process.stdout:
# Log download progress lines
if line.strip():
# Check if it's a progress line (contains %)
if "%" in line or "Downloading" in line or "Fetching" in line:
# Use info level for progress lines
logger.info(f" {line.strip()}")
else:
# Use debug for other output
logger.debug(f" {line.strip()}")
def _handle_download_errors(self, source_model: str, e: Exception) -> None:
"""Handle download errors with detailed logging.
Processes download exceptions for the specified model, providing
comprehensive error logging including return codes, stderr, and
stdout information to aid in debugging download failures.
Raises:
TypeError: Always raised with appropriate error message.
"""
if isinstance(e, subprocess.CalledProcessError):
logger.error(f"❌ Failed to download repository {source_model}")
logger.error(f"Return code: {e.returncode}")
if e.stderr:
logger.error(f"stderr: {e.stderr}")
if e.stdout:
logger.error(f"stdout: {e.stdout}")
msg = f"Repository download failed: {e}"
raise RuntimeError(msg) from e
logger.error(f"❌ Unexpected error during repository download: {e}")
logger.error("Exception traceback:")
for line in traceback.format_exc().splitlines():
logger.error(f" {line}")
msg = f"Repository download failed: {e}"
raise RuntimeError(msg) from e
def _download_repository(self, source_model: str, model_dir: Path) -> None:
"""Download HuggingFace repository.
Orchestrates the complete repository download workflow for the
specified HuggingFace model, managing directory setup, process
execution, and error handling to ensure robust model acquisition.
Raises:
RuntimeError: If download fails.
"""
self._setup_download_directories(model_dir)
try:
logger.info(f"⬇️ Downloading full repository: {source_model}")
logger.info("📊 Progress will be shown below...")
process = self._create_download_process(source_model, model_dir)
self._stream_download_output(process)
# Wait for process to complete
return_code = process.wait()
if return_code != 0:
msg = f"Repository download failed with return code {return_code}"
raise RuntimeError(msg)
logger.info("✅ Repository download completed successfully")
except Exception as e:
self._handle_download_errors(source_model, e)
def _convert_to_gguf(self, model_source: ModelSource, model_dir: Path) -> Path:
"""Convert model to GGUF F16 format.
Converts SafeTensors models to GGUF F16 format using our native
Python converter. Processes model source information and the
directory containing downloaded model files, handling architecture
detection and tensor mapping for optimal compatibility.
Returns:
Path to F16 GGUF model.
Raises:
RuntimeError: If conversion fails.
"""
logger.info("🔄 Converting to GGUF F16 format...")
f16_model = model_dir / f"{model_source.original_author}-{model_source.model_name}-f16.gguf"
if f16_model.exists():
logger.info("✅ F16 model already exists")
return f16_model
# Check for SafeTensors files
safetensor_files = list(model_dir.glob("*.safetensors"))
if not safetensor_files:
logger.error("❌ Model format not supported")
logger.info("💡 This tool supports GGUF and SafeTensors formats")
msg = "Model must be in GGUF or SafeTensors format"
raise RuntimeError(msg)
logger.info("🐍 Using native Python GGUFConverter...")
logger.info(f"✅ Found {len(safetensor_files)} SafeTensors files")
# Load model configuration
config_parser = ConfigParser()
model_config = config_parser.load_model_config(model_dir)
# Get architecture mapping
arch_name = model_config.architectures[0] if model_config.architectures else "llama"
arch = config_parser.get_architecture_mapping(arch_name)
if arch != arch_name:
logger.info(f"📝 Architecture mapping: {arch_name}{arch}")
# Check if architecture is supported by llama.cpp
supported_archs = {
"llama",
"qwen2",
"gemma",
"phi3",
"falcon",
"gpt2",
"gptj",
"gptneox",
"mpt",
"baichuan",
"stablelm",
}
if arch not in supported_archs:
logger.warning("=" * 70)
logger.warning(f"⚠️ Architecture '{arch_name}' may not be supported by llama.cpp")
logger.warning(f"⚠️ The GGUF will be created with architecture: '{arch}'")
logger.warning("⚠️ Check if your inference software supports this architecture.")
logger.warning("=" * 70)
# Convert using GGUFConverter
tensor_mapper = TensorMapper()
success = GGUFConverter.convert_safetensors(
model_dir, f16_model, model_config, arch, tensor_mapper
)
if not success:
logger.error("❌ Native Python conversion failed")
msg = "Failed to convert SafeTensors model to GGUF"
raise RuntimeError(msg)
logger.info("✅ Native Python conversion successful")
return f16_model
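
Driving the manager directly takes nothing more than a parsed source and a storage directory; it decides for itself whether to fetch ready-made GGUF files or download SafeTensors and convert them. A hedged sketch, with a placeholder URL and directory:

```python
from pathlib import Path

from helpers.quantisation.model_manager import ModelManager
from helpers.utils.tensor_mapping import URLParser

model_source = URLParser().parse("https://huggingface.co/acme/demo-model")  # placeholder
manager = ModelManager(models_dir=Path("quantisation_work/models"))  # placeholder directory

# Downloads the repository (or GGUF files) and returns an F16 GGUF ready for quantisation.
f16_path = manager.prepare_model(model_source)
print(f16_path)
```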

View file

@ -0,0 +1,229 @@
"""Main quantisation orchestrator.
Provides the high-level orchestration of the complete quantisation
workflow, coordinating between various services and modules.
"""
from __future__ import annotations
import signal
import sys
import traceback
from dataclasses import dataclass, field
from pathlib import Path
from typing import TYPE_CHECKING
from helpers.filesystem import FileCleanup, WorkspaceManager
from helpers.huggingface import HuggingFaceUploader
from helpers.llama_cpp import IMatrixGenerator, IMatrixHandler
from helpers.logger import logger
from helpers.models.quantisation import QuantisationResult, QuantisationType
from helpers.quantisation.engine import QuantisationEngine
from helpers.quantisation.executor import QuantisationExecutor
from helpers.quantisation.model_manager import ModelManager
from helpers.quantisation.profile_manager import ProfileManager
from helpers.quantisation.progress import ProgressReporter
from helpers.readme import ReadmeGenerator
from helpers.utils.rate_limiter import ReadmeRateLimiter
from helpers.utils.tensor_mapping import URLParser
if TYPE_CHECKING:
from types import FrameType
from helpers.models.quantisation import ModelSource
@dataclass(slots=True)
class QuantisationOrchestrator:
"""Orchestrates the complete quantisation workflow.
Thin coordinator that delegates to specialised services for
each aspect of the quantisation workflow.
"""
work_dir: Path = field(default_factory=lambda: Path.cwd() / "quantisation_work")
use_imatrix: bool = True
no_upload: bool = False
custom_profiles: list[str] | None = None
# Service dependencies
url_parser: URLParser = field(default_factory=URLParser)
workspace_manager: WorkspaceManager = field(init=False)
model_manager: ModelManager = field(init=False)
profile_manager: ProfileManager = field(default_factory=ProfileManager)
progress_reporter: ProgressReporter = field(default_factory=ProgressReporter)
quantisation_executor: QuantisationExecutor = field(init=False)
imatrix_handler: IMatrixHandler = field(default_factory=IMatrixHandler)
imatrix_generator: IMatrixGenerator = field(default_factory=IMatrixGenerator)
readme_generator: ReadmeGenerator = field(default_factory=ReadmeGenerator)
uploader: HuggingFaceUploader = field(default_factory=HuggingFaceUploader)
file_cleanup: FileCleanup = field(default_factory=FileCleanup)
readme_limiter: ReadmeRateLimiter = field(init=False)
def __post_init__(self) -> None:
"""Initialise computed properties after dataclass construction."""
self.workspace_manager = WorkspaceManager(self.work_dir)
self.model_manager = ModelManager(self.workspace_manager.models_dir)
self.readme_limiter = ReadmeRateLimiter(cooldown_seconds=30.0)
# Create executor with dependencies
self.quantisation_executor = QuantisationExecutor(
quantisation_engine=QuantisationEngine(),
uploader=self.uploader,
readme_generator=self.readme_generator,
file_cleanup=self.file_cleanup,
no_upload=self.no_upload,
)
# Set up signal handlers
self._setup_signal_handlers()
def _setup_signal_handlers(self) -> None:
"""Set up signal handlers to catch unexpected exits."""
def signal_handler(signum: int, frame: FrameType | None) -> None:
logger.error(f"❌ Received signal {signum} ({signal.Signals(signum).name})")
logger.error("Stack trace at signal:")
if frame:
for line in traceback.format_stack(frame):
logger.error(f" {line.strip()}")
logger.error("Exiting due to signal")
sys.exit(1)
# Handle common termination signals
for sig in [signal.SIGINT, signal.SIGTERM]:
signal.signal(sig, signal_handler)
def quantise(self, url: str) -> dict[QuantisationType, QuantisationResult]:
"""Main quantisation workflow orchestrating model processing from URL to upload.
Coordinates the complete quantisation process from URL parsing through
model downloading, quantisation execution, and upload to HuggingFace.
Handles architecture compatibility and provides comprehensive error handling.
Returns:
Dictionary of quantisation results by type.
Raises:
KeyboardInterrupt: If the user interrupts the quantisation process.
"""
logger.info("Starting Bartowski quantisation process...")
logger.debug(f"DEBUG: Input URL: {url}")
logger.debug(f"DEBUG: Working directory: {self.work_dir}")
logger.debug(f"DEBUG: Use imatrix: {self.use_imatrix}")
logger.debug(f"DEBUG: No upload: {self.no_upload}")
logger.debug(f"DEBUG: Custom profiles: {self.custom_profiles}")
try:
# Setup and preparation
model_source, f16_model_path, imatrix_path, output_repo = self._setup_environment(url)
# Create initial repository
self._create_initial_repository(model_source, output_repo)
# Get quantisation types
quantisation_types = self.profile_manager.get_quantisation_types(self.custom_profiles)
# Filter by architecture if needed
supported_types, unsupported_types = self.profile_manager.filter_by_architecture(
quantisation_types, f16_model_path
)
# Pre-mark unsupported types
results: dict[QuantisationType, QuantisationResult] = {}
for quant_type in unsupported_types:
results[quant_type] = QuantisationResult(
quantisation_type=quant_type,
success=False,
status="failed",
error_message="Architecture not supported by llama.cpp - K-quants skipped",
)
# Execute quantisations
execution_results = self.quantisation_executor.execute_quantisations(
model_source,
f16_model_path,
imatrix_path,
output_repo,
supported_types,
self.workspace_manager.models_dir,
)
results.update(execution_results)
# Cleanup
self.file_cleanup.cleanup_files(
f16_model_path, model_source, self.workspace_manager.models_dir
)
# Print summary
self.progress_reporter.print_completion_summary(model_source, results, output_repo)
except KeyboardInterrupt:
logger.error("❌ Process interrupted by user (Ctrl+C)")
raise
except Exception as e:
logger.error(f"❌ Critical error in quantisation workflow: {e}")
logger.error("Full traceback:")
for line in traceback.format_exc().splitlines():
logger.error(f" {line}")
raise
finally:
# Always flush pending README updates before exiting
self.readme_limiter.flush()
return results
def _setup_environment(self, url: str) -> tuple[ModelSource, Path, Path | None, str]:
"""Setup environment and prepare model for quantisation.
Returns:
Tuple of (model_source, f16_model_path, imatrix_path, output_repo).
"""
model_source = self.url_parser.parse(url)
self.progress_reporter.print_model_info(
model_source, self.uploader.get_username(), str(self.work_dir)
)
f16_model_path = self.model_manager.prepare_model(model_source)
output_repo = (
f"{self.uploader.get_username()}/"
f"{model_source.original_author}-{model_source.model_name}-GGUF"
)
imatrix_path = None
if self.use_imatrix:
logger.info("Checking for importance matrix (imatrix)...")
model_dir = self.workspace_manager.get_model_dir(model_source.model_name)
imatrix_path = self.imatrix_handler.find_imatrix(model_dir)
# If no imatrix found, offer to generate or provide one
if not imatrix_path:
# First offer to generate
imatrix_path = self.imatrix_generator.prompt_for_generation(
model_source, model_dir, f16_model_path
)
# If generation was skipped, offer to provide existing one
if not imatrix_path:
imatrix_path = self.imatrix_handler.prompt_for_user_imatrix(model_dir)
return model_source, f16_model_path, imatrix_path, output_repo
def _create_initial_repository(self, model_source: ModelSource, output_repo: str) -> None:
"""Create initial repository with planned quantisations."""
logger.info("Creating initial README with planned quantisations...")
quantisation_types = self.profile_manager.get_quantisation_types(self.custom_profiles)
planned_results = {
qt: QuantisationResult(quantisation_type=qt, success=False, status="planned")
for qt in quantisation_types
}
readme_path = self.readme_generator.generate(
model_source, planned_results, self.workspace_manager.models_dir, output_repo
)
if not self.no_upload:
logger.info("Creating repository with planned quantisations...")
self.uploader.upload_readme(output_repo, readme_path)
else:
logger.info("Skipping repository creation (--no-upload specified)")
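
As the entry point for the whole pipeline, the orchestrator is the class most callers will touch; everything else is wired up in `__post_init__`. A usage sketch, assuming it is importable as `helpers.quantisation.orchestrator` (the exact module path is not shown in this diff) and using a placeholder model URL:

```python
from pathlib import Path

from helpers.quantisation.orchestrator import QuantisationOrchestrator  # assumed module path

orchestrator = QuantisationOrchestrator(
    work_dir=Path("./quantisation_work"),
    use_imatrix=True,                    # offer to find or generate an importance matrix
    no_upload=True,                      # keep everything local for a dry run
    custom_profiles=["Q4_K_M", "Q6_K"],  # omit to fall back to DEFAULT_QUANTISATION_TYPES
)
results = orchestrator.quantise("https://huggingface.co/acme/demo-model")  # placeholder URL
for quant_type, result in results.items():
    print(quant_type.value, "ok" if result.success else result.error_message)
```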

View file

@ -0,0 +1,132 @@
"""Quantisation profile management.
Manages selection and validation of quantisation types based on
user preferences, architecture support, and configuration.
"""
from __future__ import annotations
from typing import TYPE_CHECKING
from helpers.config.quantisation_configs import (
DEFAULT_QUANTISATION_TYPES,
SUPPORTED_QUANTISATION_TYPES,
)
from helpers.llama_cpp.architecture import ArchitectureDetector
from helpers.logger import logger
from helpers.models.quantisation import QuantisationType
if TYPE_CHECKING:
from pathlib import Path
class ProfileManager:
"""Manages quantisation profiles and type selection.
Handles selection of quantisation types based on custom profiles,
architecture support, and fallback to defaults.
"""
@staticmethod
def get_quantisation_types(
custom_profiles: list[str] | None = None,
) -> list[QuantisationType]:
"""Get the quantisation types to use for this run.
Determines which quantisation types should be processed based on
custom profiles provided by the user, or falls back to default
configurations if no custom profiles are specified.
Returns:
List of QuantisationType enums to process.
"""
if custom_profiles:
return ProfileManager._parse_custom_profiles(custom_profiles)
return DEFAULT_QUANTISATION_TYPES
@staticmethod
def _parse_custom_profiles(profile_strings: list[str]) -> list[QuantisationType]:
"""Parse custom profile strings to QuantisationType enums.
Validates and converts user-provided profile strings into proper
QuantisationType enumerations, filtering out invalid or unsupported
types whilst logging warnings for problematic entries.
Returns:
List of valid QuantisationType enums.
"""
result = []
for profile_str in profile_strings:
try:
profile = QuantisationType(profile_str.upper())
if profile in SUPPORTED_QUANTISATION_TYPES:
result.append(profile)
else:
logger.warning(f"Profile {profile_str} is not supported, skipping")
except ValueError:
logger.warning(f"Invalid profile {profile_str}, skipping")
# Fall back to defaults if no valid profiles
return result or DEFAULT_QUANTISATION_TYPES
@staticmethod
def filter_by_architecture(
quantisation_types: list[QuantisationType],
f16_model_path: Path,
) -> tuple[list[QuantisationType], list[QuantisationType]]:
"""Filter quantisation types based on architecture support.
Analyses the F16 GGUF model to determine architecture compatibility
and filters the requested quantisation types accordingly. Separates
supported types from unsupported ones, especially filtering K-quants
for architectures not supported by llama.cpp.
Returns:
Tuple of (supported_types, unsupported_types).
"""
if not ArchitectureDetector.check_architecture_support(f16_model_path):
# Architecture not supported - filter out K-quants
basic_types = ["Q4_0", "Q5_0", "Q6_0", "Q8_0"]
supported = []
unsupported = []
for quant_type in quantisation_types:
if quant_type.value in basic_types:
supported.append(quant_type)
else:
unsupported.append(quant_type)
if unsupported:
logger.warning(
"⚠️ Architecture not supported by llama.cpp - K-quants will be skipped"
)
logger.info("💡 Basic types (Q4_0, Q5_0, Q6_0, Q8_0) will still be generated")
return supported, unsupported
# All types supported
return quantisation_types, []
@staticmethod
def validate_profiles(profiles: list[str]) -> list[str]:
"""Validate a list of profile strings.
Checks each profile string to ensure it corresponds to a valid
and supported quantisation type, logging warnings for invalid
entries whilst returning only the valid profile strings.
Returns:
List of valid profile strings.
"""
valid = []
for profile in profiles:
try:
quant_type = QuantisationType(profile.upper())
if quant_type in SUPPORTED_QUANTISATION_TYPES:
valid.append(profile)
else:
logger.warning(f"Profile {profile} exists but is not supported")
except ValueError:
logger.warning(f"Profile {profile} is not a valid quantisation type")
return valid
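
The validation is forgiving by design: case is normalised, unknown or unsupported names are dropped with a warning, and an empty result falls back to the default set. A quick sketch of that behaviour (the second entry is a deliberate typo):

```python
from helpers.quantisation.profile_manager import ProfileManager

# "q4_k_m" is normalised to Q4_K_M; "q4_km" is not a valid type and is skipped with a warning.
types = ProfileManager.get_quantisation_types(["q4_k_m", "q4_km"])
print([t.value for t in types])  # ["Q4_K_M"], assuming Q4_K_M is in SUPPORTED_QUANTISATION_TYPES
```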

View file

@ -0,0 +1,151 @@
"""Progress tracking and reporting for quantisation workflow.
Provides utilities for tracking quantisation progress, generating
status reports, and displaying completion summaries.
"""
from __future__ import annotations
from typing import TYPE_CHECKING
from helpers.logger import logger
if TYPE_CHECKING:
from helpers.models.quantisation import ModelSource, QuantisationResult, QuantisationType
class ProgressReporter:
"""Reports progress and status of quantisation operations.
Provides methods for displaying model information, progress updates,
and completion summaries throughout the quantisation workflow.
"""
@staticmethod
def print_model_info(model_source: ModelSource, username: str, work_dir: str) -> None:
"""Print model information at start of processing.
Displays comprehensive information about the model being processed,
including source details, author information, and working directory
to provide clear context at the beginning of quantisation workflows.
"""
logger.info(f"Source URL: {model_source.url}")
logger.info(f"Source model: {model_source.source_model}")
logger.info(f"Original author: {model_source.original_author}")
logger.info(f"Model name: {model_source.model_name}")
logger.info(f"Your HF username: {username}")
logger.info(f"Working directory: {work_dir}")
@staticmethod
def print_quantisation_start(
index: int,
total: int,
quant_type: str,
) -> None:
"""Print message when starting a quantisation.
Displays progress information showing which quantisation is currently
being processed within the overall batch, providing clear feedback
about workflow advancement and the specific type being quantised.
"""
logger.info(f"Processing quantisation {index}/{total}: {quant_type}")
@staticmethod
def print_completion_summary(
model_source: ModelSource,
results: dict[QuantisationType, QuantisationResult],
output_repo: str,
) -> None:
"""Print completion summary with results.
Generates comprehensive completion report showing successful quantisations,
file information, and repository links. Provides detailed feedback on
the overall quantisation workflow outcome and model availability.
"""
successful_results = [r for r in results.values() if r.success]
if successful_results:
logger.info("Complete! Your quantised models are available at:")
logger.info(f" https://huggingface.co/{output_repo}")
logger.info("Model info:")
logger.info(f" - Source URL: {model_source.url}")
logger.info(f" - Original: {model_source.source_model}")
logger.info(
" - Method: "
f"{'Direct GGUF download' if model_source.is_gguf_repo else 'HF model conversion'}"
)
logger.info(f" - Quantised: {output_repo}")
for result in successful_results:
if result.file_size:
filename = (
f"{model_source.original_author}-{model_source.model_name}-"
f"{result.quantisation_type}.gguf"
)
logger.info(f" - {result.quantisation_type}: {filename} ({result.file_size})")
else:
logger.error(
"All quantisations failed - repository created with documentation "
"but no model files"
)
logger.error(f" Repository: https://huggingface.co/{output_repo}")
@staticmethod
def print_upload_summary(completed: int, failed: int) -> None:
"""Print upload completion summary.
Reports the final upload statistics showing successful and failed
uploads with appropriate warning or success messaging based on
the outcome of the upload batch process.
"""
if failed > 0:
logger.warning(f"Upload summary: {completed} succeeded, {failed} failed")
else:
logger.info(f"All {completed} uploads completed successfully")
@staticmethod
def print_architecture_warning() -> None:
"""Print warning about unsupported architecture."""
logger.warning("⚠️ Architecture not supported by llama.cpp - K-quants will be skipped")
logger.info("💡 Basic types (Q4_0, Q5_0, Q6_0, Q8_0) will still be generated")
@staticmethod
def get_status_emoji(status: str) -> str:
"""Get emoji for a given status.
Maps status strings to appropriate emoji representations for enhanced
visual feedback in progress reporting. Provides a default emoji for
unknown status values to maintain consistent display formatting.
Returns:
Appropriate emoji for the status.
"""
status_emojis = {
"planned": "📋",
"processing": "⚙️",
"uploading": "📤",
"completed": "✅",
"failed": "❌",
}
return status_emojis.get(status, "❓")
@staticmethod
def format_progress_bar(current: int, total: int, width: int = 30) -> str:
"""Format a text progress bar.
Creates a visual progress representation using Unicode block characters
with percentage display. Handles edge cases like zero totals and
calculates appropriate fill ratios for the specified width.
Returns:
Formatted progress bar string.
"""
if total == 0:
return "[" + " " * width + "]"
progress = int((current / total) * width)
filled = "█" * progress
empty = "░" * (width - progress)
percentage = (current / total) * 100
return f"[{filled}{empty}] {percentage:.1f}%"
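
Both helpers are static and side-effect free, so they are easy to sanity-check in isolation; the exact glyphs depend on the block characters used above:

```python
from helpers.quantisation.progress import ProgressReporter

print(ProgressReporter.format_progress_bar(7, 12))
# e.g. [█████████████████░░░░░░░░░░░░░] 58.3%
print(ProgressReporter.get_status_emoji("uploading"))  # 📤
print(ProgressReporter.format_progress_bar(0, 0))      # empty bar for a zero total
```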

View file

@ -0,0 +1,23 @@
"""README generation for quantised models.
Provides utilities for generating comprehensive documentation including
model cards, quantisation tables, and status tracking.
"""
from __future__ import annotations
from helpers.readme.formatter import (
FileSizeFormatter,
StatusFormatter,
TableFormatter,
TagFormatter,
)
from helpers.readme.generator import ReadmeGenerator
__all__ = [
"FileSizeFormatter",
"ReadmeGenerator",
"StatusFormatter",
"TableFormatter",
"TagFormatter",
]

265
helpers/readme/formatter.py Normal file
View file

@ -0,0 +1,265 @@
"""README formatting utilities.
Provides formatters for status indicators, tables, and other README elements.
"""
from __future__ import annotations
from typing import TYPE_CHECKING
from helpers.config.quantisation_configs import QUANTISATION_CONFIGS
from helpers.models.quantisation import QuantisationResult, QuantisationType
if TYPE_CHECKING:
from pathlib import Path
from helpers.models.quantisation import ModelSource
# File size constant
GIBIBYTE = 1024**3
class StatusFormatter:
"""Formats status indicators for README tables."""
@staticmethod
def format_status(
result: QuantisationResult,
model_source: ModelSource,
quant_type: QuantisationType,
output_repo: str | None,
) -> str:
"""Format status indicator for README table.
Creates appropriate status indicator based on quantisation state
including progress indicators, file sizes, and download links.
Returns:
Formatted status string for table cell.
"""
status_map = {
"planned": "⏳ Queued",
"processing": "🔄 Processing...",
"uploading": "⬆️ Uploading...",
"failed": "❌ Failed",
}
if hasattr(result, "status") and result.status in status_map:
base_status = status_map[result.status]
# Check for architecture not supported error
if (
result.status == "failed"
and hasattr(result, "error_message")
and result.error_message
and "architecture not supported" in str(result.error_message).lower()
):
return "⚠️ Skipped"
if result.status == "uploading" and hasattr(result, "file_size") and result.file_size:
return f"{base_status} ({result.file_size})"
if result.status == "completed" or (hasattr(result, "success") and result.success):
return StatusFormatter.format_success_status(
result, model_source, quant_type, output_repo
)
return base_status
# Legacy support
if hasattr(result, "success") and result.success:
return StatusFormatter.format_success_status(
result, model_source, quant_type, output_repo
)
return "❌ Failed"
@staticmethod
def format_success_status(
result: QuantisationResult,
model_source: ModelSource,
quant_type: QuantisationType,
output_repo: str | None,
) -> str:
"""Format successful quantisation status with download link.
Creates a download link if repository information is available,
otherwise shows file size.
Returns:
Formatted success status string.
"""
if not output_repo:
return (
f"{result.file_size}"
if hasattr(result, "file_size") and result.file_size
else "✅ Available"
)
filename = (
f"{model_source.original_author}-{model_source.model_name}-{quant_type.value}.gguf"
)
url = f"https://huggingface.co/{output_repo}?show_file_info={filename}"
if hasattr(result, "file_size") and result.file_size:
return f"[✅ {result.file_size}]({url})"
return f"[✅ Available]({url})"
class TableFormatter:
"""Formats quantisation tables for README."""
@staticmethod
def get_ordered_quantisation_types() -> list[QuantisationType]:
"""Get quantisation types in display order.
Returns types ordered by precision level and variant.
Returns:
Ordered list of quantisation types.
"""
return [
# Q3 K-quants
QuantisationType.Q3_K_M,
QuantisationType.Q3_K_L,
QuantisationType.Q3_K_XL,
# Q4 types
QuantisationType.Q4_0, # Basic
QuantisationType.Q4_K_M,
QuantisationType.Q4_K_L,
# Q5 types
QuantisationType.Q5_0, # Basic
QuantisationType.Q5_K_M,
QuantisationType.Q5_K_L,
# Q6 types
QuantisationType.Q6_0, # Basic
QuantisationType.Q6_K,
QuantisationType.Q6_K_L,
# Q8 types
QuantisationType.Q8_0, # Basic
QuantisationType.Q8_K,
]
@staticmethod
def format_quantisation_row(
quant_type: QuantisationType,
result: QuantisationResult | None,
model_source: ModelSource,
output_repo: str | None,
) -> str:
"""Format a single quantisation table row.
Creates a formatted table row for the README displaying quantisation
type, configuration details, and status information. Handles cases
where no result is available by creating a default planned result.
Returns:
Formatted table row string.
"""
# Create default result if none exists
if result is None:
result = QuantisationResult(
quantisation_type=quant_type, success=False, status="planned"
)
# Get configuration
config = QUANTISATION_CONFIGS.get(quant_type)
# Format status
status_formatter = StatusFormatter()
status = status_formatter.format_status(result, model_source, quant_type, output_repo)
# Get configuration description
config_desc = (
config.get_compact_config(QUANTISATION_CONFIGS)
if config
else f"{quant_type} all layers"
)
return f"| **{quant_type.value}** | {config_desc} | {status} |\n"
class TagFormatter:
"""Formats tags for README frontmatter."""
@staticmethod
def build_tags(
results: dict[QuantisationType, QuantisationResult],
original_tags: list[str] | None = None,
) -> list[str]:
"""Build tags based on quantisation results.
Generates appropriate tags for the model repository based on
successful quantisations and combines them with any original
tags from the source model to create a comprehensive tag list.
Returns:
Sorted list of unique tags.
"""
our_tags = ["gguf"]
# Add tags for successful quantisations
for quant_type, result in results.items():
if hasattr(result, "status") and result.status == "completed":
if quant_type == QuantisationType.F16:
our_tags.append("f16")
elif hasattr(result, "quantisation_type"):
# Convert to lowercase tag format
our_tags.append(result.quantisation_type.value.lower())
# Check for F16 availability
if (
len(our_tags) == 1
and QuantisationType.F16 in results
and hasattr(results[QuantisationType.F16], "status")
and results[QuantisationType.F16].status in {"completed", "uploading"}
):
our_tags.append("f16")
# Combine with original tags
all_tags = our_tags
if original_tags:
all_tags = sorted(set(our_tags + original_tags))
return all_tags
class FileSizeFormatter:
"""Formats file sizes for display."""
@staticmethod
def format_size_bytes(size_bytes: int) -> str:
"""Format bytes to human-readable size.
Converts raw byte values into human-readable format using appropriate
units (B, KB, MB, GB) with decimal precision for larger values to
provide clear file size information in documentation.
Returns:
Formatted size string (e.g., "4.5GB").
"""
if size_bytes < 1024:
return f"{size_bytes}B"
if size_bytes < 1024**2:
return f"{size_bytes / 1024:.1f}KB"
if size_bytes < GIBIBYTE:
return f"{size_bytes / (1024**2):.1f}MB"
return f"{size_bytes / GIBIBYTE:.1f}GB"
@staticmethod
def get_file_size(file_path: Path) -> str:
"""Get formatted file size from path.
Retrieves file size information from the filesystem and formats
it into human-readable format. Handles non-existent files gracefully
by returning a placeholder string for missing files.
Returns:
Formatted size string or "-" if file doesn't exist.
"""
if not file_path.exists():
return "-"
size_bytes = file_path.stat().st_size
return FileSizeFormatter.format_size_bytes(size_bytes)
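
The size formatter uses 1024-based thresholds throughout, which is easy to confirm directly; the byte values below are arbitrary examples:

```python
from helpers.readme.formatter import FileSizeFormatter

print(FileSizeFormatter.format_size_bytes(512))            # 512B
print(FileSizeFormatter.format_size_bytes(3 * 1024**2))    # 3.0MB
print(FileSizeFormatter.format_size_bytes(5_000_000_000))  # 4.7GB
```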

311
helpers/readme/generator.py Normal file
View file

@ -0,0 +1,311 @@
"""README generation for quantised models.
Coordinates README creation by combining templates, formatting, and
original model information.
"""
from __future__ import annotations
import json
import re
from typing import TYPE_CHECKING
from helpers.logger import logger
from helpers.models.quantisation import QuantisationType
from helpers.readme.formatter import (
FileSizeFormatter,
TableFormatter,
TagFormatter,
)
from helpers.readme.templates import (
get_f16_row_template,
get_frontmatter_template,
get_header_template,
get_original_model_section,
get_quantisation_info,
)
from helpers.utils.config_parser import ConfigParser
if TYPE_CHECKING:
from pathlib import Path
from helpers.models.quantisation import ModelSource, QuantisationResult
# File size constant
GIBIBYTE = 1024**3
class ReadmeGenerator:
"""Generates README files for quantised models.
Creates comprehensive README documentation including model cards,
quantisation details, and status tracking. Supports both initial
planning documentation and final result summaries.
"""
def generate(
self,
model_source: ModelSource,
results: dict[QuantisationType, QuantisationResult],
models_dir: Path,
output_repo: str | None = None,
) -> Path:
"""Generate README file for quantised model repository.
Creates a comprehensive README with frontmatter, quantisation table,
and original model information. Handles status tracking for planned,
processing, and completed quantisations.
Returns:
Path to generated README file.
"""
logger.info("Creating model card...")
model_dir = models_dir / model_source.model_name
readme_path = model_dir / "README.md"
# Get original README content
original_content = self._get_original_readme(model_source, model_dir)
# Generate new README
readme_content = self._generate_readme_content(
model_source, results, original_content, output_repo, models_dir
)
readme_path.write_text(readme_content)
return readme_path
def _get_architecture(self, model_dir: Path) -> str | None:
"""Get the architecture from the model's config.json.
Returns:
Architecture name or None if not found.
"""
config_path = model_dir / "config.json"
if not config_path.exists():
return None
try:
with config_path.open(encoding="utf-8") as f:
config = json.load(f)
# Get the architectures field - it's a list
architectures = config.get("architectures", [])
if architectures:
arch_name = architectures[0]
# Get the mapped architecture (what it will be converted to)
parser = ConfigParser()
mapped_arch = parser.get_architecture_mapping(arch_name)
logger.info(f"Architecture: {arch_name} -> {mapped_arch}")
return mapped_arch
except Exception as e:
logger.warning(f"Could not determine architecture: {e}")
return None
def _get_original_readme(self, model_source: ModelSource, model_dir: Path) -> dict[str, str]:
"""Extract original README and metadata.
Downloads or reads the original model's README for inclusion in the
quantised model documentation. Parses YAML frontmatter if present.
Returns:
Dictionary with readme content, licence, tags, and frontmatter.
"""
content = {"readme": "", "licence": "apache-2.0", "tags": "", "frontmatter": ""}
# Check for preserved original README first
original_readme_path = model_dir / "README.original.md"
readme_path = model_dir / "README.md"
if original_readme_path.exists():
# Use the preserved original
content["readme"] = original_readme_path.read_text(encoding="utf-8")
logger.info(f"Found preserved original README ({len(content['readme'])} characters)")
elif readme_path.exists():
# First time - preserve the original and use it
readme_content = readme_path.read_text(encoding="utf-8")
# Check if this is already our generated README
if (
f"{model_source.original_author}-{model_source.model_name}-GGUF"
not in readme_content
):
# This is the original - preserve it
original_readme_path.write_text(readme_content)
content["readme"] = readme_content
logger.info(f"Preserved original README ({len(readme_content)} characters)")
else:
# This is our README, try to extract original content
logger.info("Found existing generated README, extracting original content")
# Try to find the separator
separator_idx = readme_content.find("\n---\n\n## Original Model Information\n")
if separator_idx > 0:
content["readme"] = readme_content[separator_idx + 37 :]
else:
logger.info("No README found to preserve")
# Parse frontmatter if we have content
if content["readme"]:
parsed = self._parse_frontmatter(content["readme"])
content.update(parsed)
return content
def _parse_frontmatter(self, readme_text: str) -> dict[str, str]:
"""Parse YAML frontmatter from README.
Extracts metadata from YAML frontmatter including licence, tags,
and other model card fields.
Returns:
Dictionary with separated content and metadata.
"""
lines = readme_text.split("\n")
if lines[0] != "---":
return {
"readme": readme_text,
"licence": "apache-2.0",
"tags": "",
"frontmatter": "",
}
frontmatter_end = -1
for i, line in enumerate(lines[1:], 1):
if line == "---":
frontmatter_end = i
break
if frontmatter_end == -1:
return {
"readme": readme_text,
"licence": "apache-2.0",
"tags": "",
"frontmatter": "",
}
frontmatter = "\n".join(lines[1:frontmatter_end])
content = "\n".join(lines[frontmatter_end + 1 :])
# Extract licence
licence_match = re.search(r"^license:\s*(.+)$", frontmatter, re.MULTILINE)
licence_val = licence_match.group(1).strip().strip('"') if licence_match else "apache-2.0"
# Extract tags
tags = []
in_tags = False
for line in frontmatter.split("\n"):
if line.startswith("tags:"):
in_tags = True
continue
if in_tags:
if line.startswith("- "):
tags.append(line[2:].strip())
elif line and not line.startswith(" "):
break
return {
"readme": content,
"licence": licence_val,
"tags": ",".join(tags),
"frontmatter": frontmatter,
}
def _generate_readme_content(
self,
model_source: ModelSource,
results: dict[QuantisationType, QuantisationResult],
original_content: dict[str, str],
output_repo: str | None = None,
models_dir: Path | None = None,
) -> str:
"""Generate complete README content with quantisation details.
Creates the full README including YAML frontmatter, quantisation status
table, and original model information.
Returns:
Complete README markdown content.
"""
# Build tags
tag_formatter = TagFormatter()
original_tags = original_content["tags"].split(",") if original_content["tags"] else []
all_tags = tag_formatter.build_tags(results, original_tags)
# Build frontmatter
content = get_frontmatter_template(
original_content["licence"],
model_source.source_model,
all_tags,
)
# Add header
content += get_header_template(
model_source.original_author,
model_source.model_name,
model_source.source_model,
)
# Add quantisation table
table_formatter = TableFormatter()
for quant_type in table_formatter.get_ordered_quantisation_types():
result = results.get(quant_type)
content += table_formatter.format_quantisation_row(
quant_type, result, model_source, output_repo
)
# Add F16 row if applicable
if not model_source.is_gguf_repo and output_repo:
content += self._format_f16_row(model_source, results, output_repo, models_dir)
# Add quantisation information
content += get_quantisation_info()
# Add original model section if available
if original_content.get("readme"):
content += get_original_model_section(original_content["readme"])
return content
def _format_f16_row(
self,
model_source: ModelSource,
results: dict[QuantisationType, QuantisationResult],
output_repo: str,
models_dir: Path | None = None,
) -> str:
"""Format F16 GGUF row for the table.
Creates a properly formatted F16 reference row for the quantisation table
from the source model information, results data, and repository details,
optionally using the models directory to determine the file size on disk.
Returns:
Formatted F16 table row.
"""
# Get F16 result from results dict
f16_result = results.get(QuantisationType.F16)
# Get file size
f16_size = "-"
if f16_result and hasattr(f16_result, "file_size"):
f16_size = f16_result.file_size or "-"
elif models_dir:
# Try to get from actual file
f16_filename = f"{model_source.original_author}-{model_source.model_name}-f16.gguf"
f16_path = models_dir / model_source.model_name / f16_filename
if f16_path.exists():
f16_size = FileSizeFormatter.get_file_size(f16_path)
# Get status
status = "planned"
if f16_result and hasattr(f16_result, "status"):
status = f16_result.status
return get_f16_row_template(
model_source.original_author,
model_source.model_name,
output_repo,
f16_size,
status,
)
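# Illustrative example (hypothetical names): with original_author="acme" and
# model_name="tiny", the fallback path checked above is
# models_dir / "tiny" / "acme-tiny-f16.gguf".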

228
helpers/readme/templates.py Normal file
View file

@ -0,0 +1,228 @@
"""README templates for quantised models.
Provides template strings and builders for generating README documentation.
"""
from __future__ import annotations
def get_frontmatter_template(
licence: str,
base_model: str,
tags: list[str],
) -> str:
"""Generate YAML frontmatter for README.
Creates the YAML metadata header for HuggingFace model cards including
licence information, library specification, base model reference, and
tag listings formatted according to HuggingFace conventions.
Returns:
Formatted YAML frontmatter string.
"""
frontmatter = f"""---
license: {licence}
library_name: gguf
base_model: {base_model}
tags:
"""
for tag in tags:
if tag.strip():
frontmatter += f"- {tag.strip()}\n"
frontmatter += "---\n\n"
return frontmatter
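# Illustrative output for get_frontmatter_template("apache-2.0", "org/model", ["gguf", "chat"])
# (hypothetical arguments):
#   ---
#   license: apache-2.0
#   library_name: gguf
#   base_model: org/model
#   tags:
#   - gguf
#   - chat
#   ---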
def get_header_template(
original_author: str,
model_name: str,
source_model: str,
) -> str:
"""Generate README header section.
Creates the main header section with model title, description of the
quantisation process, and initial table structure for displaying
quantisation variants and their status information.
Returns:
Formatted header markdown.
"""
hf_url = f"https://huggingface.co/{source_model}"
return f"""# {original_author}-{model_name}-GGUF
GGUF quantisations of [{source_model}]({hf_url}) using
[Bartowski](https://huggingface.co/bartowski)'s method. Created with [llm-gguf-tools](https://git.tomfos.tr/tom/llm-gguf-tools)
which replicates Bartowski's quantisation profiles.
| Variant | Configuration | Status |
|---|---|---|
"""
def get_downloads_section(download_instruction: str | None = None) -> str:
"""Generate downloads and usage section.
Creates comprehensive usage documentation including download instructions,
quick start examples for various runtimes (llama.cpp, Ollama, LM Studio),
and integration guidance with optional custom instructions.
Returns:
Formatted downloads section markdown.
"""
base_section = """
## 📥 Download Links
Direct download links are available for each quantisation in the table above. Click the status to
go to the file page.
## 🚀 Quick Start
### Using llama.cpp
```bash
# Download the model (replace Q4_K_M with your chosen quantisation)
wget https://huggingface.co/YOUR_REPO/resolve/main/model-Q4_K_M.gguf
# Run with llama.cpp
./llama-cli -m model-Q4_K_M.gguf -p "Your prompt here"
```
### Using Ollama
```bash
# Create Modelfile
echo "FROM ./model-Q4_K_M.gguf" > Modelfile
# Create and run the model
ollama create mymodel -f Modelfile
ollama run mymodel
```
### Using LM Studio
1. Open LM Studio
2. Click "Download Model"
3. Paste the HuggingFace repository URL
4. Select your preferred quantisation
5. Click Download
"""
if download_instruction:
base_section = f"{download_instruction}\n\n{base_section}"
return base_section
def get_quantisation_info() -> str:
"""Get information about quantisation types.
Returns:
Formatted quantisation information markdown.
"""
return """
## 📊 Quantisation Information
### Bartowski Naming Convention
- **L variants** (Q3_K_L, Q4_K_L, Q5_K_L): M base with higher-precision embeddings and/or output weights (typically Q8_0)
- **M variants** (Q3_K_M, Q4_K_M, Q5_K_M): Standard K-quant configuration
- **XL variant** (Q3_K_XL): Q8_0 embeddings + Q6_K output weights
- **_L suffix** (Q6_K_L): Q8_0 for output.weight tensor
### Recommended Quantisations
- **Q4_K_M**: Best balance of quality and size (4.58GB for 7B model)
- **Q5_K_M**: Higher quality, larger size (5.33GB for 7B model)
- **Q3_K_L**: Smallest with good quality (3.35GB for 7B model)
- **Q6_K_L**: Near original quality (5.65GB for 7B model)
- **Q8_0**: Highest quality quantisation (7.17GB for 7B model)
### Basic vs K-quants
- **Basic types** (Q4_0, Q5_0, Q6_0, Q8_0): Simple quantisation, universally compatible
- **K-quants** (Q#_K_*): Advanced quantisation with better quality/size ratios
Choose K-quants when available for better performance. Basic types are fallbacks for unsupported
architectures.
"""
def get_original_model_section(
original_readme: str,
separator: str = "---",
) -> str:
"""Format original model documentation section.
Formats the original model's documentation for inclusion in the
quantised model's README, preserving important context whilst
clearly separating it from the quantisation-specific information.
Returns:
Formatted original model section.
"""
if not original_readme:
return ""
return f"""
{separator}
## Original Model Information
{original_readme}
"""
def get_f16_row_template(
original_author: str,
model_name: str,
output_repo: str,
file_size: str = "-",
status: str = "completed",
) -> str:
"""Generate F16 GGUF row for the table.
Creates a formatted table row for the F16 reference model with
appropriate status indicators, download links, and file size
information based on upload status and availability.
Returns:
Formatted table row for F16.
"""
filename = f"{original_author}-{model_name}-f16.gguf"
url = f"https://huggingface.co/{output_repo}/blob/main/{filename}"
if status == "uploading":
status_text = f"⬆️ Uploading... ({file_size})"
elif status == "completed":
status_text = f"[✅ {file_size}]({url})"
else:
status_text = "⏳ Queued"
return f"| **F16** | Full precision reference | {status_text} |\n"
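# Illustrative rows (hypothetical author/model/repo):
#   get_f16_row_template("acme", "tiny", "user/acme-tiny-GGUF", "13.5GB", "completed")
#     -> "| **F16** | Full precision reference | [✅ 13.5GB](https://huggingface.co/user/acme-tiny-GGUF/blob/main/acme-tiny-f16.gguf) |\n"
#   get_f16_row_template("acme", "tiny", "user/acme-tiny-GGUF", "13.5GB", "uploading")
#     -> "| **F16** | Full precision reference | ⬆️ Uploading... (13.5GB) |\n"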
def get_troubleshooting_section() -> str:
"""Get troubleshooting section for README.
Returns:
Formatted troubleshooting markdown.
"""
return """
## 🔧 Troubleshooting
### File Not Found
- Ensure you're using the correct repository URL
- Check that the quantisation has completed (✅ status)
- Try refreshing the page if recently uploaded
### Performance Issues
- Use smaller quantisations for limited RAM/VRAM
- Q4_K_M offers the best balance for most users
- Enable GPU acceleration if available
### Compatibility
- K-quants require llama.cpp or compatible runtime
- Basic types (Q4_0, Q5_0, etc.) work with all runtimes
- Check your runtime's documentation for supported types
"""

View file

@ -1,6 +0,0 @@
"""Service layer for llm-gguf-tools.
Provides high-level service interfaces for interacting with external systems
including HuggingFace, llama.cpp, and filesystem operations. Uses UK English
spelling conventions throughout.
"""

View file

@ -1,236 +0,0 @@
"""GGUF file operations service.
Provides unified interface for creating, writing, and manipulating GGUF files.
Consolidates GGUF-specific operations from conversion and quantisation workflows.
Uses UK English spelling conventions throughout.
"""
from __future__ import annotations
import gc
from typing import TYPE_CHECKING, Any, Protocol
import gguf
import torch
from safetensors import safe_open
from helpers.logger import logger
from helpers.services.filesystem import FilesystemService
from helpers.utils.config_parser import ConfigParser
class VisionConfig(Protocol):
"""Protocol for vision model configuration."""
hidden_size: int
num_hidden_layers: int
num_attention_heads: int
intermediate_size: int
patch_size: int
spatial_merge_size: int
class TensorMapper(Protocol):
"""Protocol for tensor name mapping."""
def map_tensor_name(self, name: str) -> str | None:
"""Map a tensor name to its GGUF equivalent."""
if TYPE_CHECKING:
from pathlib import Path
import numpy as np
from helpers.models.conversion import ModelConfig
class GGUFWriter:
"""Manages GGUF file creation and metadata writing.
Provides high-level interface for GGUF file operations including metadata
configuration, tensor addition, and tokeniser integration. Encapsulates
low-level GGUF library interactions for consistent error handling.
"""
def __init__(self, output_path: Path, architecture: str) -> None:
"""Initialise GGUF writer with output path and architecture.
Creates the underlying GGUF writer instance and prepares for metadata
and tensor addition. Sets up the file structure for the specified
model architecture.
"""
self.output_path = output_path
self.architecture = architecture
self.writer = gguf.GGUFWriter(str(output_path), architecture)
logger.info(f"Created GGUF writer for {architecture} architecture")
def add_metadata(self, model_config: ModelConfig, model_name: str) -> None:
"""Add comprehensive metadata from model configuration.
Writes general model information, architectural parameters, and
quantisation settings to the GGUF file header. Handles both standard
and vision model configurations with appropriate parameter mapping.
"""
# General metadata
self.writer.add_name(model_name)
self.writer.add_description(f"Converted from {model_config.architectures[0]}")
self.writer.add_file_type(gguf.LlamaFileType.ALL_F32)
# Model parameters from config
params = model_config.to_gguf_params()
self.writer.add_context_length(params.context_length)
self.writer.add_embedding_length(params.embedding_length)
self.writer.add_block_count(params.block_count)
self.writer.add_feed_forward_length(params.feed_forward_length)
self.writer.add_head_count(params.attention_head_count)
self.writer.add_head_count_kv(params.attention_head_count_kv)
self.writer.add_layer_norm_rms_eps(params.attention_layer_norm_rms_epsilon)
self.writer.add_rope_freq_base(params.rope_freq_base)
self.writer.add_rope_dimension_count(params.rope_dimension_count)
logger.info(f"Added metadata: {params.block_count} layers, {params.context_length} context")
def add_vision_metadata(self, vision_config: VisionConfig | None) -> None:
"""Add vision model parameters to GGUF metadata.
Configures vision-specific parameters for multimodal models including
embedding dimensions, attention heads, and spatial processing settings.
"""
if not vision_config:
return
logger.info("Adding vision model parameters...")
self.writer.add_vision_embedding_length(vision_config.hidden_size)
self.writer.add_vision_block_count(vision_config.num_hidden_layers)
self.writer.add_vision_head_count(vision_config.num_attention_heads)
self.writer.add_vision_feed_forward_length(vision_config.intermediate_size)
self.writer.add_vision_patch_size(vision_config.patch_size)
self.writer.add_vision_spatial_merge_size(vision_config.spatial_merge_size)
if hasattr(vision_config, "rms_norm_eps") and vision_config.rms_norm_eps:
self.writer.add_vision_attention_layernorm_eps(vision_config.rms_norm_eps)
def add_tokeniser(self, tokeniser_config: dict[str, Any]) -> None:
"""Add tokeniser metadata to GGUF file.
Writes special token IDs and tokeniser model type to enable proper
text processing during inference. Uses sensible defaults for missing
configuration values.
"""
self.writer.add_bos_token_id(tokeniser_config.get("bos_token_id", 1))
self.writer.add_eos_token_id(tokeniser_config.get("eos_token_id", 2))
self.writer.add_unk_token_id(tokeniser_config.get("unk_token_id", 0))
self.writer.add_pad_token_id(tokeniser_config.get("pad_token_id", 0))
self.writer.add_tokenizer_model(tokeniser_config.get("model_type", "llama"))
logger.info("Added tokeniser configuration")
def add_tensor(self, name: str, data: np.ndarray) -> None:
"""Add a tensor to the GGUF file.
Writes tensor data with the specified name to the file. Handles
data type conversions and validates tensor shapes.
"""
self.writer.add_tensor(name, data)
def finalise(self) -> None:
"""Write all data to file and close writer.
Completes the GGUF file creation by writing headers, key-value data,
and tensor data in the correct order. Ensures proper file closure.
"""
logger.info(f"Writing GGUF file to {self.output_path}")
self.writer.write_header_to_file()
self.writer.write_kv_data_to_file()
self.writer.write_tensors_to_file()
self.writer.close()
logger.info("GGUF file written successfully")
class GGUFConverter:
"""High-level GGUF conversion orchestrator.
Coordinates the complete conversion workflow from source models to GGUF
format, managing metadata extraction, tensor mapping, and file writing.
"""
@staticmethod
def convert_safetensors(
model_path: Path,
output_path: Path,
model_config: ModelConfig,
architecture: str,
tensor_mapper: TensorMapper,
) -> bool:
"""Convert SafeTensors model to GGUF format.
Orchestrates the conversion process including metadata setup, tensor
loading with BFloat16 support, name mapping, and tokeniser integration.
Returns:
True if conversion successful, False otherwise.
"""
logger.info(f"Converting {model_path.name} to GGUF...")
# Create writer
writer_wrapper = GGUFWriter(output_path, architecture)
# Add metadata
writer_wrapper.add_metadata(model_config, model_path.name)
# Add vision metadata if present
if model_config.vision_config:
writer_wrapper.add_vision_metadata(model_config.vision_config)
# Load and add tensors
fs = FilesystemService()
tensor_files = fs.find_safetensor_files(model_path)
logger.info(f"Found {len(tensor_files)} tensor file(s)")
tensor_count = 0
for tensor_file in tensor_files:
logger.info(f"Loading {tensor_file.name}...")
with safe_open(tensor_file, framework="pt") as f:
for tensor_name in f.keys(): # noqa: SIM118
tensor_data = f.get_tensor(tensor_name)
# Convert BFloat16 to Float32
if hasattr(tensor_data, "numpy"):
if torch and tensor_data.dtype == torch.bfloat16:
tensor_data = tensor_data.float()
tensor_data = tensor_data.numpy()
# Map tensor name
gguf_name = tensor_mapper.map_tensor_name(tensor_name)
if gguf_name:
writer_wrapper.add_tensor(gguf_name, tensor_data)
tensor_count += 1
if tensor_count % 100 == 0:
logger.info(f" Processed {tensor_count} tensors...")
# Free memory after processing each tensor
del tensor_data
# Force garbage collection after processing each file
gc.collect()
logger.info(f"Total tensors processed: {tensor_count}")
# Add tokeniser
try:
tok_config = ConfigParser.load_tokeniser_config(model_path)
writer_wrapper.add_tokeniser(tok_config)
logger.info("Tokeniser added")
except Exception as e:
logger.warning(f"Could not add tokeniser: {e}")
# Finalise file
writer_wrapper.finalise()
file_size = fs.get_file_size(output_path)
logger.info(f"Conversion complete! Output: {output_path} ({file_size})")
return True
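# Illustrative call, assuming ConfigParser (imported above), the concrete TensorMapper from
# helpers.utils.tensor_mapping, a llama-family architecture, and hypothetical paths:
#   model_dir = Path("work/models/my-model")
#   model_config = ConfigParser().load_model_config(model_dir)
#   GGUFConverter.convert_safetensors(
#       model_dir,
#       model_dir / "my-model-f16.gguf",
#       model_config,
#       "llama",
#       TensorMapper(),
#   )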

View file

@ -1,613 +0,0 @@
"""HuggingFace operations service.
Handles all interactions with HuggingFace including model downloads,
uploads, README generation, and repository management. Uses UK English
spelling conventions throughout.
"""
from __future__ import annotations
import re
import shutil
import subprocess
import tempfile
from pathlib import Path
from typing import TYPE_CHECKING
from helpers.config.quantisation_configs import QUANTISATION_CONFIGS
from helpers.logger import logger
from helpers.models.quantisation import QuantisationType
if TYPE_CHECKING:
from helpers.models.quantisation import ModelSource, QuantisationResult
# Constants for file size formatting
GIBIBYTE = 1024**3
class HuggingFaceService:
"""Manages HuggingFace repository operations.
Provides methods for downloading models, uploading files, and managing
repositories. Handles authentication, error recovery, and progress tracking
for robust interaction with HuggingFace services.
"""
@staticmethod
def get_username() -> str:
"""Get authenticated HuggingFace username.
Retrieves the current user's HuggingFace username using the CLI.
Requires prior authentication via `huggingface-cli login`.
Returns:
HuggingFace username.
Raises:
RuntimeError: If not authenticated or CLI not available.
"""
try:
result = subprocess.run(
["huggingface-cli", "whoami"],
capture_output=True,
text=True,
check=True,
)
return result.stdout.strip()
except (subprocess.CalledProcessError, FileNotFoundError) as err:
msg = "Please log in to HuggingFace first: huggingface-cli login"
raise RuntimeError(msg) from err
@staticmethod
def download_model(
model_name: str, output_dir: Path, include_pattern: str | None = None
) -> None:
"""Download model from HuggingFace.
Downloads a complete model or specific files matching a pattern.
Creates the output directory if it doesn't exist. Supports filtered
downloads for efficient bandwidth usage when only certain files are needed.
"""
logger.info(f"Downloading {model_name} to {output_dir}")
cmd = [
"huggingface-cli",
"download",
model_name,
"--local-dir",
str(output_dir),
]
if include_pattern:
cmd.extend(["--include", include_pattern])
subprocess.run(cmd, check=True, capture_output=True, text=True)
logger.info("Download complete")
@staticmethod
def upload_file(
repo_id: str,
local_path: Path,
repo_path: str | None = None,
create_repo: bool = False,
) -> None:
"""Upload a file to HuggingFace repository.
Uploads a single file to the specified repository path. Can create
the repository if it doesn't exist. Uses git directly when possible
to avoid automatic PR creation.
Raises:
CalledProcessError: If upload fails.
"""
repo_path = repo_path or local_path.name
logger.info(f"Uploading {local_path.name} to {repo_id}/{repo_path}")
# Try git-based upload first to avoid PR creation
if HuggingFaceService._try_git_upload(
repo_id, local_path, repo_path, create_repo=create_repo
):
logger.info(f"Uploaded {repo_path} via git")
return
# Fallback to huggingface-cli
logger.info("Git upload failed, trying huggingface-cli...")
cmd = [
"huggingface-cli",
"upload",
repo_id,
str(local_path),
repo_path,
"--revision",
"main", # Explicitly push to main branch
"--commit-message",
f"Add {repo_path}",
]
if create_repo:
cmd.append("--create")
try:
subprocess.run(cmd, check=True, capture_output=True)
logger.info(f"Uploaded {repo_path}")
except subprocess.CalledProcessError:
if create_repo:
# Repository might already exist, retry without --create
cmd = cmd[:-1] # Remove --create flag
subprocess.run(cmd, check=True, capture_output=True, text=True)
logger.info(f"Updated {repo_path}")
else:
raise
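# Illustrative call (hypothetical repo and file):
#   HuggingFaceService.upload_file(
#       "user/acme-tiny-GGUF",
#       Path("work/models/tiny/acme-tiny-Q4_K_M.gguf"),
#       create_repo=True,
#   )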
@staticmethod
def _try_git_upload(
repo_id: str,
local_path: Path,
repo_path: str,
*,
create_repo: bool = False,
) -> bool:
"""Try to upload file using git directly to avoid PR creation.
Returns:
bool: True if upload successful, False if should fallback to CLI.
"""
try:
with tempfile.TemporaryDirectory() as temp_dir:
temp_path = Path(temp_dir)
repo_url = f"https://huggingface.co/{repo_id}"
# Clone repository
logger.info(f"Cloning {repo_url}...")
result = subprocess.run(
["git", "clone", repo_url, str(temp_path / "repo")],
check=False,
capture_output=True,
text=True,
)
if result.returncode != 0:
if create_repo:
# Repository doesn't exist, let huggingface-cli handle creation
return False
logger.warning(f"Clone failed: {result.stderr}")
return False
repo_dir = temp_path / "repo"
target_file = repo_dir / repo_path
# Ensure target directory exists
target_file.parent.mkdir(parents=True, exist_ok=True)
# Copy file
shutil.copy2(local_path, target_file)
# Check if there are any changes
status_result = subprocess.run(
["git", "status", "--porcelain"],
cwd=repo_dir,
capture_output=True,
text=True,
check=True,
)
if not status_result.stdout.strip():
logger.info(f"No changes detected for {repo_path}, file already up-to-date")
return True # File is already up-to-date, no need to push
# Git add, commit, push
subprocess.run(
["git", "add", repo_path],
cwd=repo_dir,
check=True,
capture_output=True,
text=True,
)
subprocess.run(
["git", "commit", "-m", f"Update {repo_path}"],
cwd=repo_dir,
check=True,
capture_output=True,
text=True,
)
subprocess.run(
["git", "push"],
cwd=repo_dir,
check=True,
capture_output=True,
text=True,
)
return True
except subprocess.CalledProcessError as e:
logger.warning(f"Git upload failed: {e}")
return False
except Exception as e:
logger.warning(f"Git upload error: {e}")
return False
class ReadmeGenerator:
"""Generates README files for quantised models.
Creates comprehensive README documentation including model cards,
quantisation details, and status tracking. Supports both initial
planning documentation and final result summaries.
"""
def generate(
self,
model_source: ModelSource,
results: dict[QuantisationType, QuantisationResult],
models_dir: Path,
output_repo: str | None = None,
) -> Path:
"""Generate README file for quantised model repository.
Creates a comprehensive README with frontmatter, quantisation table,
and original model information. Handles status tracking for planned,
processing, and completed quantisations.
Returns:
Path to generated README file.
"""
logger.info("Creating model card...")
model_dir = models_dir / model_source.model_name
readme_path = model_dir / "README.md"
# Get original README content
original_content = self._get_original_readme(model_source, model_dir)
# Generate new README
readme_content = self._generate_readme_content(
model_source, results, original_content, output_repo
)
readme_path.write_text(readme_content)
return readme_path
def _get_original_readme(self, model_source: ModelSource, model_dir: Path) -> dict[str, str]:
"""Extract original README and metadata.
Downloads or reads the original model's README for inclusion in the
quantised model documentation. Parses YAML frontmatter if present.
Returns:
Dictionary with readme content, licence, tags, and frontmatter.
"""
content = {"readme": "", "licence": "apache-2.0", "tags": "", "frontmatter": ""}
# Check for preserved original README first
original_readme_path = model_dir / "README.original.md"
readme_path = model_dir / "README.md"
if original_readme_path.exists():
# Use the preserved original
content["readme"] = original_readme_path.read_text(encoding="utf-8")
logger.info(f"Found preserved original README ({len(content['readme'])} characters)")
elif readme_path.exists():
# First time - preserve the original and use it
readme_content = readme_path.read_text(encoding="utf-8")
# Check if this is already our generated README
if (
f"{model_source.original_author}-{model_source.model_name}-GGUF"
not in readme_content
):
# This is the original - preserve it
original_readme_path.write_text(readme_content, encoding="utf-8")
content["readme"] = readme_content
readme_len = len(content["readme"])
logger.info(
f"Preserved original README as README.original.md ({readme_len} characters)"
)
else:
# This is our generated README, need to download the original
logger.info("Found generated README, downloading original from source")
content = self._download_readme(model_source)
# Save the downloaded original for future use
if content["readme"]:
original_readme_path.write_text(content["readme"], encoding="utf-8")
logger.info("Preserved downloaded original README as README.original.md")
else:
# No local README - download from source
content = self._download_readme(model_source)
# Save the downloaded original for future use
if content["readme"]:
original_readme_path.write_text(content["readme"], encoding="utf-8")
logger.info("Preserved downloaded original README as README.original.md")
# Parse frontmatter if present
if content["readme"].startswith("---\n"):
content = self._parse_frontmatter(content["readme"])
return content
def _download_readme(self, model_source: ModelSource) -> dict[str, str]:
"""Download README from HuggingFace repository.
Attempts to download just the README.md file from the source repository
for efficient documentation extraction.
Returns:
Dictionary with readme content and default metadata.
"""
content = {"readme": "", "licence": "apache-2.0", "tags": "", "frontmatter": ""}
with tempfile.TemporaryDirectory() as temp_dir:
try:
logger.info(f"Downloading README from {model_source.source_model}...")
subprocess.run(
[
"huggingface-cli",
"download",
model_source.source_model,
"--include",
"README.md",
"--local-dir",
temp_dir,
],
check=True,
capture_output=True,
)
readme_path = Path(temp_dir) / "README.md"
if readme_path.exists():
content["readme"] = readme_path.read_text(encoding="utf-8")
logger.info(f"Downloaded README ({len(content['readme'])} characters)")
except subprocess.CalledProcessError as e:
logger.warning(f"Failed to download README: {e}")
return content
def _parse_frontmatter(self, readme_text: str) -> dict[str, str]:
"""Parse YAML frontmatter from README.
Extracts metadata from YAML frontmatter including licence, tags,
and other model card fields.
Returns:
Dictionary with separated content and metadata.
"""
lines = readme_text.split("\n")
if lines[0] != "---":
return {
"readme": readme_text,
"licence": "apache-2.0",
"tags": "",
"frontmatter": "",
}
frontmatter_end = -1
for i, line in enumerate(lines[1:], 1):
if line == "---":
frontmatter_end = i
break
if frontmatter_end == -1:
return {
"readme": readme_text,
"licence": "apache-2.0",
"tags": "",
"frontmatter": "",
}
frontmatter = "\n".join(lines[1:frontmatter_end])
content = "\n".join(lines[frontmatter_end + 1 :])
# Extract licence
licence_match = re.search(r"^license:\s*(.+)$", frontmatter, re.MULTILINE)
licence_val = licence_match.group(1).strip().strip('"') if licence_match else "apache-2.0"
# Extract tags
tags = []
in_tags = False
for line in frontmatter.split("\n"):
if line.startswith("tags:"):
in_tags = True
continue
if in_tags:
if line.startswith("- "):
tags.append(line[2:].strip())
elif line and not line.startswith(" "):
break
return {
"readme": content,
"licence": licence_val,
"tags": ",".join(tags),
"frontmatter": frontmatter,
}
def _generate_readme_content(
self,
model_source: ModelSource,
results: dict[QuantisationType, QuantisationResult],
original_content: dict[str, str],
output_repo: str | None = None,
) -> str:
"""Generate complete README content with quantisation details.
Creates the full README including YAML frontmatter, quantisation status
table, and original model information.
Returns:
Complete README markdown content.
"""
# Build tags
our_tags = [
"quantised",
"gguf",
"q3_k_m",
"q3_k_l",
"q3_k_xl",
"q4_k_m",
"q4_k_l",
"q5_k_m",
"q5_k_l",
"q6_k",
"q6_k_l",
"q8_0",
"bartowski-method",
]
original_tags = original_content["tags"].split(",") if original_content["tags"] else []
all_tags = sorted(set(our_tags + original_tags))
# Build frontmatter
frontmatter = f"""---
license: {original_content["licence"]}
library_name: gguf
base_model: {model_source.source_model}
tags:
"""
for tag in all_tags:
if tag.strip():
frontmatter += f"- {tag.strip()}\n"
frontmatter += "---\n\n"
# Build main content
hf_url = f"https://huggingface.co/{model_source.source_model}"
content = f"""# {model_source.original_author}-{model_source.model_name}-GGUF
GGUF quantisations of [{model_source.source_model}]({hf_url}) using
[Bartowski](https://huggingface.co/bartowski)'s method. Created with [llm-gguf-tools](https://git.tomfos.tr/tom/llm-gguf-tools)
which replicates Bartowski's quantisation profiles.
| Variant | Configuration | File Size | Status |
|---|---|---|---|
"""
# Add results table - group by layer config patterns
supported_types = [
QuantisationType.Q3_K_M,
QuantisationType.Q3_K_L,
QuantisationType.Q3_K_XL,
QuantisationType.Q4_K_M,
QuantisationType.Q4_K_L,
QuantisationType.Q5_K_M,
QuantisationType.Q5_K_L,
QuantisationType.Q6_K,
QuantisationType.Q6_K_L,
QuantisationType.Q8_0,
]
for quant_type in supported_types:
result = results.get(quant_type)
if not result:
result = type("Result", (), {"status": "planned", "success": False})()
config = QUANTISATION_CONFIGS.get(quant_type)
file_size = self._format_file_size(result)
status = self._format_status(result, model_source, quant_type, output_repo)
# Get configuration description from the config itself
config_desc = config.get_compact_config(QUANTISATION_CONFIGS) if config else f"{quant_type} all layers"
content += f"| **{quant_type.value}** | {config_desc} | {file_size} | {status} |\n"
content += """
**Key:** `E` = Embeddings, `O` = Output, `A` = Attention, `F` = FFN
See [Bartowski Analysis](https://git.tomfos.tr/tom/llm-gguf-tools/src/branch/main/docs/bartowski_analysis.md)
for detailed quantisation strategies and [Documentation](https://git.tomfos.tr/tom/llm-gguf-tools/src/branch/main/docs/)
for more on the tools and methods I use.
"""
# Add original content
if original_content["readme"]:
content += "## Original Model Card\n\n---\n\n" + original_content["readme"]
else:
content += f"## Original Model\n\nQuantisation of [{model_source.source_model}](https://huggingface.co/{model_source.source_model})."
return frontmatter + content
def _format_file_size(self, result: QuantisationResult) -> str:
"""Format file size for README table.
Returns:
Formatted file size string or dash if not available.
"""
if hasattr(result, "file_size") and result.file_size:
return result.file_size
if hasattr(result, "success") and result.success and hasattr(result, "file_path"):
# Try to get file size from path if available
try:
if result.file_path and Path(result.file_path).exists():
size_bytes = Path(result.file_path).stat().st_size
size_gb = size_bytes / GIBIBYTE
return f"{size_gb:.1f}GB"
except Exception:
pass
return "-"
def _format_status(
self,
result: QuantisationResult,
model_source: ModelSource,
quant_type: QuantisationType,
output_repo: str | None,
) -> str:
"""Format status indicator for README table.
Creates appropriate status indicator based on quantisation state
including progress indicators, file sizes, and download links.
Returns:
Formatted status string for table cell.
"""
status_map = {
"planned": "⏳ Queued",
"processing": "🔄 Processing...",
"uploading": "⬆️ Uploading...",
"failed": "❌ Failed",
}
if hasattr(result, "status") and result.status in status_map:
base_status = status_map[result.status]
if result.status == "uploading" and hasattr(result, "file_size") and result.file_size:
return f"{base_status} ({result.file_size})"
if result.status == "completed" or (hasattr(result, "success") and result.success):
return self._format_success_status(result, model_source, quant_type, output_repo)
return base_status
# Legacy support
if hasattr(result, "success") and result.success:
return self._format_success_status(result, model_source, quant_type, output_repo)
return "❌ Failed"
def _format_success_status(
self,
result: QuantisationResult,
model_source: ModelSource,
quant_type: QuantisationType,
output_repo: str | None,
) -> str:
"""Format successful quantisation status with download link.
Creates a download link if repository information is available,
otherwise shows file size.
Returns:
Formatted success status string.
"""
if not output_repo:
return (
f"{result.file_size}"
if hasattr(result, "file_size") and result.file_size
else "✅ Available"
)
filename = (
f"{model_source.original_author}-{model_source.model_name}-{quant_type.value}.gguf"
)
url = f"https://huggingface.co/{output_repo}?show_file_info={filename}"
if hasattr(result, "file_size") and result.file_size:
return f"[✅ {result.file_size}]({url})"
return f"[✅ Available]({url})"

View file

@ -1,83 +0,0 @@
"""Importance matrix (imatrix) management service.
Manages detection and use of existing importance matrix files for
quantisation guidance. Provides user prompts for supplying pre-computed
imatrix files from external sources.
"""
from __future__ import annotations
from typing import TYPE_CHECKING
from helpers.logger import logger
from helpers.services.filesystem import FilesystemService
if TYPE_CHECKING:
from pathlib import Path
class IMatrixManager:
"""Handles importance matrix file management for quantisation.
Locates existing importance matrix files or prompts users to provide
pre-computed matrices from external sources. These matrices guide
quantisation decisions to preserve model quality.
"""
def __init__(self) -> None:
"""Initialise IMatrixManager."""
self.fs = FilesystemService()
def find_imatrix(self, model_dir: Path) -> Path | None:
"""Find or prompt for importance matrix file.
Searches for existing imatrix files first, then provides interactive
prompts for user-supplied matrices. See docs/imatrix_data.md for
instructions on generating imatrix files.
Returns:
Path to imatrix file, or None if not available.
"""
imatrix_path = model_dir / "imatrix.dat"
# Check for existing imatrix
if imatrix_path.exists():
logger.info(f"Found existing imatrix: {imatrix_path.name}")
return imatrix_path
# Try user-provided imatrix
return self._prompt_for_user_imatrix(model_dir, imatrix_path)
def _prompt_for_user_imatrix(self, model_dir: Path, imatrix_path: Path) -> Path | None:
"""Prompt user for existing imatrix file.
Returns:
Path to user-provided imatrix, or None if not available.
"""
logger.info(f"Model directory: {model_dir}")
logger.info(f"Looking for imatrix file at: {imatrix_path}")
logger.info("\n" + "=" * 70)
logger.info("📊 No existing imatrix file found")
logger.info("\nYou have two options:")
logger.info(" 1. Provide a pre-computed imatrix file")
logger.info(" (💡 see docs/imatrix_data.md to generate your own)")
logger.info(" 2. Skip imatrix usage (lower quality quantisation)")
logger.info("=" * 70)
response = input("\n❓ Do you have an imatrix file to provide? (y/N): ").strip().lower()
if response != "y":
logger.info("Continuing without imatrix (quantisation quality may be lower)")
logger.info(" See docs/imatrix_data.md for instructions on generating imatrix files") # noqa: RUF001
return None
logger.info(f"\nPlease place your imatrix.dat file in: {model_dir}")
input("⏳ Press Enter when you've placed the file (or Ctrl+C to cancel)...")
if imatrix_path.exists():
file_size = self.fs.get_file_size(imatrix_path)
logger.info(f"✅ Found imatrix file! ({file_size})")
return imatrix_path
logger.warning("No imatrix.dat file found - continuing without imatrix")
return None

View file

@ -1,756 +0,0 @@
"""Python API wrapper for llama-cpp-python quantisation operations.
Provides high-level Python interfaces for model quantisation using llama-cpp-python
bindings. Implements partial tensor-specific quantisation support through embedding
and output tensor type configuration.
"""
from __future__ import annotations
import ctypes
import gc
import logging
import os
import signal
import sys
import traceback
from typing import TYPE_CHECKING, Any, ClassVar, Never
import psutil
from helpers.logger import logger
from helpers.services.gguf import GGUFConverter
from helpers.utils.config_parser import ConfigParser
from helpers.utils.tensor_mapping import TensorMapper
if TYPE_CHECKING:
from pathlib import Path
from helpers.models.quantisation import QuantisationConfig
# Import llama_cpp when needed
try:
import llama_cpp
from llama_cpp import llama_model_quantize_params
LLAMA_CPP_AVAILABLE = True
except ImportError:
LLAMA_CPP_AVAILABLE = False
logger.warning("llama-cpp-python not available - falling back to binary mode")
class LlamaCppPythonAPI:
"""Python API wrapper for llama.cpp quantisation operations.
Provides direct Python access to quantisation functionality using llama-cpp-python
bindings. Implements partial tensor-specific quantisation through token embedding
and output tensor type configuration, which provides differentiation between
Q4_K variants even without full per-layer tensor control.
"""
# Mapping of custom variant prefixes to their base types
VARIANT_BASE_MAPPING: ClassVar[dict[str, str]] = {
"Q3_K_": "Q3_K_M",
"Q4_K_": "Q4_K_M",
"Q5_K_": "Q5_K_M",
"Q6_K_": "Q6_K",
}
@staticmethod
def is_available() -> bool:
"""Check if llama-cpp-python is available for use.
Returns:
True if llama-cpp-python bindings are installed and functional.
"""
return LLAMA_CPP_AVAILABLE
@staticmethod
def get_quantisation_type(config_name: str) -> int:
"""Map configuration name to llama_cpp quantisation type constant.
Supports a wide range of quantisation types from Q2 to Q8, including
K-quants and legacy formats. Handles both simple formats (Q4_K_M, Q6_K)
and custom suffixed variants (Q4_K_M_L, Q5_K_M_XL) by mapping them to
their base types for llama-cpp-python compatibility.
Returns:
llama_cpp quantisation type constant for base quantisation.
Raises:
RuntimeError: If llama-cpp-python is not available.
ValueError: If the quantisation type is not supported.
"""
if not LLAMA_CPP_AVAILABLE:
msg = "llama-cpp-python not available"
raise RuntimeError(msg)
# Normalise the config name to extract base type
# E.g., "Q4_K_L" or "Q4_K_XL" -> "Q4_K_M" (default for Q4_K)
# E.g., "Q4_K_M_XXL" -> "Q4_K_M"
config_upper = config_name.upper()
# Direct mapping for exact matches
type_mapping = {
# Q2 variants (not recommended but supported)
"Q2_K": llama_cpp.LLAMA_FTYPE_MOSTLY_Q2_K,
"Q2_K_S": llama_cpp.LLAMA_FTYPE_MOSTLY_Q2_K_S,
# Q3 K-quants
"Q3_K_S": llama_cpp.LLAMA_FTYPE_MOSTLY_Q3_K_S,
"Q3_K_M": llama_cpp.LLAMA_FTYPE_MOSTLY_Q3_K_M,
# Q4 K-quants (most common)
"Q4_K_S": llama_cpp.LLAMA_FTYPE_MOSTLY_Q4_K_S,
"Q4_K_M": llama_cpp.LLAMA_FTYPE_MOSTLY_Q4_K_M,
# Q5 K-quants
"Q5_K_S": llama_cpp.LLAMA_FTYPE_MOSTLY_Q5_K_S,
"Q5_K_M": llama_cpp.LLAMA_FTYPE_MOSTLY_Q5_K_M,
# Q6_K (single variant)
"Q6_K": llama_cpp.LLAMA_FTYPE_MOSTLY_Q6_K,
# Q8_0 (highest common quantisation)
"Q8_0": llama_cpp.LLAMA_FTYPE_MOSTLY_Q8_0,
# Legacy quantisation formats
"Q4_0": llama_cpp.LLAMA_FTYPE_MOSTLY_Q4_0,
"Q4_1": llama_cpp.LLAMA_FTYPE_MOSTLY_Q4_1,
"Q5_0": llama_cpp.LLAMA_FTYPE_MOSTLY_Q5_0,
"Q5_1": llama_cpp.LLAMA_FTYPE_MOSTLY_Q5_1,
# IQ (Integer Quantisation) variants - experimental
"IQ2_XXS": llama_cpp.LLAMA_FTYPE_MOSTLY_IQ2_XXS,
"IQ2_XS": llama_cpp.LLAMA_FTYPE_MOSTLY_IQ2_XS,
"IQ2_S": llama_cpp.LLAMA_FTYPE_MOSTLY_IQ2_S,
"IQ2_M": llama_cpp.LLAMA_FTYPE_MOSTLY_IQ2_M,
"IQ3_XXS": llama_cpp.LLAMA_FTYPE_MOSTLY_IQ3_XXS,
"IQ3_XS": llama_cpp.LLAMA_FTYPE_MOSTLY_IQ3_XS,
"IQ3_S": llama_cpp.LLAMA_FTYPE_MOSTLY_IQ3_S,
"IQ3_M": llama_cpp.LLAMA_FTYPE_MOSTLY_IQ3_M,
"IQ4_NL": llama_cpp.LLAMA_FTYPE_MOSTLY_IQ4_NL,
"IQ4_XS": llama_cpp.LLAMA_FTYPE_MOSTLY_IQ4_XS,
# Higher precision formats
"F16": llama_cpp.LLAMA_FTYPE_MOSTLY_F16,
"BF16": llama_cpp.LLAMA_FTYPE_MOSTLY_BF16,
}
# Try direct lookup first
if config_upper in type_mapping:
return type_mapping[config_upper]
# Handle custom variants using base mapping
for prefix, base_type in LlamaCppPythonAPI.VARIANT_BASE_MAPPING.items():
if config_upper.startswith(prefix) and config_upper not in type_mapping:
return type_mapping[base_type]
# If not found, raise an informative error
supported = sorted(type_mapping.keys())
msg = (
f"Unsupported quantisation type: {config_name}\n"
f"Supported types: {', '.join(supported)}\n"
f"Custom variants like Q4_K_L, Q4_K_XL are also supported."
)
raise ValueError(msg)
@staticmethod
def get_tensor_type_value(type_name: str) -> int:
"""Convert tensor type name to llama_cpp constant.
Maps string tensor type names to their corresponding llama_cpp integer
constants for tensor-specific overrides. Provides the foundation for
differentiated quantisation strategies across embedding and output layers.
Returns:
Integer value for the tensor type, or 0 if not found.
"""
if not LLAMA_CPP_AVAILABLE:
return 0
# Build mapping with variant consolidation
# All Q3_K variants map to base Q3_K type, same for Q4_K and Q5_K
type_mapping = LlamaCppPythonAPI._build_tensor_type_mapping()
return type_mapping.get(type_name.upper(), 0)
@staticmethod
def _build_tensor_type_mapping() -> dict[str, int]:
"""Build tensor type mapping with variant consolidation.
Returns:
Dictionary mapping type names to GGML constants.
"""
if not LLAMA_CPP_AVAILABLE:
return {}
# Base mappings
return {
# Q2 variants
"Q2_K": llama_cpp.GGML_TYPE_Q2_K,
# Q3 variants - all map to base Q3_K
"Q3_K": llama_cpp.GGML_TYPE_Q3_K,
"Q3_K_S": llama_cpp.GGML_TYPE_Q3_K,
"Q3_K_M": llama_cpp.GGML_TYPE_Q3_K,
"Q3_K_L": llama_cpp.GGML_TYPE_Q3_K,
# Q4 variants
"Q4_0": llama_cpp.GGML_TYPE_Q4_0,
"Q4_1": llama_cpp.GGML_TYPE_Q4_1,
"Q4_K": llama_cpp.GGML_TYPE_Q4_K,
"Q4_K_S": llama_cpp.GGML_TYPE_Q4_K,
"Q4_K_M": llama_cpp.GGML_TYPE_Q4_K,
# Q5 variants
"Q5_0": llama_cpp.GGML_TYPE_Q5_0,
"Q5_1": llama_cpp.GGML_TYPE_Q5_1,
"Q5_K": llama_cpp.GGML_TYPE_Q5_K,
"Q5_K_S": llama_cpp.GGML_TYPE_Q5_K,
"Q5_K_M": llama_cpp.GGML_TYPE_Q5_K,
# Q6 variant
"Q6_K": llama_cpp.GGML_TYPE_Q6_K,
# Q8 variant
"Q8_0": llama_cpp.GGML_TYPE_Q8_0,
# Higher precision
"F16": llama_cpp.GGML_TYPE_F16,
"F32": llama_cpp.GGML_TYPE_F32,
}
def quantise_model_flexible(
self,
input_path: Path,
output_path: Path,
base_type: str,
embedding_type: str | None = None,
output_type: str | None = None,
imatrix_path: Path | None = None,
) -> bool:
"""Quantise model with flexible tensor type configuration.
Provides control over base quantisation type with optional overrides for
embeddings and output layers, which are the only tensor-specific controls
that work reliably with llama-cpp-python.
Args:
input_path: Path to input GGUF model.
output_path: Path for output quantised model.
base_type: Base quantisation type (e.g., "Q4_K_M", "Q6_K").
embedding_type: Override for token embeddings (None = use base).
output_type: Override for output/lm_head layers (None = use base).
imatrix_path: Optional importance matrix file.
Returns:
True if quantisation successful, False otherwise.
Examples:
# Q4_K_L: Q4_K_M base with Q8_0 embeddings
api.quantise_model_flexible(
input_path, output_path, "Q4_K_M",
embedding_type="Q8_0"
)
# Q3_K_L: Q3_K_M base with Q5_K output
api.quantise_model_flexible(
input_path, output_path, "Q3_K_M",
output_type="Q5_K"
)
# Q3_K_XL: Q3_K_M with both Q8_0 embeddings and Q5_K output
api.quantise_model_flexible(
input_path, output_path, "Q3_K_M",
embedding_type="Q8_0",
output_type="Q5_K"
)
Raises:
RuntimeError: If llama-cpp-python is not available.
"""
if not LLAMA_CPP_AVAILABLE:
msg = "llama-cpp-python not available for quantisation"
raise RuntimeError(msg)
logger.info(f"🔄 Flexible quantisation: {base_type} base")
logger.info(f"📝 Input: {input_path}")
logger.info(f"📝 Output: {output_path}")
# Setup phase - create and configure parameters
params = self._create_params(base_type, imatrix_path)
self._apply_tensor_overrides(params, embedding_type, output_type)
# Execution phase - perform quantisation
try:
logger.debug("DEBUG: Starting flexible quantisation execution")
result = self._do_quantisation(input_path, output_path, params)
logger.debug(f"DEBUG: Flexible quantisation returned: {result}")
except Exception as e:
logger.error(f"❌ Flexible quantisation failed with exception: {e}")
logger.error("Flexible quantisation traceback:")
for line in traceback.format_exc().splitlines():
logger.error(f" {line}")
return False
else:
if result == 0:
# Verify output file was created and is valid
if not output_path.exists():
logger.error(
f"❌ Quantisation claimed success but output does not exist: {output_path}"
)
return False
try:
output_size = output_path.stat().st_size
logger.debug(f"DEBUG: Output file size: {output_size / (1024**3):.2f} GB")
if output_size == 0:
logger.error("❌ Output file is empty despite success code")
return False
except Exception as e:
logger.warning(f"⚠️ Could not check output file size: {e}")
logger.info(f"✅ Quantisation successful: {output_path.name}")
return True
logger.error(f"❌ Quantisation failed with code: {result}")
return False
def _create_params(
self, base_type: str, imatrix_path: Path | None
) -> llama_model_quantize_params:
"""Create quantisation parameters.
Returns:
Configured quantisation parameters.
"""
params = llama_model_quantize_params()
params.ftype = self.get_quantisation_type(base_type)
params.nthread = 8
params.allow_requantize = True
if imatrix_path and imatrix_path.exists():
# Convert path to bytes and create c_char_p, then cast to c_void_p
imatrix_bytes = str(imatrix_path).encode("utf-8")
char_p = ctypes.c_char_p(imatrix_bytes)
params.imatrix = ctypes.cast(char_p, ctypes.c_void_p)
logger.info(f"🧮 Using imatrix: {imatrix_path.name}")
return params
def _apply_tensor_overrides(
self,
params: llama_model_quantize_params,
embedding_type: str | None,
output_type: str | None,
) -> None:
"""Apply embedding and output tensor type overrides to params.
These are the only tensor-specific controls that work reliably
with llama-cpp-python.
"""
# Apply embedding override if specified
if embedding_type:
params.token_embedding_type = self.get_tensor_type_value(embedding_type)
logger.info(f"⚙️ Token embedding type: {embedding_type}")
# Apply output override if specified
if output_type:
params.output_tensor_type = self.get_tensor_type_value(output_type)
params.quantize_output_tensor = True
logger.info(f"⚙️ Output tensor type: {output_type}")
def _do_quantisation(
self,
input_path: Path,
output_path: Path,
params: llama_model_quantize_params,
) -> int:
"""Perform the quantisation operation.
Returns:
Return code (0 for success).
Raises:
KeyboardInterrupt: If the user interrupts the quantisation process.
SystemExit: If the system exits during quantisation.
"""
logger.debug("DEBUG: Calling llama_cpp.llama_model_quantize")
try:
# Flush any pending output before calling C library
sys.stdout.flush()
sys.stderr.flush()
# Temporarily redirect stderr to prevent terminal control issues
# Some GGUF models output control sequences that can break the terminal
old_stderr_fd = None
devnull_fd = None
try:
# Only redirect if not in debug mode to preserve error messages
if not logger.isEnabledFor(logging.DEBUG):
old_stderr_fd = os.dup(2) # Save current stderr
devnull_fd = os.open(os.devnull, os.O_WRONLY)
os.dup2(devnull_fd, 2) # Redirect stderr to /dev/null
# Call the quantization with proper exception handling
result = llama_cpp.llama_model_quantize(
str(input_path).encode("utf-8"), str(output_path).encode("utf-8"), params
)
finally:
# Restore stderr if we redirected it
if old_stderr_fd is not None:
os.dup2(old_stderr_fd, 2)
os.close(old_stderr_fd)
if devnull_fd is not None:
os.close(devnull_fd)
# Flush output after the call
sys.stdout.flush()
sys.stderr.flush()
except KeyboardInterrupt:
logger.error("❌ Quantisation interrupted by user")
raise
except SystemExit as e:
logger.error(f"❌ System exit during quantisation: {e}")
raise
except Exception as e:
logger.error(f"❌ llama_model_quantize call failed: {e}")
logger.error("llama_model_quantize call traceback:")
for line in traceback.format_exc().splitlines():
logger.error(f" {line}")
raise
else:
logger.debug(f"DEBUG: llama_model_quantize completed with code: {result}")
return result
def quantise_model(
self,
input_path: Path,
output_path: Path,
config: QuantisationConfig,
imatrix_path: Path | None = None,
) -> bool:
"""Quantise model using Python API.
Performs quantisation using llama-cpp-python's direct API access with
support for embedding and output tensor type overrides. The L and XL
variants use a base type with specific overrides.
Returns:
True if quantisation successful, False otherwise.
Raises:
RuntimeError: If llama-cpp-python is not available.
"""
if not LLAMA_CPP_AVAILABLE:
msg = "llama-cpp-python not available for quantisation"
raise RuntimeError(msg)
# Force cleanup before starting
gc.collect()
# Log initial resource state
mem_before = self._log_resource_state("before")
try:
# Validate input
if not self._validate_input_file(input_path):
return False
# Setup parameters
params = self._setup_quantisation_params(config, imatrix_path)
if params is None:
return False
# Execute quantisation
result = self._execute_quantisation(input_path, output_path, params)
# Verify and finalize
if result == 0:
return self._finalize_successful_quantisation(output_path, mem_before)
logger.error(f"❌ Quantisation failed with code: {result}")
except Exception as e:
logger.error(f"❌ Quantisation failed with exception: {e}")
logger.error("Full quantisation traceback:")
for line in traceback.format_exc().splitlines():
logger.error(f" {line}")
# Garbage collect and return false
gc.collect()
return False
def _log_resource_state(self, phase: str) -> float:
"""Log current resource usage state.
Args:
phase: Description of current phase (e.g., "before", "after").
Returns:
Current memory usage in GB.
"""
process = psutil.Process()
memory_gb = process.memory_info().rss / (1024**3)
logger.debug(f"DEBUG: Memory {phase} quantisation: {memory_gb:.2f} GB")
logger.debug(f"DEBUG: Open file descriptors: {len(process.open_files())}")
if phase == "before":
logger.debug(f"DEBUG: Process PID: {process.pid}")
return memory_gb
def _validate_input_file(self, input_path: Path) -> bool:
"""Validate input file exists and is readable.
Args:
input_path: Path to input file.
Returns:
True if file is valid, False otherwise.
"""
logger.debug(f"DEBUG: Starting quantisation of {input_path.name}")
logger.info(f"🔄 Quantising {input_path.name}...")
logger.debug(f"DEBUG: Input: {input_path}")
if not input_path.exists():
logger.error(f"❌ Input file does not exist: {input_path}")
return False
if not input_path.is_file():
logger.error(f"❌ Input path is not a file: {input_path}")
return False
try:
input_size = input_path.stat().st_size
logger.debug(f"DEBUG: Input file size: {input_size / (1024**3):.2f} GB")
if input_size == 0:
logger.error("❌ Input file is empty")
return False
except Exception as e:
logger.warning(f"⚠️ Could not check input file size: {e}")
return True
def _setup_quantisation_params(
self,
config: QuantisationConfig,
imatrix_path: Path | None,
) -> llama_model_quantize_params | None:
"""Setup quantisation parameters.
Args:
config: Quantisation configuration.
imatrix_path: Optional path to importance matrix.
Returns:
Configured parameters or None if setup failed.
"""
logger.debug("DEBUG: Setting up quantisation parameters")
params = llama_model_quantize_params()
# Set base quantisation type
try:
params.ftype = self.get_quantisation_type(config.base_type)
logger.debug(
f"DEBUG: Set ftype to {params.ftype} for {config.base_type} (config: {config.name})"
)
except Exception as e:
logger.error(f"❌ Failed to get quantisation type for {config.name}: {e}")
return None
# Configure basic parameters
params.nthread = 8
params.allow_requantize = True
logger.debug(
f"DEBUG: Set nthread={params.nthread}, allow_requantize={params.allow_requantize}"
)
# Add imatrix if available
if imatrix_path and imatrix_path.exists():
try:
# Convert path to bytes and create c_char_p, then cast to c_void_p
imatrix_bytes = str(imatrix_path).encode("utf-8")
char_p = ctypes.c_char_p(imatrix_bytes)
params.imatrix = ctypes.cast(char_p, ctypes.c_void_p)
logger.info(f"🧮 Using imatrix: {imatrix_path.name}")
logger.debug(f"DEBUG: imatrix path set: {imatrix_path}")
except Exception as e:
logger.error(f"❌ Failed to set imatrix: {e}")
# Continue without imatrix
# Configure tensor-specific types
logger.debug("DEBUG: Configuring tensor-specific types")
try:
self._configure_tensor_types(params, config)
logger.debug("DEBUG: Tensor types configured successfully")
except Exception as e:
logger.error(f"❌ Failed to configure tensor types: {e}")
logger.error("Tensor type configuration traceback:")
for line in traceback.format_exc().splitlines():
logger.error(f" {line}")
# Continue with default types
return params
def _execute_quantisation(
self,
input_path: Path,
output_path: Path,
params: llama_model_quantize_params,
) -> int:
"""Execute the actual quantisation with signal handling.
Args:
input_path: Path to input model.
output_path: Path for output model.
params: Configured quantisation parameters.
Returns:
Return code from quantisation (0 for success).
"""
logger.debug("DEBUG: Starting llama_cpp.llama_model_quantize call")
logger.debug("DEBUG: About to call llama_model_quantize...")
# Setup signal handlers
old_handlers = self._setup_signal_handlers()
try:
result = llama_cpp.llama_model_quantize(
str(input_path).encode("utf-8"), str(output_path).encode("utf-8"), params
)
logger.debug(f"DEBUG: llama_model_quantize returned: {result}")
except Exception as e:
logger.error(f"❌ llama_model_quantize raised exception: {e}")
logger.error("llama_model_quantize traceback:")
for line in traceback.format_exc().splitlines():
logger.error(f" {line}")
return -1
else:
return result
finally:
self._restore_signal_handlers(old_handlers)
def _setup_signal_handlers(self) -> tuple[Any, Any | None]:
"""Setup signal handlers for debugging termination.
Returns:
Tuple of (old_sigterm, old_sigsegv) handlers.
"""
def signal_debug_handler(signum: int, frame: object) -> Never: # noqa: ARG001
logger.error(f"DEBUG: Received signal {signum} during quantisation!")
logger.error(f"DEBUG: Signal name: {signal.Signals(signum).name}")
msg = f"Signal {signum} received"
raise KeyboardInterrupt(msg)
old_sigterm = signal.signal(signal.SIGTERM, signal_debug_handler)
old_sigsegv = (
signal.signal(signal.SIGSEGV, signal_debug_handler)
if hasattr(signal, "SIGSEGV")
else None
)
return old_sigterm, old_sigsegv
def _restore_signal_handlers(self, handlers: tuple[Any, Any | None]) -> None:
"""Restore original signal handlers.
Args:
handlers: Tuple of (old_sigterm, old_sigsegv) handlers.
"""
old_sigterm, old_sigsegv = handlers
signal.signal(signal.SIGTERM, old_sigterm)
if old_sigsegv is not None:
signal.signal(signal.SIGSEGV, old_sigsegv)
def _finalize_successful_quantisation(
self,
output_path: Path,
mem_before: float,
) -> bool:
"""Finalize successful quantisation and verify output.
Args:
output_path: Path to output file.
mem_before: Memory usage before quantisation in GB.
Returns:
True if output is valid, False otherwise.
"""
logger.debug("DEBUG: Quantisation returned success code")
# Verify output exists
if not output_path.exists():
logger.error(
f"❌ Quantisation claimed success but output does not exist: {output_path}"
)
return False
# Verify output size
output_size = output_path.stat().st_size
logger.debug(f"DEBUG: Output file size: {output_size / (1024**3):.2f} GB")
if output_size == 0:
logger.error("❌ Output file is empty despite success code")
return False
logger.info(f"✅ Quantisation successful: {output_path.name}")
# Force cleanup and log final state
gc.collect()
mem_after = self._log_resource_state("after")
logger.debug(f"DEBUG: Memory delta: {mem_after - mem_before:+.2f} GB")
return True
def _configure_tensor_types(
self, params: llama_model_quantize_params, config: QuantisationConfig
) -> None:
"""Configure tensor-specific quantisation types.
Sets embedding and output tensor type overrides based on config.
These are the only tensor-specific controls that work reliably
with llama-cpp-python.
"""
logger.debug(f"DEBUG: _configure_tensor_types called for {config.name}")
# Apply embedding override if specified
if config.embedding_type:
params.token_embedding_type = self.get_tensor_type_value(config.embedding_type)
logger.info(f"⚙️ Token embedding type: {config.embedding_type}")
# Apply output override if specified
if config.output_type:
params.output_tensor_type = self.get_tensor_type_value(config.output_type)
params.quantize_output_tensor = True
logger.info(f"⚙️ Output tensor type: {config.output_type}")
def convert_hf_to_gguf(
self, input_dir: Path, output_path: Path, output_type: str = "f16"
) -> bool:
"""Convert HuggingFace model to GGUF format using native Python converter.
Uses our GGUFConverter for SafeTensors models, providing full Python-based
conversion without external dependencies.
Returns:
True if conversion successful, False otherwise.
"""
logger.info(f"🔄 Converting {input_dir.name} to GGUF format...")
logger.info(f"📝 Input: {input_dir}")
logger.info(f"📝 Output: {output_path}")
logger.info(f"📝 Type: {output_type}")
# Check for SafeTensors files
safetensor_files = list(input_dir.glob("*.safetensors"))
if not safetensor_files:
logger.warning("⚠️ No SafeTensors files found in model directory")
return False
try:
# Load model configuration
config_parser = ConfigParser()
model_config = config_parser.load_model_config(input_dir)
# Get architecture mapping
arch_name = model_config.architectures[0] if model_config.architectures else "llama"
arch = config_parser.get_architecture_mapping(arch_name)
if arch != arch_name:
logger.info(f"📝 Architecture mapping: {arch_name}{arch}")
# Convert using GGUFConverter
tensor_mapper = TensorMapper()
success = GGUFConverter.convert_safetensors(
input_dir, output_path, model_config, arch, tensor_mapper
)
except Exception as e:
logger.error(f"❌ Conversion failed with exception: {e}")
return False
else:
if success:
logger.info("✅ Native Python conversion successful")
return success

View file

@ -1,618 +0,0 @@
"""Quantisation orchestration service.
High-level orchestration of the complete quantisation workflow from model
acquisition through processing to upload. Manages parallel processing,
status tracking, and cleanup operations for efficient resource utilisation.
"""
from __future__ import annotations
import gc
import signal
import sys
import traceback
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass, field
from pathlib import Path
from typing import TYPE_CHECKING
import psutil
from helpers.config.quantisation_configs import (
DEFAULT_QUANTISATION_TYPES,
QUANTISATION_CONFIGS,
SUPPORTED_QUANTISATION_TYPES,
)
from helpers.logger import logger
from helpers.models.quantisation import (
ModelSource,
QuantisationContext,
QuantisationResult,
QuantisationType,
)
from helpers.services.huggingface import ReadmeGenerator
from helpers.services.llama_cpp import IMatrixManager
from helpers.services.quantisation import HuggingFaceUploader, ModelManager, QuantisationEngine
from helpers.utils.tensor_mapping import URLParser
if TYPE_CHECKING:
from types import FrameType
@dataclass(slots=True)
class QuantisationOrchestrator:
"""Orchestrates the complete quantisation workflow.
Uses dataclass with slots for efficient memory usage and dependency injection
for modular service interaction following SOLID principles.
"""
work_dir: Path = field(default_factory=lambda: Path.cwd() / "quantisation_work")
use_imatrix: bool = True
no_upload: bool = False
custom_profiles: list[str] | None = None
# Service dependencies with factory defaults
url_parser: URLParser = field(default_factory=URLParser)
quantisation_engine: QuantisationEngine = field(default_factory=QuantisationEngine)
imatrix_manager: IMatrixManager = field(default_factory=IMatrixManager)
readme_generator: ReadmeGenerator = field(default_factory=ReadmeGenerator)
uploader: HuggingFaceUploader = field(default_factory=HuggingFaceUploader)
# Computed properties
models_dir: Path = field(init=False)
model_manager: ModelManager = field(init=False)
def __post_init__(self) -> None:
"""Initialise computed properties after dataclass construction."""
self.models_dir = self.work_dir / "models"
self.model_manager = ModelManager(self.models_dir)
# Set up signal handlers for graceful exit tracking
self._setup_signal_handlers()
def _setup_signal_handlers(self) -> None:
"""Set up signal handlers to catch unexpected exits."""
def signal_handler(signum: int, frame: FrameType | None) -> None:
logger.error(f"❌ Received signal {signum} ({signal.Signals(signum).name})")
logger.error("Stack trace at signal:")
if frame:
for line in traceback.format_stack(frame):
logger.error(f" {line.strip()}")
logger.error("Exiting due to signal")
sys.exit(1)
# Handle common termination signals
for sig in [signal.SIGINT, signal.SIGTERM]:
signal.signal(sig, signal_handler)
def get_quantisation_types(self) -> list[QuantisationType]:
"""Get the quantisation types to use for this run.
Returns:
List of QuantisationType enums to process.
"""
if self.custom_profiles:
# Parse custom profiles from strings to QuantisationType
result = []
for profile_str in self.custom_profiles:
try:
profile = QuantisationType(profile_str.upper())
if profile in SUPPORTED_QUANTISATION_TYPES:
result.append(profile)
else:
logger.warning(f"Profile {profile_str} is not supported, skipping")
except ValueError:
logger.warning(f"Invalid profile {profile_str}, skipping")
return result or DEFAULT_QUANTISATION_TYPES
return DEFAULT_QUANTISATION_TYPES
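# Example (sketch): custom_profiles=["q4_k_m", "Q6_K", "bogus"] would resolve to
# [QuantisationType.Q4_K_M, QuantisationType.Q6_K] (assuming both are supported);
# the invalid entry is logged and skipped, and an empty result falls back to
# DEFAULT_QUANTISATION_TYPES.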
def quantise(self, url: str) -> dict[QuantisationType, QuantisationResult]:
"""Main quantisation workflow orchestrating model processing from URL to upload.
Returns:
dict[QuantisationType, QuantisationResult]: Quantisation results for each type.
Raises:
KeyboardInterrupt: If the user interrupts the quantisation process.
"""
logger.info("Starting Bartowski quantisation process...")
logger.debug(f"DEBUG: Input URL: {url}")
logger.debug(f"DEBUG: Working directory: {self.work_dir}")
logger.debug(f"DEBUG: Use imatrix: {self.use_imatrix}")
logger.debug(f"DEBUG: No upload: {self.no_upload}")
logger.debug(f"DEBUG: Custom profiles: {self.custom_profiles}")
try:
# Setup and preparation
logger.debug("DEBUG: Starting environment setup...")
model_source, f16_model_path, imatrix_path, output_repo = self._setup_environment(url)
logger.debug(f"DEBUG: Environment setup complete. F16 model: {f16_model_path}")
# Create initial repository
logger.debug("DEBUG: Creating initial repository...")
self._create_initial_repository(model_source, output_repo)
logger.debug("DEBUG: Initial repository created")
# Execute all quantisations
logger.debug("DEBUG: Starting quantisation execution...")
results = self._execute_quantisations(
model_source, f16_model_path, imatrix_path, output_repo
)
logger.debug(f"DEBUG: Quantisation execution complete. Results: {len(results)} items")
# Cleanup
logger.debug("DEBUG: Starting cleanup...")
self._cleanup_files(f16_model_path, model_source)
logger.debug("DEBUG: Cleanup complete")
self._print_completion_summary(model_source, results, output_repo)
except KeyboardInterrupt:
logger.error("❌ Process interrupted by user (Ctrl+C)")
raise
except Exception as e:
logger.error(f"❌ Critical error in quantisation workflow: {e}")
logger.error("Full traceback:")
for line in traceback.format_exc().splitlines():
logger.error(f" {line}")
raise
else:
return results
def _setup_environment(self, url: str) -> tuple[ModelSource, Path, Path | None, str]:
"""Setup environment and prepare model for quantisation.
Returns:
Tuple of (model_source, f16_model_path, imatrix_path, output_repo).
"""
model_source = self.url_parser.parse(url)
self._print_model_info(model_source)
self.models_dir.mkdir(parents=True, exist_ok=True)
f16_model_path = self.model_manager.prepare_model(model_source)
imatrix_path = None
if self.use_imatrix:
logger.info("Checking for importance matrix (imatrix)...")
imatrix_path = self.imatrix_manager.find_imatrix(
self.models_dir / model_source.model_name
)
output_repo = (
f"{self.uploader.get_username()}/"
f"{model_source.original_author}-{model_source.model_name}-GGUF"
)
return model_source, f16_model_path, imatrix_path, output_repo
def _create_initial_repository(self, model_source: ModelSource, output_repo: str) -> None:
"""Create initial repository with planned quantisations."""
logger.info("Creating initial README with planned quantisations...")
quantisation_types = self.get_quantisation_types()
planned_results = {
qt: QuantisationResult(quantisation_type=qt, success=False, status="planned")
for qt in quantisation_types
}
readme_path = self.readme_generator.generate(
model_source, planned_results, self.models_dir, output_repo
)
if not self.no_upload:
logger.info("Creating repository with planned quantisations...")
self.uploader.upload_readme(output_repo, readme_path)
else:
logger.info("Skipping repository creation (--no-upload specified)")
def _execute_quantisations(
self,
model_source: ModelSource,
f16_model_path: Path,
imatrix_path: Path | None,
output_repo: str,
) -> dict[QuantisationType, QuantisationResult]:
"""Execute all quantisation types with parallel uploads.
Returns:
dict[QuantisationType, QuantisationResult]: Quantisation results for each type.
"""
results: dict[QuantisationType, QuantisationResult] = {}
quantisation_types = self.get_quantisation_types()
types_list = [qt.value for qt in quantisation_types]
logger.info(f"Processing {len(quantisation_types)} quantisation types: {types_list}")
# Process with parallel uploads - quantise sequentially but upload in background
upload_futures = []
with ThreadPoolExecutor(max_workers=2, thread_name_prefix="upload") as upload_executor:
for i, quant_type in enumerate(quantisation_types, 1):
logger.info(
f"Processing quantisation {i}/{len(quantisation_types)}: {quant_type.value}"
)
logger.debug(f"DEBUG: Starting quantisation {i}/{len(quantisation_types)}")
logger.debug(f"DEBUG: Current type: {quant_type.value}")
logger.debug(f"DEBUG: Results so far: {len(results)} completed")
try:
result = self._process_single_quantisation(
quant_type,
model_source,
f16_model_path,
imatrix_path,
output_repo,
results,
upload_executor,
upload_futures,
)
results[quant_type] = result
logger.debug(f"DEBUG: Quantisation {quant_type.value} completed")
# Force cleanup between quantisations
gc.collect()
logger.debug("DEBUG: Garbage collection completed")
except Exception as e:
logger.error(f"❌ Critical error processing {quant_type.value}: {e}")
logger.error("Exception traceback:")
for line in traceback.format_exc().splitlines():
logger.error(f" {line}")
results[quant_type] = QuantisationResult(
quantisation_type=quant_type,
success=False,
status="failed",
error_message=str(e),
)
# Force cleanup after error
gc.collect()
# Wait for all uploads to complete before returning
self._wait_for_uploads(upload_futures)
return results
def _process_single_quantisation(
self,
quant_type: QuantisationType,
model_source: ModelSource,
f16_model_path: Path,
imatrix_path: Path | None,
output_repo: str,
results: dict[QuantisationType, QuantisationResult],
upload_executor: ThreadPoolExecutor,
upload_futures: list,
) -> QuantisationResult:
"""Process a single quantisation type.
Returns:
QuantisationResult: Result of the quantisation attempt.
"""
try:
logger.info(f"Starting {quant_type.value} quantisation...")
logger.debug(f"DEBUG: Getting config for {quant_type.value}")
config = QUANTISATION_CONFIGS[quant_type]
logger.debug(f"DEBUG: Config loaded: {config.name}")
# Update status to processing
logger.debug("DEBUG: Creating initial quantisation result")
result = QuantisationResult(quantisation_type=quant_type, success=False)
result.status = "processing"
results[quant_type] = result
logger.debug("DEBUG: Updating README status")
self._update_readme_status(model_source, results, output_repo)
# Perform quantisation
logger.debug("DEBUG: Creating quantisation context")
context = QuantisationContext(
f16_model_path=f16_model_path,
model_source=model_source,
config=config,
models_dir=self.models_dir,
imatrix_path=imatrix_path,
)
logger.debug(f"DEBUG: Context created. F16 path: {f16_model_path}")
logger.debug(f"DEBUG: imatrix path: {imatrix_path}")
logger.debug("DEBUG: Calling quantisation engine...")
result = self.quantisation_engine.quantise(context)
logger.debug(f"DEBUG: Quantisation engine returned: success={result.success}")
self._handle_quantisation_result(
result,
quant_type,
model_source,
results,
output_repo,
upload_executor,
upload_futures,
)
except Exception as e:
return self._handle_quantisation_error(
e, quant_type, model_source, results, output_repo
)
else:
return result
def _process_single_quantisation_sequential(
self,
quant_type: QuantisationType,
model_source: ModelSource,
f16_model_path: Path,
imatrix_path: Path | None,
output_repo: str,
results: dict[QuantisationType, QuantisationResult],
) -> QuantisationResult:
"""Process a single quantisation type sequentially with immediate upload.
Returns:
QuantisationResult: Result of the quantisation attempt.
"""
# Force cleanup before starting new quantisation
gc.collect()
# Log system state before quantisation
process = psutil.Process()
logger.debug(f"DEBUG: === System state before {quant_type.value} ===")
logger.debug(f"DEBUG: Process alive: {process.is_running()}")
logger.debug(f"DEBUG: PID: {process.pid}")
logger.debug(f"DEBUG: Memory: {process.memory_info().rss / (1024**3):.2f} GB")
logger.debug(f"DEBUG: CPU percent: {process.cpu_percent()}%")
logger.debug(f"DEBUG: Threads: {process.num_threads()}")
logger.debug(f"DEBUG: Open files: {len(process.open_files())}")
try:
logger.info(f"Starting {quant_type.value} quantisation...")
logger.debug(f"DEBUG: Getting config for {quant_type.value}")
config = QUANTISATION_CONFIGS[quant_type]
logger.debug(f"DEBUG: Config loaded: {config.name}")
# Update status to processing
logger.debug("DEBUG: Creating initial quantisation result")
result = QuantisationResult(quantisation_type=quant_type, success=False)
result.status = "processing"
results[quant_type] = result
logger.debug("DEBUG: Updating README status")
self._update_readme_status(model_source, results, output_repo)
# Perform quantisation
logger.debug("DEBUG: Creating quantisation context")
context = QuantisationContext(
f16_model_path=f16_model_path,
model_source=model_source,
config=config,
models_dir=self.models_dir,
imatrix_path=imatrix_path,
)
logger.debug(f"DEBUG: Context created. F16 path: {f16_model_path}")
logger.debug(f"DEBUG: imatrix path: {imatrix_path}")
logger.debug("DEBUG: Calling quantisation engine...")
result = self.quantisation_engine.quantise(context)
logger.debug(f"DEBUG: Quantisation engine returned: success={result.success}")
if result.success and result.file_path:
# Upload immediately (if not in no-upload mode)
if not self.no_upload:
logger.info(f"Uploading {quant_type.value}...")
try:
self.uploader.upload_model_file(output_repo, result.file_path)
logger.info(f"Upload of {quant_type.value} completed successfully")
# Clean up file after successful upload
logger.info(f"Removing {result.file_path.name} to save disk space...")
result.file_path.unlink()
result.status = "completed"
self._update_readme_status(model_source, results, output_repo)
except Exception as upload_error:
logger.error(f"Failed to upload {quant_type.value}: {upload_error}")
result.status = "failed"
result.error_message = str(upload_error)
self._update_readme_status(model_source, results, output_repo)
# Keep file if upload failed
else:
# No upload mode - just mark as completed
result.status = "completed"
logger.info(f"Skipping upload of {quant_type.value} (--no-upload specified)")
else:
result.status = "failed"
self._update_readme_status(model_source, results, output_repo)
except Exception as e:
logger.error(f"Error processing {quant_type.value}: {e}")
result = QuantisationResult(quantisation_type=quant_type, success=False)
result.status = "failed"
result.error_message = str(e)
try:
self._update_readme_status(model_source, results, output_repo)
except Exception as readme_error:
logger.error(f"Failed to update README after error: {readme_error}")
# Force cleanup after error
gc.collect()
return result
else:
# Force cleanup after quantisation
gc.collect()
return result
def _handle_quantisation_result(
self,
result: QuantisationResult,
quant_type: QuantisationType,
model_source: ModelSource,
results: dict[QuantisationType, QuantisationResult],
output_repo: str,
upload_executor: ThreadPoolExecutor,
upload_futures: list,
) -> None:
"""Handle successful or failed quantisation result."""
if result.success and result.file_path:
quant_str = getattr(result.quantisation_type, "value", result.quantisation_type)
logger.info(f"Starting parallel upload of {quant_str}...")
upload_future = upload_executor.submit(
self._upload_and_cleanup,
output_repo,
result.file_path,
quant_type,
model_source,
results,
)
upload_futures.append(upload_future)
result.file_path = None # Mark as being uploaded
result.status = "uploading"
else:
result.status = "failed"
self._update_readme_status(model_source, results, output_repo)
def _handle_quantisation_error(
self,
error: Exception,
quant_type: QuantisationType,
model_source: ModelSource,
results: dict[QuantisationType, QuantisationResult],
output_repo: str,
) -> QuantisationResult:
"""Handle quantisation processing error.
Returns:
QuantisationResult: Failed quantisation result with error information.
"""
logger.error(f"Error processing {quant_type.value}: {error}")
result = QuantisationResult(quantisation_type=quant_type, success=False)
result.status = "failed"
result.error_message = str(error)
try:
self._update_readme_status(model_source, results, output_repo)
except Exception as readme_error:
logger.error(f"Failed to update README after error: {readme_error}")
return result
def _update_readme_status(
self,
model_source: ModelSource,
results: dict[QuantisationType, QuantisationResult],
output_repo: str,
) -> None:
"""Update README with current quantisation status."""
if not self.no_upload:
updated_readme_path = self.readme_generator.generate(
model_source, results, self.models_dir, output_repo
)
self.uploader.upload_readme(output_repo, updated_readme_path)
def _wait_for_uploads(self, upload_futures: list) -> None:
"""Wait for all parallel uploads to complete."""
logger.info("Waiting for any remaining uploads to complete...")
for future in upload_futures:
try:
future.result(timeout=300) # 5 minute timeout per upload
except Exception as e:
logger.warning(f"Upload error: {e}")
def _cleanup_files(self, f16_model_path: Path, model_source: ModelSource) -> None:
"""Clean up temporary files after processing."""
if f16_model_path.exists():
logger.info(f"Removing F16 model {f16_model_path.name} to save disk space...")
f16_model_path.unlink()
if not model_source.is_gguf_repo:
self._cleanup_original_model(model_source)
def _cleanup_original_model(self, model_source: ModelSource) -> None:
"""Clean up original safetensors/PyTorch files after successful conversion."""
model_dir = self.models_dir / model_source.model_name
pytorch_files = list(model_dir.glob("pytorch_model*.bin"))
if pytorch_files:
logger.info(f"Removing {len(pytorch_files)} PyTorch model files to save disk space...")
for file in pytorch_files:
file.unlink()
logger.info("Keeping config files, tokeniser, and metadata for reference")
def _upload_and_cleanup(
self,
output_repo: str,
file_path: Path,
quant_type: QuantisationType,
model_source: ModelSource,
results: dict[QuantisationType, QuantisationResult],
) -> None:
"""Upload file and clean up (runs in background thread)."""
try:
logger.info(f"[PARALLEL] Starting upload of {quant_type.value} ({file_path.name})")
self.uploader.upload_model_file(output_repo, file_path)
logger.info(f"[PARALLEL] Upload of {quant_type.value} completed successfully")
logger.info(f"[PARALLEL] Removing {file_path.name} to save disk space...")
file_path.unlink()
results[quant_type].status = "completed"
updated_readme_path = self.readme_generator.generate(
model_source, results, self.models_dir, output_repo
)
self.uploader.upload_readme(output_repo, updated_readme_path)
logger.info(f"[PARALLEL] {quant_type} upload and cleanup complete")
except Exception as e:
logger.error(f"[PARALLEL] Failed to upload {quant_type}: {e}")
results[quant_type].status = "failed"
results[quant_type].error_message = str(e)
try:
updated_readme_path = self.readme_generator.generate(
model_source, results, self.models_dir, output_repo
)
self.uploader.upload_readme(output_repo, updated_readme_path)
except Exception as readme_error:
logger.error(
f"[PARALLEL] Failed to update README after upload error: {readme_error}"
)
# Don't re-raise - let other uploads continue
def _print_model_info(self, model_source: ModelSource) -> None:
"""Print model information."""
logger.info(f"Source URL: {model_source.url}")
logger.info(f"Source model: {model_source.source_model}")
logger.info(f"Original author: {model_source.original_author}")
logger.info(f"Model name: {model_source.model_name}")
logger.info(f"Your HF username: {self.uploader.get_username()}")
logger.info(f"Working directory: {self.work_dir}")
def _print_completion_summary(
self,
model_source: ModelSource,
results: dict[QuantisationType, QuantisationResult],
output_repo: str,
) -> None:
"""Print completion summary."""
successful_results = [r for r in results.values() if r.success]
if successful_results:
logger.info("Complete! Your quantised models are available at:")
logger.info(f" https://huggingface.co/{output_repo}")
logger.info("Model info:")
logger.info(f" - Source URL: {model_source.url}")
logger.info(f" - Original: {model_source.source_model}")
logger.info(
" - Method: "
f"{'Direct GGUF download' if model_source.is_gguf_repo else 'HF model conversion'}"
)
logger.info(f" - Quantised: {output_repo}")
for result in successful_results:
if result.file_size:
filename = (
f"{model_source.original_author}-{model_source.model_name}-"
f"{result.quantisation_type}.gguf"
)
logger.info(f" - {result.quantisation_type}: {filename} ({result.file_size})")
else:
logger.error(
"All quantisations failed - repository created with documentation "
"but no model files"
)
logger.error(f" Repository: https://huggingface.co/{output_repo}")

View file

@ -1,675 +0,0 @@
"""Quantisation operations service.
Provides modular quantisation engine, model management, and upload capabilities
for GGUF model processing. Consolidates quantisation logic from various tools
into reusable components following SOLID principles.
"""
from __future__ import annotations
import shutil
import subprocess
import tempfile
import traceback
from pathlib import Path
from helpers.logger import logger
from helpers.models.quantisation import (
ModelSource,
QuantisationContext,
QuantisationResult,
QuantisationType,
)
from helpers.services.filesystem import FilesystemService
from helpers.services.gguf import GGUFConverter
from helpers.services.llama_python import LlamaCppPythonAPI
from helpers.utils.config_parser import ConfigParser
from helpers.utils.tensor_mapping import TensorMapper
class QuantisationEngine:
"""Handles the actual quantisation process with configurable methods.
Provides flexible quantisation execution supporting multiple tensor
precision configurations, importance matrices, and fallback strategies.
Uses llama-cpp-python API for direct quantisation without subprocess overhead.
"""
def __init__(self) -> None:
"""Initialise quantisation engine."""
self.fs = FilesystemService()
self.python_api = LlamaCppPythonAPI()
def quantise(self, context: QuantisationContext) -> QuantisationResult:
"""Perform quantisation using the specified configuration.
Executes quantisation using Python API. Since llama-cpp-python is a
required dependency, we can rely on it being available.
Returns:
QuantisationResult with success status and file information.
"""
logger.debug(f"DEBUG: Starting quantisation for {context.config.name}")
logger.info(
f"⚙️ Creating {context.config.name} quantisation ({context.config.description})..."
)
output_path = context.get_output_path()
logger.debug(f"DEBUG: Output path: {output_path}")
# Check input file exists and is readable
if not context.f16_model_path.exists():
error_msg = f"Input model file does not exist: {context.f16_model_path}"
logger.error(f"{error_msg}")
return QuantisationResult(
quantisation_type=QuantisationType(context.config.name),
success=False,
error_message=error_msg,
)
# Check if we have enough disk space (rough estimate)
try:
input_size = context.f16_model_path.stat().st_size
logger.debug(f"DEBUG: Input file size: {input_size / (1024**3):.2f} GB")
# This is a rough check - actual available space calculation is more complex
logger.debug(f"DEBUG: Output directory: {output_path.parent}")
except Exception as e:
logger.warning(f"⚠️ Could not check disk space: {e}")
logger.info(f"🎯 Attempting {context.config.name} quantisation...")
logger.debug(f"DEBUG: Source: {context.f16_model_path}")
logger.debug(f"DEBUG: Target: {output_path}")
logger.debug(f"DEBUG: imatrix: {context.imatrix_path}")
try:
# Use Python API for quantisation
logger.info("🐍 Using Python API for quantisation...")
logger.debug("DEBUG: Calling python_api.quantise_model...")
success = self.python_api.quantise_model(
context.f16_model_path, output_path, context.config, context.imatrix_path
)
logger.debug(f"DEBUG: Python API returned: {success}")
if success:
logger.debug("DEBUG: Quantisation successful, creating success result")
return self._create_success_result(context.config.name, output_path, "Python API")
logger.error(f"{context.config.name} quantisation failed")
return QuantisationResult(
quantisation_type=QuantisationType(context.config.name),
success=False,
error_message="Quantisation failed via Python API",
)
except Exception as e:
logger.error(f"❌ Exception during {context.config.name} quantisation: {e}")
logger.error("Exception traceback:")
for line in traceback.format_exc().splitlines():
logger.error(f" {line}")
return QuantisationResult(
quantisation_type=QuantisationType(context.config.name),
success=False,
error_message=f"Exception during quantisation: {e!s}",
)
def _create_success_result(
self, quant_type: str, output_path: Path, method_used: str
) -> QuantisationResult:
"""Create successful quantisation result with file metadata.
Returns:
QuantisationResult with file path and size information.
"""
file_size = self.fs.get_file_size(output_path)
return QuantisationResult(
quantisation_type=QuantisationType(quant_type),
success=True,
file_path=output_path,
file_size=file_size,
method_used=method_used,
)
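# Example (sketch): driving the engine directly, assuming an F16 GGUF already
# exists and an illustrative profile:
#   engine = QuantisationEngine()
#   context = QuantisationContext(
#       f16_model_path=Path("work/models/example/example-f16.gguf"),
#       model_source=source,  # a previously parsed ModelSource
#       config=QUANTISATION_CONFIGS[QuantisationType.Q4_K_M],
#       models_dir=Path("work/models"),
#       imatrix_path=None,
#   )
#   result = engine.quantise(context)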
class ModelManager:
"""Handles model downloading and preparation for quantisation.
Manages both GGUF repository downloads and HuggingFace model conversions,
providing unified interface for model acquisition and preparation.
"""
def __init__(self, models_dir: Path) -> None:
"""Initialise model manager with storage configuration.
Sets up model storage directory for model downloads and conversions.
"""
self.models_dir = models_dir
self.fs = FilesystemService()
def prepare_model(self, model_source: ModelSource) -> Path:
"""Prepare model for quantisation and return F16 model path.
Handles both GGUF repository downloads and regular HuggingFace model
conversion workflows with automatic format detection.
Returns:
Path to F16 GGUF model ready for quantisation.
"""
model_dir = self.models_dir / model_source.model_name
if model_source.is_gguf_repo:
return self._handle_gguf_repo(model_source, model_dir)
return self._handle_regular_repo(model_source, model_dir)
def _handle_gguf_repo(self, model_source: ModelSource, model_dir: Path) -> Path:
"""Handle GGUF repository download with pattern matching.
Downloads GGUF files matching specified patterns, prioritising
multi-part files and F16 variants.
Returns:
Path to downloaded or existing GGUF file.
"""
logger.info(f"⬇️ Downloading GGUF file from repository: {model_source.source_model}")
logger.info(f"🔍 Looking for file pattern: *{model_source.gguf_file_pattern}*")
f16_model = model_dir / f"{model_source.model_name}-f16.gguf"
if f16_model.exists():
logger.info(f"✅ Found existing F16 file: {f16_model.name}")
return f16_model
# Check for existing GGUF files
model_dir.mkdir(parents=True, exist_ok=True)
existing_gguf = self.fs.find_gguf_files(model_dir)
if existing_gguf:
logger.info(f"✅ Found existing GGUF file: {existing_gguf[0].name}")
return existing_gguf[0]
# Download with patterns
downloaded_file = self._download_gguf_with_patterns(
model_source.source_model, model_source.gguf_file_pattern, model_dir
)
if downloaded_file:
# Handle multi-part files
if "00001-of-" in downloaded_file.name:
return downloaded_file
if "-00002-of-" in downloaded_file.name or "-00003-of-" in downloaded_file.name:
base_name = downloaded_file.name.replace("-00002-of-", "-00001-of-").replace(
"-00003-of-", "-00001-of-"
)
first_part = downloaded_file.parent / base_name
if first_part.exists():
logger.info(f"🔄 Using first part: {first_part.name}")
return first_part
# Rename single file to standard name
downloaded_file.rename(f16_model)
return f16_model
# Fallback to regular conversion
logger.info("💡 Falling back to downloading full repository and converting...")
return self._handle_regular_repo(
ModelSource(**{**model_source.dict(), "is_gguf_repo": False}),
model_dir,
)
def _download_gguf_with_patterns(
self, source_model: str, pattern: str | None, model_dir: Path
) -> Path | None:
"""Download GGUF file using various pattern strategies.
Tries multiple pattern variations to find and download appropriate
GGUF files, handling timeouts and temporary directories.
Returns:
Path to downloaded file, or None if all patterns fail.
"""
if pattern:
patterns = [
f"*{pattern}*",
f"*{pattern.lower()}*",
f"*{pattern.upper()}*",
"*f16*",
"*F16*",
"*fp16*",
]
else:
patterns = ["*f16*", "*F16*", "*fp16*"]
temp_dir = model_dir / "gguf_temp"
for search_pattern in patterns:
logger.info(f"🔍 Trying pattern: {search_pattern}")
temp_dir.mkdir(exist_ok=True)
try:
logger.debug(
f"DEBUG: Running huggingface-cli download for pattern {search_pattern}"
)
result = subprocess.run(
[
"timeout",
"300",
"huggingface-cli",
"download",
source_model,
"--include",
search_pattern,
"--local-dir",
str(temp_dir),
],
check=True,
capture_output=True,
text=True,
)
logger.debug(
f"DEBUG: Download command completed with return code {result.returncode}"
)
# Find downloaded GGUF files
gguf_files = self.fs.find_gguf_files(temp_dir, pattern)
if gguf_files:
found_file = gguf_files[0]
logger.info(f"✅ Found GGUF file: {found_file.name}")
# Move to parent directory
final_path = model_dir / found_file.name
shutil.move(str(found_file), str(final_path))
shutil.rmtree(temp_dir)
return final_path
except subprocess.CalledProcessError as e:
logger.debug(
f"DEBUG: Pattern {search_pattern} failed with return code {e.returncode}"
)
if e.stderr:
logger.debug(f"DEBUG: stderr: {e.stderr}")
if e.stdout:
logger.debug(f"DEBUG: stdout: {e.stdout}")
logger.info(f"⚠️ Pattern {search_pattern} failed or timed out")
continue
except Exception as e:
logger.error(f"❌ Unexpected error during download: {e}")
logger.error("Exception traceback:")
for line in traceback.format_exc().splitlines():
logger.error(f" {line}")
continue
finally:
if temp_dir.exists():
shutil.rmtree(temp_dir, ignore_errors=True)
return None
def _handle_regular_repo(
self,
model_source: ModelSource,
model_dir: Path,
) -> Path:
"""Handle regular HuggingFace repository conversion.
Downloads full model repository and converts to F16 GGUF format
using our native Python-based GGUFConverter for SafeTensors models.
Returns:
Path to converted F16 GGUF model.
"""
logger.info(f"⬇️ Downloading source model: {model_source.source_model}")
# Download model if needed
if not model_dir.exists():
self._download_repository(model_source.source_model, model_dir)
else:
logger.info("✅ Model already downloaded")
# Convert to GGUF
return self._convert_to_gguf(model_source, model_dir)
def _download_repository(self, source_model: str, model_dir: Path) -> None:
"""Download HuggingFace repository.
Args:
source_model: HuggingFace model identifier.
model_dir: Local directory for download.
Raises:
RuntimeError: If download fails.
"""
try:
logger.debug(f"DEBUG: Downloading full repository: {source_model}")
result = subprocess.run(
[
"huggingface-cli",
"download",
source_model,
"--local-dir",
str(model_dir),
],
check=True,
capture_output=True,
text=True,
)
logger.debug(
f"DEBUG: Repository download completed with return code {result.returncode}"
)
except subprocess.CalledProcessError as e:
logger.error(f"❌ Failed to download repository {source_model}")
logger.error(f"Return code: {e.returncode}")
if e.stderr:
logger.error(f"stderr: {e.stderr}")
if e.stdout:
logger.error(f"stdout: {e.stdout}")
msg = f"Repository download failed: {e}"
raise RuntimeError(msg) from e
except Exception as e:
logger.error(f"❌ Unexpected error during repository download: {e}")
logger.error("Exception traceback:")
for line in traceback.format_exc().splitlines():
logger.error(f" {line}")
raise
def _convert_to_gguf(self, model_source: ModelSource, model_dir: Path) -> Path:
"""Convert model to GGUF F16 format.
Args:
model_source: Model source information.
model_dir: Directory containing model files.
Returns:
Path to F16 GGUF model.
Raises:
RuntimeError: If conversion fails.
"""
logger.info("🔄 Converting to GGUF F16 format...")
f16_model = model_dir / f"{model_source.model_name}-f16.gguf"
if f16_model.exists():
logger.info("✅ F16 model already exists")
return f16_model
# Check for SafeTensors files
safetensor_files = list(model_dir.glob("*.safetensors"))
if not safetensor_files:
logger.error("❌ Model format not supported")
logger.info("💡 This tool supports GGUF and SafeTensors formats")
msg = "Model must be in GGUF or SafeTensors format"
raise RuntimeError(msg)
logger.info("🐍 Using native Python GGUFConverter...")
logger.info(f"✅ Found {len(safetensor_files)} SafeTensors files")
# Load model configuration
config_parser = ConfigParser()
model_config = config_parser.load_model_config(model_dir)
# Get architecture mapping
arch_name = model_config.architectures[0] if model_config.architectures else "llama"
arch = config_parser.get_architecture_mapping(arch_name)
if arch != arch_name:
logger.info(f"📝 Architecture mapping: {arch_name}{arch}")
# Convert using GGUFConverter
tensor_mapper = TensorMapper()
success = GGUFConverter.convert_safetensors(
model_dir, f16_model, model_config, arch, tensor_mapper
)
if not success:
logger.error("❌ Native Python conversion failed")
msg = "Failed to convert SafeTensors model to GGUF"
raise RuntimeError(msg)
logger.info("✅ Native Python conversion successful")
return f16_model
class HuggingFaceUploader:
"""Handles uploading models and documentation to HuggingFace.
Provides methods for repository creation, file uploads, and README
updates with proper error handling and retry logic.
"""
@staticmethod
def get_username() -> str:
"""Get authenticated HuggingFace username.
Returns:
HuggingFace username from CLI authentication.
Raises:
RuntimeError: If not authenticated.
"""
try:
result = subprocess.run(
["huggingface-cli", "whoami"],
capture_output=True,
text=True,
check=True,
)
return result.stdout.strip()
except (subprocess.CalledProcessError, FileNotFoundError) as err:
msg = "Please log in to HuggingFace first: huggingface-cli login"
raise RuntimeError(msg) from err
def upload_readme(self, output_repo: str, readme_path: Path) -> None:
"""Upload or update README file to repository.
Creates repository if needed, handles existing repository updates.
Raises:
RuntimeError: If the README upload fails.
"""
logger.info("Uploading README...")
# First ensure the repository exists
self._ensure_repo_exists(output_repo)
# Upload without --create flag to avoid PR creation
try:
logger.debug(f"DEBUG: Uploading README to {output_repo}")
result = subprocess.run(
[
"huggingface-cli",
"upload",
output_repo,
str(readme_path),
"README.md",
"--commit-message",
"Update README.md",
],
check=True,
capture_output=True,
text=True,
)
logger.debug(f"DEBUG: README upload completed with return code {result.returncode}")
except subprocess.CalledProcessError as e:
logger.error(f"❌ Failed to upload README to {output_repo}")
logger.error(f"Return code: {e.returncode}")
if e.stderr:
logger.error(f"stderr: {e.stderr}")
if e.stdout:
logger.error(f"stdout: {e.stdout}")
msg = f"README upload failed: {e}"
raise RuntimeError(msg) from e
except Exception as e:
logger.error(f"❌ Unexpected error during README upload: {e}")
logger.error("Exception traceback:")
for line in traceback.format_exc().splitlines():
logger.error(f" {line}")
raise
logger.info("README uploaded")
def _ensure_repo_exists(self, repo_id: str) -> None:
"""Ensure the repository exists, creating it if necessary."""
try:
# Try to create the repo - will fail if it already exists
subprocess.run(
[
"huggingface-cli",
"repo",
"create",
repo_id,
"--type",
"model",
"-y",
],
check=True,
capture_output=True,
text=True,
)
logger.info(f"Created repository: {repo_id}")
except subprocess.CalledProcessError:
# Repository already exists, that's fine
pass
def upload_model_file(self, output_repo: str, model_path: Path) -> None:
"""Upload model file to repository.
Uploads GGUF model file to specified repository path.
Always uses huggingface-cli to ensure proper handling of large files
via HuggingFace's xet backend.
Raises:
RuntimeError: If the model file upload fails.
"""
logger.info(f"Uploading {model_path.name}...")
# Always use huggingface-cli for model files to ensure xet backend is used
try:
logger.debug(f"DEBUG: Uploading model file {model_path.name} to {output_repo}")
result = subprocess.run(
[
"huggingface-cli",
"upload",
output_repo,
str(model_path),
model_path.name,
"--revision",
"main", # Explicitly push to main branch
"--commit-message",
f"Add {model_path.name}",
],
check=True,
capture_output=True,
text=True,
)
logger.debug(f"DEBUG: Model upload completed with return code {result.returncode}")
except subprocess.CalledProcessError as e:
logger.error(f"❌ Failed to upload model file {model_path.name} to {output_repo}")
logger.error(f"Return code: {e.returncode}")
if e.stderr:
logger.error(f"stderr: {e.stderr}")
if e.stdout:
logger.error(f"stdout: {e.stdout}")
msg = f"Model file upload failed: {e}"
raise RuntimeError(msg) from e
except Exception as e:
logger.error(f"❌ Unexpected error during model file upload: {e}")
logger.error("Exception traceback:")
for line in traceback.format_exc().splitlines():
logger.error(f" {line}")
raise
# Extract and log the URL if present in output
if result.stdout:
for line in result.stdout.splitlines():
if "https://huggingface.co/" in line:
logger.info(f"Upload URL: {line.strip()}")
break
logger.info(f"{model_path.name} uploaded")
def _try_git_upload_file(
self,
repo_id: str,
local_path: Path,
repo_path: str,
*,
create_repo: bool = False,
) -> bool:
"""Try to upload file using git directly to avoid PR creation.
Returns:
bool: True if upload successful, False if should fallback to CLI.
"""
try:
with tempfile.TemporaryDirectory() as temp_dir:
temp_path = Path(temp_dir)
repo_url = f"https://huggingface.co/{repo_id}"
# Clone repository
logger.info(f"Cloning {repo_url}...")
result = subprocess.run(
["git", "clone", repo_url, str(temp_path / "repo")],
check=False,
capture_output=True,
text=True,
)
if result.returncode != 0:
if create_repo:
# Repository doesn't exist, let huggingface-cli handle creation
return False
logger.warning(f"Clone failed: {result.stderr}")
return False
repo_dir = temp_path / "repo"
target_file = repo_dir / repo_path
# Ensure target directory exists
target_file.parent.mkdir(parents=True, exist_ok=True)
# Copy file
shutil.copy2(local_path, target_file)
# Check if there are any changes
status_result = subprocess.run(
["git", "status", "--porcelain"],
cwd=repo_dir,
capture_output=True,
text=True,
check=True,
)
if not status_result.stdout.strip():
logger.info(f"No changes detected for {repo_path}, file already up-to-date")
return True # File is already up-to-date, no need to push
# Git add, commit, push
subprocess.run(
["git", "add", repo_path],
cwd=repo_dir,
check=True,
capture_output=True,
text=True,
)
subprocess.run(
["git", "commit", "-m", f"Update {repo_path}"],
cwd=repo_dir,
check=True,
capture_output=True,
text=True,
)
subprocess.run(
["git", "push"],
cwd=repo_dir,
check=True,
capture_output=True,
text=True,
)
return True
except subprocess.CalledProcessError as e:
logger.warning(f"Git upload failed: {e}")
return False
except Exception as e:
logger.warning(f"Git upload error: {e}")
return False
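# Example (sketch): the uploader used on its own, assuming `huggingface-cli login`
# has already been run and using illustrative paths:
#   uploader = HuggingFaceUploader()
#   repo = f"{uploader.get_username()}/Example-Model-GGUF"
#   uploader.upload_readme(repo, Path("work/models/example/README.md"))
#   uploader.upload_model_file(repo, Path("work/models/example/example-Q4_K_M.gguf"))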

View file

@ -9,8 +9,8 @@ from __future__ import annotations
from typing import TYPE_CHECKING, Any
from helpers.filesystem import FilesystemService
from helpers.models.conversion import GGUFParameters, ModelConfig, VisionConfig
from helpers.services.filesystem import FilesystemService
if TYPE_CHECKING:
from pathlib import Path
@ -107,28 +107,51 @@ class ConfigParser:
@staticmethod
def get_architecture_mapping(architecture: str) -> str:
"""Map architecture names to known GGUF architectures.
"""Get the GGUF architecture name for a model.
Provides fallback mappings for architectures not directly supported
by GGUF format, translating them to similar known architectures. This
enables broader model compatibility whilst maintaining GGUF standards.
Returns the original architecture name to preserve model identity.
Only maps architectures that are truly compatible.
Returns:
GGUF-compatible architecture name with appropriate fallback to llama.
Architecture name for GGUF, preserving original when possible.
"""
# Architecture mappings to known GGUF types
mappings = {
"DotsOCRForCausalLM": "qwen2", # Similar architecture
"GptOssForCausalLM": "llama", # Use llama as fallback
"MistralForCausalLM": "llama", # Mistral is llama-like
"Qwen2ForCausalLM": "qwen2",
# Only map architectures that are genuinely the same family;
# do not map incompatible architectures
known_compatible = {
"LlamaForCausalLM": "llama",
"MistralForCausalLM": "llama",
"Qwen2ForCausalLM": "qwen2",
"GemmaForCausalLM": "gemma",
"GptOssForCausalLM": "gptoss",
"Phi3ForCausalLM": "phi3",
# Add more mappings as needed
"FalconForCausalLM": "falcon",
"GPT2LMHeadModel": "gpt2",
"GPTJForCausalLM": "gptj",
"GPTNeoXForCausalLM": "gptneox",
"MPTForCausalLM": "mpt",
"BaichuanForCausalLM": "baichuan",
"StableLMEpochForCausalLM": "stablelm",
}
return mappings.get(architecture, "llama") # Default to llama
if architecture in known_compatible:
return known_compatible[architecture]
# For unknown architectures, preserve the original name
# This will make it clear the model needs proper support
# Remove common suffixes to get cleaner architecture name
arch_name = architecture
for suffix in ["ForCausalLM", "LMHeadModel", "ForConditionalGeneration"]:
if arch_name.endswith(suffix):
arch_name = arch_name[: -len(suffix)]
break
arch_name = arch_name.lower()
# Special case: convert "gpt-oss" to "gptoss"
if arch_name == "gpt-oss":
arch_name = "gptoss"
return arch_name
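# Example (sketch): expected behaviour of this mapping:
#   get_architecture_mapping("Qwen2ForCausalLM")   -> "qwen2"   (known compatible)
#   get_architecture_mapping("GptOssForCausalLM")  -> "gptoss"  (known compatible)
#   get_architecture_mapping("DotsOCRForCausalLM") -> "dotsocr" (unknown: suffix stripped and lower-cased)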
@staticmethod
def load_tokeniser_config(model_path: Path) -> dict[str, Any]:
@ -155,11 +178,33 @@ class ConfigParser:
config = fs.load_json_config(tokeniser_config_path)
# Extract token IDs with defaults
# Try to find special token IDs from added_tokens_decoder
added_tokens = config.get("added_tokens_decoder", {})
eos_token_id = config.get("eos_token_id")
bos_token_id = config.get("bos_token_id")
# If not directly specified, search in added_tokens_decoder
if eos_token_id is None:
for token_id, token_info in added_tokens.items():
if token_info.get("content") == "<|endoftext|>":
eos_token_id = int(token_id)
break
if bos_token_id is None:
for token_id, token_info in added_tokens.items():
if token_info.get("content") in {"<|im_start|>", "<s>", "<|startoftext|>"}:
bos_token_id = int(token_id)
break
# Extract token IDs with better defaults
return {
"bos_token_id": config.get("bos_token_id", 1),
"eos_token_id": config.get("eos_token_id", 2),
"bos_token_id": bos_token_id if bos_token_id is not None else 1,
"eos_token_id": eos_token_id if eos_token_id is not None else 2,
"unk_token_id": config.get("unk_token_id", 0),
"pad_token_id": config.get("pad_token_id", 0),
"pad_token_id": config.get(
"pad_token_id", eos_token_id if eos_token_id is not None else 0
),
"model_type": config.get("model_type", "llama"),
"add_bos_token": config.get("add_bos_token", True),
"add_eos_token": config.get("add_eos_token", False),
}
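# Example (sketch): a tokenizer_config.json with no top-level eos_token_id but
#   "added_tokens_decoder": {"151643": {"content": "<|endoftext|>"}}
# would resolve eos_token_id to 151643 here and, with no pad_token_id set,
# reuse that value as pad_token_id (151643 is an illustrative ID).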

View file

@ -0,0 +1,127 @@
"""Rate limiter for README updates.
Implements a cooldown mechanism to prevent excessive HuggingFace API calls
while ensuring all updates eventually reach the repository.
"""
from __future__ import annotations
import threading
import time
from typing import TYPE_CHECKING, Any
from helpers.logger import logger
if TYPE_CHECKING:
from collections.abc import Callable
class ReadmeRateLimiter:
"""Rate limits README updates to prevent API throttling.
Ensures updates are batched with a minimum interval between API calls,
while guaranteeing that pending updates are eventually applied.
"""
def __init__(self, cooldown_seconds: float = 30.0) -> None:
"""Initialise rate limiter with specified cooldown period.
Sets up the rate limiter with the specified cooldown interval to
prevent excessive API calls whilst ensuring pending updates are
eventually processed through a timer-based batching mechanism.
"""
self.cooldown_seconds = cooldown_seconds
self.last_update_time = 0.0
self.pending_update = False
self.update_lock = threading.Lock()
self.timer: threading.Timer | None = None
self.update_func: Callable[..., Any] | None = None
self.update_args: tuple[Any, ...] | None = None
self.update_kwargs: dict[str, Any] | None = None
def request_update(
self,
update_func: Callable[..., Any],
*args: Any,
**kwargs: Any,
) -> None:
"""Request a README update, respecting rate limits.
Updates are batched during cooldown periods and executed
when the cooldown expires. Stores the update function and its
arguments for deferred execution whilst maintaining thread safety.
"""
with self.update_lock:
current_time = time.time()
time_since_last = current_time - self.last_update_time
# Store the latest update request
self.update_func = update_func
self.update_args = args
self.update_kwargs = kwargs
if time_since_last >= self.cooldown_seconds:
# Enough time has passed, update immediately
logger.debug(f"README update allowed (last update {time_since_last:.1f}s ago)")
self._execute_update()
else:
# Still in cooldown, schedule for later
remaining = self.cooldown_seconds - time_since_last
logger.debug(f"README update delayed ({remaining:.1f}s cooldown remaining)")
if not self.pending_update:
# Schedule an update when cooldown expires
self.pending_update = True
if self.timer:
self.timer.cancel()
self.timer = threading.Timer(remaining, self._delayed_update)
self.timer.start()
else:
# Update already scheduled, just update the args
logger.debug("README update already scheduled, updating with latest data")
def _execute_update(self) -> None:
"""Execute the actual update (must be called with lock held)."""
if self.update_func:
try:
args = self.update_args or ()
kwargs = self.update_kwargs or {}
self.update_func(*args, **kwargs)
self.last_update_time = time.time()
logger.debug("README update completed")
except Exception as e:
logger.error(f"README update failed: {e}")
self.pending_update = False
self.update_func = None
self.update_args = None
self.update_kwargs = None
def _delayed_update(self) -> None:
"""Execute a delayed update after cooldown expires."""
with self.update_lock:
if self.pending_update:
logger.debug("Executing delayed README update")
self._execute_update()
def flush(self) -> None:
"""Force any pending updates to execute immediately.
Called at script end to ensure final state is uploaded.
"""
with self.update_lock:
if self.timer:
self.timer.cancel()
self.timer = None
if self.pending_update and self.update_func:
logger.info("Flushing pending README update...")
# Wait for cooldown if needed
current_time = time.time()
time_since_last = current_time - self.last_update_time
if time_since_last < self.cooldown_seconds:
wait_time = self.cooldown_seconds - time_since_last
logger.info(f"Waiting {wait_time:.1f}s for cooldown before final update...")
time.sleep(wait_time)
self._execute_update()
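# Example (sketch): batching README updates through the limiter, with
# update_readme standing in for any callable that regenerates and uploads
# the README (illustrative name):
#   limiter = ReadmeRateLimiter(cooldown_seconds=30.0)
#   for status_change in status_changes:  # illustrative iterable of events
#       limiter.request_update(update_readme, model_source, results, output_repo)
#   limiter.flush()  # ensure the final pending update is applied before exit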

View file

@ -70,6 +70,8 @@ skip-magic-trailing-comma = false
[tool.ruff.lint]
fixable = ["ALL"]
ignore = [
"ANN002", # type annotation for args
"ANN003", # type annotation for kwargs
"ANN401", # use of Any type
"BLE001", # blind Exception usage
"COM812", # missing trailing comma

View file

@ -17,7 +17,7 @@ import sys
from pathlib import Path
from helpers.logger import logger
from helpers.services.orchestrator import QuantisationOrchestrator
from helpers.quantisation import QuantisationOrchestrator
def main() -> None:

View file

@ -12,8 +12,8 @@ import traceback
from argparse import ArgumentParser
from pathlib import Path
from helpers.gguf import GGUFConverter
from helpers.logger import logger
from helpers.services.gguf import GGUFConverter
from helpers.utils.config_parser import ConfigParser
from helpers.utils.tensor_mapping import TensorMapper

40
uv.lock generated
View file

@ -496,26 +496,26 @@ wheels = [
[[package]]
name = "uv"
version = "0.8.6"
version = "0.8.8"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/b5/3b/1140dbbca9fb3ca32be38e01c670a5980a4ee4874366d70438317876d40a/uv-0.8.6.tar.gz", hash = "sha256:4d4e042f6bd9f143094051a05de758684028f451e563846cbc0c6f505b530cca", size = 3463644, upload-time = "2025-08-07T15:43:34.206Z" }
sdist = { url = "https://files.pythonhosted.org/packages/9c/d0/4cd8ac2c7938da78c8e9ca791205f80e74b0f5a680f2a2d50323d54961d0/uv-0.8.8.tar.gz", hash = "sha256:6880e96cd994e53445d364206ddb4b2fff89fd2fbc74a74bef4a6f86384b07d9", size = 3477036, upload-time = "2025-08-09T00:26:00.883Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/71/64/a96f40f95626c6e353e66f6bc5a5ca7c1399e95caf0dcb56cae38754e073/uv-0.8.6-py3-none-linux_armv6l.whl", hash = "sha256:d96ff3a1d06a6a00ed94dfb2996228153b3b5bfc892174b7556216ab872a91b1", size = 18437310, upload-time = "2025-08-07T15:42:49.611Z" },
{ url = "https://files.pythonhosted.org/packages/41/30/b2fed99d5a6b16410669f223767f6d65bc6595858622f5f36386892ed963/uv-0.8.6-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:fdceb1ef554df0ddc620bfe83fdcf740829e489c62f78ba1f089abd62c71c63e", size = 18615884, upload-time = "2025-08-07T15:42:53.452Z" },
{ url = "https://files.pythonhosted.org/packages/d7/82/a53684eadb9cb169eab32ab71f2bdaf7c382819d6de44d4e8df91ca14a00/uv-0.8.6-py3-none-macosx_11_0_arm64.whl", hash = "sha256:7c1f48279ff61940143c78b969094e13324988eabcfcd4799f4350d9d36c1d48", size = 17173005, upload-time = "2025-08-07T15:42:55.571Z" },
{ url = "https://files.pythonhosted.org/packages/e7/4a/2890d9ccaf4b383fea43ae6362252870dcd97dda7412f34f20d80ccf7a39/uv-0.8.6-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.musllinux_1_1_aarch64.whl", hash = "sha256:1913f5627c57076c88dd38b0173bdb006ae9b8dbd92b1798a1acc9d744c1a7cc", size = 17813305, upload-time = "2025-08-07T15:42:57.998Z" },
{ url = "https://files.pythonhosted.org/packages/9b/c3/33a10049728ffbcde673b75b9a73cd61bfab5e1598d935d1f1b2556b07a4/uv-0.8.6-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:7796acc3c5b84d5ee5e10cc6cf92eb61c19f6551855d0aa89ef5925e4a371fbf", size = 18159834, upload-time = "2025-08-07T15:43:00.207Z" },
{ url = "https://files.pythonhosted.org/packages/81/28/ff884f7007a6b9d0e3368dbe4ae7d28acacbaaf1b3a583640e5af6dc5360/uv-0.8.6-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:57a98367bfad38e870e1a8a6626464796ffcee6e937d429fbd7b25ddf46bb36f", size = 18954223, upload-time = "2025-08-07T15:43:03.577Z" },
{ url = "https://files.pythonhosted.org/packages/78/1d/a4ed2da913ecacc1c976e97dff905979c13359834eeeac8bbaf5ed0b2fca/uv-0.8.6-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:2ac28509db2e52613a59264bdb150d13274ed13e5b305f7e274da8cd83033985", size = 20215802, upload-time = "2025-08-07T15:43:06.181Z" },
{ url = "https://files.pythonhosted.org/packages/2c/12/c9ca1cc8bdbecd54db4a7c1a44808f15271da60838dfa9f180ce8171407a/uv-0.8.6-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:deab2ce32d2dd7a1c0de459aa23470c60feb0ea24e67c9c5c5988d8bf4eb4a09", size = 19898210, upload-time = "2025-08-07T15:43:09.008Z" },
{ url = "https://files.pythonhosted.org/packages/c0/15/e10347768b2929ae9c65abbfd0867a736e6227f6d63da1f86fe6bdcbcdca/uv-0.8.6-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5b201ebc1c5c76c3a415fa4edcb25a0e06263d2255319d6d52275c775e926e23", size = 19247208, upload-time = "2025-08-07T15:43:11.578Z" },
{ url = "https://files.pythonhosted.org/packages/62/8d/dc290df05d1820d003f30e2fb7853496eec43bcb986c5e35aaea2f5343d3/uv-0.8.6-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6acdc77099906ba64bc1b725bef973c10905d7e9596d1b25f271db772bc9e8e4", size = 19261881, upload-time = "2025-08-07T15:43:13.815Z" },
{ url = "https://files.pythonhosted.org/packages/20/bd/6c3b9c87e4ed323f72de6ece7d51a6179091f0ff6e0c9c6ed29e28efe17c/uv-0.8.6-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:4e81380549151e34ae96d56499438444ba58591ca9f2fc6ba0a867152601849e", size = 18037135, upload-time = "2025-08-07T15:43:15.941Z" },
{ url = "https://files.pythonhosted.org/packages/7d/e1/b3e825ad9cc3f03f0f3e232286f91aef985d8029db69fd7091c2f332212b/uv-0.8.6-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:c9de4adac36a62e4bddd959ce65fb4bb09b0cbfd95946d50390f2a9c186ecb9c", size = 19040739, upload-time = "2025-08-07T15:43:18.092Z" },
{ url = "https://files.pythonhosted.org/packages/c5/14/921e2e7b2a4be0bac17f9d04a126546b89828bb33aa56368af7f00538fe3/uv-0.8.6-py3-none-musllinux_1_1_armv7l.whl", hash = "sha256:993af2c295856c5ca053678a8dadc11ce2f85485513ed1568c16e98d5dfa88bf", size = 18060742, upload-time = "2025-08-07T15:43:20.39Z" },
{ url = "https://files.pythonhosted.org/packages/81/54/0b1ecc64353725b62f02d3739a67a567faa70c76c4ea19a21253df1c4d99/uv-0.8.6-py3-none-musllinux_1_1_i686.whl", hash = "sha256:132e73f1e9fe05edc6c06c00416f7c721c48298786fd7293be6c584793170bbc", size = 18430300, upload-time = "2025-08-07T15:43:22.797Z" },
{ url = "https://files.pythonhosted.org/packages/da/be/a1a249eacb9b1e397292106250490ec1546a90c0e19de19f0b36f52aecea/uv-0.8.6-py3-none-musllinux_1_1_x86_64.whl", hash = "sha256:ee67acf1b211be2cfbeaec16cde13c8325810d32ff85963a9dedd1f9d7c61ef7", size = 19407124, upload-time = "2025-08-07T15:43:25.915Z" },
{ url = "https://files.pythonhosted.org/packages/11/18/552bb94bb931ea9d09a0e98e5c3d8cefc8c8db25549af88d1484e52d6cdd/uv-0.8.6-py3-none-win32.whl", hash = "sha256:e35cc1ef79d3dce2b6aeffbfb280d02d5ad741d4ca07874bdf0a4d85c841d9de", size = 18324229, upload-time = "2025-08-07T15:43:28.029Z" },
{ url = "https://files.pythonhosted.org/packages/fd/df/b7d1171579e2cc821aafc38a86393104e5426ac1ebc4e95be79ac705a11f/uv-0.8.6-py3-none-win_amd64.whl", hash = "sha256:37227aaf1e41c7eda3d7f0028e747a2a2eed3f3506b0adc121a4366e8281115b", size = 20279856, upload-time = "2025-08-07T15:43:30.07Z" },
{ url = "https://files.pythonhosted.org/packages/09/1b/2629d605e101db6a52397e6ea8859a51af0207cf254051b2a621c683ee07/uv-0.8.6-py3-none-win_arm64.whl", hash = "sha256:0b524de39f317bd8733c38cf100b6f8091d44e06b23f7752523ad1ad1454ede3", size = 18839643, upload-time = "2025-08-07T15:43:32.332Z" },
{ url = "https://files.pythonhosted.org/packages/08/d5/49e188db80f3d8b1969bdbcb8a5468a3796827f15d773241204f206a9ff6/uv-0.8.8-py3-none-linux_armv6l.whl", hash = "sha256:fcdbee030de120478db1a4bb3e3bbf04eec572527ea9107ecf064a808259b6c9", size = 18470316, upload-time = "2025-08-09T00:25:11.956Z" },
{ url = "https://files.pythonhosted.org/packages/01/50/add1afadccd141d0d72b54e5146f8181fcc6efd1567a17c5b1edec444010/uv-0.8.8-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:461e8fb83931755cf0596bf1b8ccbfe02765e81a0d392c495c07685d6b6591f9", size = 18468770, upload-time = "2025-08-09T00:25:15.391Z" },
{ url = "https://files.pythonhosted.org/packages/8c/ac/3c6dc8781d37ef9854f412322caffac2978dd3fa1bf806f7daebcfebf2be/uv-0.8.8-py3-none-macosx_11_0_arm64.whl", hash = "sha256:58056e5ccebb0a1aad27bd89d0ccc5b65c086d5a7f6b0ac16a9dde030b63cf14", size = 17200419, upload-time = "2025-08-09T00:25:18.264Z" },
{ url = "https://files.pythonhosted.org/packages/a1/9e/c30ea1f634673d234999985984afbe96c3d2a4381986e36df0bb46c0f21b/uv-0.8.8-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.musllinux_1_1_aarch64.whl", hash = "sha256:5b4c56a620137f562e1d7b09eac6c9d4adeb876aefc51be27973257fcb426c9d", size = 17779351, upload-time = "2025-08-09T00:25:20.891Z" },
{ url = "https://files.pythonhosted.org/packages/2f/89/f2885c6e97a265b4b18050df6285f56c81b603a867a63fcd8f2caa04d95c/uv-0.8.8-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5fc33adb91c4e3db550648aa30c2b97e8e4d8b8842ead7784a9e76dae3cb14dc", size = 18139292, upload-time = "2025-08-09T00:25:23.352Z" },
{ url = "https://files.pythonhosted.org/packages/38/5f/98dad16987919e7dc02f2566026a263ea6307bf57e8de0008dde4717d9cf/uv-0.8.8-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:19a82d6738d3aa58e6646b9d6c343d103abf0c4caf97a68d16a8cab55282e4be", size = 18932468, upload-time = "2025-08-09T00:25:25.691Z" },
{ url = "https://files.pythonhosted.org/packages/56/99/52d0d9f53cc5df11b1a459e743bd7b2f4660d49f125a63640eb85ce993e0/uv-0.8.8-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:9dce4de70098cb5b98feea9ef0b8f7db5d6b9deea003a926bc044a793872d719", size = 20251614, upload-time = "2025-08-09T00:25:28.122Z" },
{ url = "https://files.pythonhosted.org/packages/9e/b1/0698099a905b4a07b8fa9d6838e0680de707216ccf003433ca1b4afff224/uv-0.8.8-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1038324c178d2d7407a4005c4c3294cbad6a02368ba5a85242308de62a6f4e12", size = 19916222, upload-time = "2025-08-09T00:25:30.732Z" },
{ url = "https://files.pythonhosted.org/packages/7f/29/8384e0f3f3536ef376d94b7ab177753179906a6c2f5bab893e3fb9525b45/uv-0.8.8-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3bd016beea3935f9148b3d2482e3d60dee36f0260f9e99d4f57acfd978c1142a", size = 19238516, upload-time = "2025-08-09T00:25:33.637Z" },
{ url = "https://files.pythonhosted.org/packages/0e/f1/6c107deccd6e66eb1c46776d8cef4ca9274aac73cec1b14453fe85e18a54/uv-0.8.8-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d0a2b5ebc96aba2b0bf54283d2906b40f32949298cbc6ec48648097ddeac5c5d", size = 19232295, upload-time = "2025-08-09T00:25:37.154Z" },
{ url = "https://files.pythonhosted.org/packages/c5/96/9f5e935cd970102c67ce2a753ac721665fb4477c262e86afa0ab385cefff/uv-0.8.8-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:e529dc0a1be5e896d299e4eae4599fa68909f8cb3e6c5ee1a46f66c9048e3334", size = 18046917, upload-time = "2025-08-09T00:25:39.72Z" },
{ url = "https://files.pythonhosted.org/packages/32/75/97f371add0a02e5e37156ac0fea908ab4a1160fdf716d0e6c257b6767122/uv-0.8.8-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:5d58d986c3b6a9ce0fb48cd48b3aee6cb1b1057f928d598432e75a4fcaa370f4", size = 18949133, upload-time = "2025-08-09T00:25:42.139Z" },
{ url = "https://files.pythonhosted.org/packages/1a/1b/ea988ae9d8c5531454ea6904290e229624c9ea830a5c37b91ec74ebde9a4/uv-0.8.8-py3-none-musllinux_1_1_armv7l.whl", hash = "sha256:e117e1230559058fd286292dd5839e8e82d1aaf05763bf4a496e91fe07b69fa1", size = 18080018, upload-time = "2025-08-09T00:25:44.645Z" },
{ url = "https://files.pythonhosted.org/packages/ff/14/3b16af331b79ae826d00a73e98f26f7f660dabedc0f82acb99069601b355/uv-0.8.8-py3-none-musllinux_1_1_i686.whl", hash = "sha256:372934fd94193c98dec59bd379cf39e73f906ae6162cbfb66686f32afd75fa0f", size = 18437896, upload-time = "2025-08-09T00:25:49.162Z" },
{ url = "https://files.pythonhosted.org/packages/1c/b6/c866684da5571dbf42e9a60b6587a62adc8a2eb592f07411d3b29cb09871/uv-0.8.8-py3-none-musllinux_1_1_x86_64.whl", hash = "sha256:9330c924faa9df00a5e78b54561ecf4e5eac1211066f027620dbe85bd6f479ce", size = 19341221, upload-time = "2025-08-09T00:25:51.444Z" },
{ url = "https://files.pythonhosted.org/packages/49/ea/55a0eff462b2ec5a6327dd87c401c53306406c830fa8f2cabd2af79dd97f/uv-0.8.8-py3-none-win32.whl", hash = "sha256:65113735aa3427d3897e2f537da1331d1391735c6eecb9b820da6a15fd2f6738", size = 18244601, upload-time = "2025-08-09T00:25:53.696Z" },
{ url = "https://files.pythonhosted.org/packages/bf/c0/f56ddb1b2276405618e3d2522018c962c010fc71f97f385d01b7e1dcd8df/uv-0.8.8-py3-none-win_amd64.whl", hash = "sha256:66189ca0b4051396aa19a6f036351477656073d0fd01618051faca699e1b3cdc", size = 20233481, upload-time = "2025-08-09T00:25:56.247Z" },
{ url = "https://files.pythonhosted.org/packages/ac/1a/70dc4c730c19f3af40be9450b98b801e03cd6d16609743013f7258f69a29/uv-0.8.8-py3-none-win_arm64.whl", hash = "sha256:1d829486e88ebbf7895306ff09a8b6014d3af7a18e27d751979ee37bf3a27832", size = 18786215, upload-time = "2025-08-09T00:25:58.941Z" },
]