llm-gguf-tools/helpers/quantisation/engine.py

"""Quantisation engine for model processing.
Handles the actual quantisation process with configurable methods,
supporting multiple quantisation backends and fallback strategies.
"""

from __future__ import annotations

import traceback
from typing import TYPE_CHECKING

from helpers.filesystem import FilesystemService
from helpers.ggml import GGMLQuantiser
from helpers.llama_cpp import QuantisationExecutor
from helpers.logger import logger
from helpers.models.quantisation import QuantisationResult, QuantisationType

if TYPE_CHECKING:
    from pathlib import Path

    from helpers.models.quantisation import QuantisationContext


class QuantisationEngine:
    """Handles the actual quantisation process with configurable methods.

    Provides flexible quantisation execution supporting multiple tensor
    precision configurations, importance matrices, and fallback strategies.
    Uses direct llama.cpp binary execution with proper tensor overrides.
    """

    def __init__(self) -> None:
        """Initialise quantisation engine."""
        self.fs = FilesystemService()
        self.executor = QuantisationExecutor()
        self.ggml_quantiser = GGMLQuantiser()

    def quantise(self, context: QuantisationContext) -> QuantisationResult:
        """Perform quantisation using the specified configuration.

        Executes quantisation via the llama.cpp binary directly, passing
        tensor override flags for L and XL variants. Falls back to a GGML
        implementation for basic types when the model architecture is
        unsupported. The quantisation context supplies all required
        parameters and settings.

        Returns:
            QuantisationResult with success status and file information.
        """
        logger.info(
            f"⚙️ Creating {context.config.name} quantisation ({context.config.description})..."
        )
        output_path = context.get_output_path()

        # Check input file exists and is readable
        if not context.f16_model_path.exists():
            error_msg = f"Input model file does not exist: {context.f16_model_path}"
            logger.error(error_msg)
            return QuantisationResult(
                quantisation_type=QuantisationType(context.config.name),
                success=False,
                error_message=error_msg,
            )

        logger.info(f"🎯 Attempting {context.config.name} quantisation...")
        logger.info(f"📝 Source: {context.f16_model_path}")
        logger.info(f"📝 Target: {output_path}")

        # Determine if this is a basic type that can use GGML
        basic_types = ["Q4_0", "Q5_0", "Q6_0", "Q8_0"]
        is_basic_type = context.config.name in basic_types
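        # These basic (non-K-quant) formats have numpy implementations in
        # GGMLQuantiser, so they stay available even when llama.cpp does not
        # recognise the model architecture.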

        try:
            # Try llama.cpp first for all types
            logger.info("🔧 Using llama.cpp binary for quantisation...")
            success = self.executor.execute_quantisation(
                context.f16_model_path, output_path, context.config, context.imatrix_path
            )

            if success:
                return self._create_success_result(context.config.name, output_path, "llama.cpp")

            # Check if this was an architecture error and we can use GGML fallback
            if (
                hasattr(self.executor, "last_error")
                and self.executor.last_error == "unsupported_architecture"
                and is_basic_type
            ):
                logger.info("🔄 Architecture unsupported - using GGML implementation...")
                success = self.ggml_quantiser.try_alternative_quantisation(
                    context.f16_model_path, output_path, context.config.name
                )
                if success:
                    return self._create_success_result(
                        context.config.name, output_path, "GGML numpy"
                    )
logger.error(f"{context.config.name} quantisation failed")
return QuantisationResult(
quantisation_type=QuantisationType(context.config.name),
success=False,
error_message="Quantisation failed via Python API",
)
        except Exception as e:
            logger.error(f"❌ Exception during {context.config.name} quantisation: {e}")
            logger.error("Exception traceback:")
            for line in traceback.format_exc().splitlines():
                logger.error(f" {line}")
            return QuantisationResult(
                quantisation_type=QuantisationType(context.config.name),
                success=False,
                error_message=f"Exception during quantisation: {e!s}",
            )

    def _create_success_result(
        self, quant_type: str, output_path: Path, method_used: str
    ) -> QuantisationResult:
        """Create a successful quantisation result with file metadata.

        Builds the result from the quantisation type, output path, and
        quantisation method, attaching the size of the generated output file.

        Returns:
            QuantisationResult with file path and size information.
        """
        file_size = self.fs.get_file_size(output_path)
        return QuantisationResult(
            quantisation_type=QuantisationType(quant_type),
            success=True,
            file_path=output_path,
            file_size=file_size,
            method_used=method_used,
        )
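

# Minimal usage sketch (illustrative only, not part of the module). It assumes
# QuantisationContext can be constructed with the attributes this module reads
# from it (f16_model_path, config, imatrix_path) and that config carries .name
# and .description; verify against helpers.models.quantisation before use.
if __name__ == "__main__":
    from pathlib import Path  # runtime import; above it is TYPE_CHECKING-only

    from helpers.models.quantisation import QuantisationContext

    engine = QuantisationEngine()
    context = QuantisationContext(
        f16_model_path=Path("models/example-f16.gguf"),  # hypothetical path
        config=...,  # placeholder: a quantisation config such as Q4_0
        imatrix_path=None,  # optional importance matrix file
    )
    result = engine.quantise(context)
    if result.success:
        print(f"✅ {result.file_path} ({result.file_size}) via {result.method_used}")
    else:
        print(f"❌ {result.error_message}")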