"""Quantisation engine for model processing.
|
|
|
|
Handles the actual quantisation process with configurable methods,
|
|
supporting multiple quantisation backends and fallback strategies.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import traceback
|
|
from typing import TYPE_CHECKING
|
|
|
|
from helpers.filesystem import FilesystemService
|
|
from helpers.ggml import GGMLQuantiser
|
|
from helpers.llama_cpp import QuantisationExecutor
|
|
from helpers.logger import logger
|
|
from helpers.models.quantisation import QuantisationResult, QuantisationType
|
|
|
|
if TYPE_CHECKING:
|
|
from pathlib import Path
|
|
|
|
from helpers.models.quantisation import (
|
|
QuantisationContext,
|
|
)
|
|
|
|
|
|
class QuantisationEngine:
    """Handles the actual quantisation process with configurable methods.

    Provides flexible quantisation execution supporting multiple tensor
    precision configurations, importance matrices, and fallback strategies.
    Uses direct llama.cpp binary execution with proper tensor overrides.
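
    Example:
        A minimal usage sketch; assumes a prepared ``QuantisationContext``
        built by the calling workflow (``context`` is illustrative)::

            engine = QuantisationEngine()
            result = engine.quantise(context)
            if result.success:
                print(result.file_path, result.file_size)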
"""
|
|
|
|
def __init__(self) -> None:
|
|
"""Initialise quantisation engine."""
|
|
self.fs = FilesystemService()
|
|
self.executor = QuantisationExecutor()
|
|
self.ggml_quantiser = GGMLQuantiser()
|
|
|
|
def quantise(self, context: QuantisationContext) -> QuantisationResult:
        """Perform quantisation using the specified configuration.

        Executes quantisation via a direct llama.cpp binary invocation with
        the tensor-override flags required for L and XL variants, falling
        back to a GGML implementation for basic types when the model
        architecture is unsupported. The quantisation context supplies all
        required parameters and settings.

        Returns:
            QuantisationResult with success status and file information.
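
        Example:
            Illustrative only; assumes ``engine`` is a QuantisationEngine and
            ``context`` comes from the calling workflow::

                result = engine.quantise(context)
                if not result.success:
                    logger.error(result.error_message)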
        """
        logger.info(
            f"⚙️ Creating {context.config.name} quantisation ({context.config.description})..."
        )

        output_path = context.get_output_path()

        # Check the input file exists before attempting quantisation
        if not context.f16_model_path.exists():
            error_msg = f"Input model file does not exist: {context.f16_model_path}"
            logger.error(f"❌ {error_msg}")
            return QuantisationResult(
                quantisation_type=QuantisationType(context.config.name),
                success=False,
                error_message=error_msg,
            )

        logger.info(f"🎯 Attempting {context.config.name} quantisation...")
        logger.info(f"📝 Source: {context.f16_model_path}")
        logger.info(f"📝 Target: {output_path}")

        # Determine whether this is a basic type with a GGML fallback
        basic_types = ["Q4_0", "Q5_0", "Q6_0", "Q8_0"]
        is_basic_type = context.config.name in basic_types

        try:
            # Try llama.cpp first for all types
            logger.info("🔧 Using llama.cpp binary for quantisation...")

            success = self.executor.execute_quantisation(
                context.f16_model_path, output_path, context.config, context.imatrix_path
            )

            if success:
                return self._create_success_result(context.config.name, output_path, "llama.cpp")

            # Check if this was an architecture error and we can use GGML fallback
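            # NOTE: this relies on QuantisationExecutor exposing ``last_error``
            # after a failed run; the hasattr guard covers executors that
            # never set it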
            if (
                hasattr(self.executor, "last_error")
                and self.executor.last_error == "unsupported_architecture"
                and is_basic_type
            ):
                logger.info("🔄 Architecture unsupported - using GGML implementation...")

                success = self.ggml_quantiser.try_alternative_quantisation(
                    context.f16_model_path, output_path, context.config.name
                )

                if success:
                    return self._create_success_result(
                        context.config.name, output_path, "GGML numpy"
                    )
logger.error(f"❌ {context.config.name} quantisation failed")
|
|
return QuantisationResult(
|
|
quantisation_type=QuantisationType(context.config.name),
|
|
success=False,
|
|
error_message="Quantisation failed via Python API",
|
|
)

        except Exception as e:
            logger.error(f"❌ Exception during {context.config.name} quantisation: {e}")
            logger.error("Exception traceback:")
            for line in traceback.format_exc().splitlines():
                logger.error(f" {line}")

            return QuantisationResult(
                quantisation_type=QuantisationType(context.config.name),
                success=False,
                error_message=f"Exception during quantisation: {e!s}",
            )

    def _create_success_result(
        self, quant_type: str, output_path: Path, method_used: str
    ) -> QuantisationResult:
        """Create successful quantisation result with file metadata.

        Constructs a successful quantisation result, recording the output
        file path, its size on disk, and the quantisation method used.

        Returns:
            QuantisationResult with file path and size information.
        """
        file_size = self.fs.get_file_size(output_path)
        return QuantisationResult(
            quantisation_type=QuantisationType(quant_type),
            success=True,
            file_path=output_path,
            file_size=file_size,
            method_used=method_used,
        )