llm-gguf-tools/helpers/llama_cpp/quantiser.py

"""Direct llama.cpp quantisation execution.
Provides direct execution of llama.cpp quantisation binary with proper
tensor-specific override support for L and XL variants.
"""
from __future__ import annotations
import os
import platform
import subprocess
from pathlib import Path
from typing import TYPE_CHECKING
from helpers.filesystem import FilesystemService
from helpers.llama_cpp.binary_manager import BinaryManager
from helpers.logger import logger
if TYPE_CHECKING:
from helpers.models.quantisation import QuantisationConfig
class QuantisationExecutor:
    """Executes llama.cpp quantisation with tensor overrides.

    Provides direct binary execution with proper command-line flags for
    tensor-specific overrides, supporting Bartowski-style L and XL variants.
    """

    def __init__(self) -> None:
        """Initialise quantisation executor."""
        self.fs = FilesystemService()
        self.binary_manager = BinaryManager()
        self.quantise_binary = self._get_quantise_binary()
        self.last_error: str | None = None  # Track last error type

    def _get_quantise_binary(self) -> Path | None:
        """Get llama-quantize binary, downloading if necessary.

        Returns:
            Path to binary if found, None otherwise.
        """
        # First check local directory for manual placement
        local_binary = Path("./llama-quantize")
        if local_binary.exists():
            logger.info(f"Using local llama-quantize binary: {local_binary}")
            return local_binary

        # Download from GitHub releases
        binary_path = self.binary_manager.get_quantise_binary()
        if binary_path and self.binary_manager.check_binary_works(binary_path):
            logger.info(f"Using llama-quantize binary: {binary_path}")
            return binary_path

        logger.error("Failed to obtain llama-quantize binary")
        logger.info(
            "You can manually download it from: https://github.com/ggml-org/llama.cpp/releases"
        )
        return None
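
    # Note: a binary placed manually at ./llama-quantize is trusted as-is; only the
    # binary downloaded by BinaryManager is verified with check_binary_works().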

    def execute_quantisation(
        self,
        input_path: Path,
        output_path: Path,
        config: QuantisationConfig,
        imatrix_path: Path | None = None,
    ) -> bool:
        """Execute quantisation using llama.cpp binary.

        Builds and executes llama-quantize command with proper tensor override
        flags for L and XL variants.

        Returns:
            True if quantisation successful, False otherwise.
        """
        if not self.quantise_binary:
            logger.error("llama-quantize binary not available")
            return False

        # Build command
        cmd = self._build_quantisation_command(input_path, output_path, config, imatrix_path)

        # Execute with real-time output
        return self._execute_command(cmd)

    def _build_quantisation_command(
        self,
        input_path: Path,
        output_path: Path,
        config: QuantisationConfig,
        imatrix_path: Path | None,
    ) -> list[str]:
        """Build llama-quantize command with tensor overrides.

        Returns:
            Command arguments as list.
        """
        cmd = [str(self.quantise_binary)]

        # Add imatrix if available
        if imatrix_path:
            cmd.extend(["--imatrix", str(imatrix_path)])

        # Add tensor overrides for L and XL variants
        if config.output_type:
            cmd.extend(["--output-tensor-type", config.output_type])
        if config.embedding_type:
            cmd.extend(["--token-embedding-type", config.embedding_type])

        # Add input, output, and quantisation type
        cmd.extend([str(input_path), str(output_path), config.base_type])

        return cmd
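
    # For illustration only (file names and types below are hypothetical), an XL-style
    # variant with an importance matrix would yield a command along the lines of:
    #
    #   llama-quantize --imatrix imatrix.dat \
    #       --output-tensor-type Q8_0 --token-embedding-type Q8_0 \
    #       model-f16.gguf model-Q5_K_XL.gguf Q5_K_M
    #
    # i.e. optional flags first, then the positional input path, output path, and
    # base quantisation type expected by llama-quantize.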

    def _setup_environment(self) -> dict[str, str]:
        """Set up environment variables for quantisation command.

        Returns:
            Environment dictionary with necessary library paths.
        """
        env = os.environ.copy()
        if platform.system() != "Windows":
            lib_path = str(self.binary_manager.BINARY_DIR)
            if "LD_LIBRARY_PATH" in env:
                env["LD_LIBRARY_PATH"] = f"{lib_path}:{env['LD_LIBRARY_PATH']}"
            else:
                env["LD_LIBRARY_PATH"] = lib_path
        return env
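
    # Prepending BinaryManager.BINARY_DIR to LD_LIBRARY_PATH lets the dynamic linker
    # find any shared libraries the managed download places alongside llama-quantize;
    # on Windows, DLLs next to the executable are found without this step.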

    def _process_output_stream(self, process: subprocess.Popen) -> tuple[list[str], bool]:
        """Process subprocess output stream and detect errors.

        Returns:
            Tuple of (output_lines, architecture_error_detected).
        """
        output_lines = []
        architecture_error = False

        if process.stdout:
            for line in iter(process.stdout.readline, ""):
                if line:
                    cleaned_line = line.rstrip()
                    output_lines.append(cleaned_line)
                    logger.info(f" {cleaned_line}")

                    # Check for architecture errors
                    if any(
                        error_text in cleaned_line.lower()
                        for error_text in [
                            "unknown model architecture",
                            "unsupported architecture",
                            "unknown architecture",
                            "architecture not supported",
                            "model architecture",
                            "llama_model_load: error loading model",
                        ]
                    ):
                        architecture_error = True

        return output_lines, architecture_error
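
    # The match above is substring-based and case-insensitive, so any output line
    # containing a phrase such as "unknown model architecture" flags the run as a
    # probable architecture failure; the exit code still decides overall success.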

    def _handle_architecture_error(self, output_lines: list[str]) -> bool:
        """Handle architecture-related errors by checking output.

        Returns:
            True if architecture error was detected and handled.
        """
        # Look for architecture info in recent output
        for line in output_lines[-10:]:  # Check last 10 lines
            if "architecture" in line.lower():
                logger.error("❌ Architecture not supported by current llama.cpp,")
                logger.error("   so it cannot be quantised, but the F16 GGUF file")
                logger.error("   can still be used for inference where supported")
                # Store this for the orchestrator to detect
                self.last_error = "unsupported_architecture"
                return True
        return False
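
    # last_error is left as a simple string flag ("unsupported_architecture") so the
    # calling orchestrator can distinguish this case from a generic quantisation failure.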

    def _execute_command(self, cmd: list[str]) -> bool:
        """Execute command with real-time output streaming.

        Returns:
            True if successful, False otherwise.
        """
        try:
            logger.info(f"🔧 Executing: {' '.join(cmd)}")

            env = self._setup_environment()

            # Execute with real-time output (stderr merged into stdout, line-buffered)
            process = subprocess.Popen(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                text=True,
                bufsize=1,
                env=env,
            )

            output_lines, architecture_error = self._process_output_stream(process)

            # Wait for the process to exit so the return code is reliable
            return_code = process.wait()

            if return_code == 0:
                logger.info("✅ Quantisation successful!")
                return True

            # Check if this was an architecture error
            if (architecture_error or return_code == 1) and self._handle_architecture_error(
                output_lines
            ):
                return False

            logger.error(f"❌ Quantisation failed with return code {return_code}")
        except Exception as e:
            logger.error(f"❌ Quantisation failed with exception: {e}")

        return False
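
# Example usage (illustrative sketch; the paths and the QuantisationConfig fields shown
# are hypothetical placeholders rather than part of this module):
#
#     from pathlib import Path
#
#     executor = QuantisationExecutor()
#     ok = executor.execute_quantisation(
#         input_path=Path("model-f16.gguf"),
#         output_path=Path("model-Q4_K_M.gguf"),
#         config=config,                     # a QuantisationConfig, e.g. base_type="Q4_K_M"
#         imatrix_path=Path("imatrix.dat"),  # optional importance matrix
#     )
#     if not ok and executor.last_error == "unsupported_architecture":
#         # llama.cpp cannot quantise this architecture; fall back to the F16 GGUF
#         ...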