llm-gguf-tools/helpers/services/llama_cpp.py
"""Direct llama.cpp binary execution service.
Provides direct execution of llama.cpp quantisation binary with proper
tensor-specific override support for L and XL variants.
"""
from __future__ import annotations
import os
import platform
import subprocess
from pathlib import Path
from typing import TYPE_CHECKING
from helpers.logger import logger
from helpers.services.binary_manager import BinaryManager
from helpers.services.filesystem import FilesystemService
if TYPE_CHECKING:
from helpers.models.quantisation import QuantisationConfig


class QuantisationExecutor:
    """Executes llama.cpp quantisation with tensor overrides.

    Provides direct binary execution with proper command-line flags for
    tensor-specific overrides, supporting Bartowski-style L and XL variants.
    """

    def __init__(self) -> None:
        """Initialise quantisation executor."""
        self.fs = FilesystemService()
        self.binary_manager = BinaryManager()
        self.quantise_binary = self._get_quantise_binary()
        self.last_error: str | None = None  # Track last error type
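        # Note: last_error is only ever set to "unsupported_architecture" (see
        # _execute_command below) so callers can distinguish unsupported models
        # from other quantisation failures.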

    def _get_quantise_binary(self) -> Path | None:
        """Get llama-quantize binary, downloading if necessary.

        Returns:
            Path to binary if found, None otherwise.
        """
        # First check local directory for manual placement
        local_binary = Path("./llama-quantize")
        if local_binary.exists():
            logger.info(f"Using local llama-quantize binary: {local_binary}")
            return local_binary

        # Download from GitHub releases
        binary_path = self.binary_manager.get_quantise_binary()
        if binary_path and self.binary_manager.check_binary_works(binary_path):
            logger.info(f"Using llama-quantize binary: {binary_path}")
            return binary_path

        logger.error("Failed to obtain llama-quantize binary")
        logger.info(
            "You can manually download it from: https://github.com/ggml-org/llama.cpp/releases"
        )
        return None

    def execute_quantisation(
        self,
        input_path: Path,
        output_path: Path,
        config: QuantisationConfig,
        imatrix_path: Path | None = None,
    ) -> bool:
        """Execute quantisation using the llama.cpp binary.

        Builds and executes the llama-quantize command with the proper tensor
        override flags for L and XL variants.

        Returns:
            True if quantisation successful, False otherwise.
        """
        if not self.quantise_binary:
            logger.error("llama-quantize binary not available")
            return False

        # Build command
        cmd = self._build_quantisation_command(input_path, output_path, config, imatrix_path)

        # Execute with real-time output
        return self._execute_command(cmd)

    def _build_quantisation_command(
        self,
        input_path: Path,
        output_path: Path,
        config: QuantisationConfig,
        imatrix_path: Path | None,
    ) -> list[str]:
        """Build llama-quantize command with tensor overrides.

        Returns:
            Command arguments as list.
        """
        cmd = [str(self.quantise_binary)]

        # Add imatrix if available
        if imatrix_path:
            cmd.extend(["--imatrix", str(imatrix_path)])
            if imatrix_path.exists():
                logger.info(f"🧮 Using imatrix: {imatrix_path.name}")

        # Add tensor-specific overrides for L and XL variants
        if config.embedding_type:
            # Use directly from config - already in correct format
            cmd.extend(["--token-embedding-type", config.embedding_type.lower()])
            logger.info(f"⚙️ Token embedding type: {config.embedding_type}")

        if config.output_type:
            # Use directly from config - already in correct format
            cmd.extend(["--output-tensor-type", config.output_type.lower()])
            logger.info(f"⚙️ Output tensor type: {config.output_type}")

        # Note: Per-layer tensor overrides could be added here if needed in future.
        # For now, embedding and output overrides handle the L/XL variants.

        # Get base quantisation type
        base_quant = self._get_base_quantisation_type(config.name)

        # Add input, output, and base quantisation type
        cmd.extend([str(input_path), str(output_path), base_quant])
        return cmd
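
    # Illustrative example of the kind of command _build_quantisation_command
    # produces (paths and tensor types here are hypothetical, for a
    # Q4_K_L-style variant):
    #   llama-quantize --imatrix ./model/imatrix.dat \
    #       --token-embedding-type q8_0 --output-tensor-type q8_0 \
    #       ./model/model-f16.gguf ./model/model-Q4_K_L.gguf Q4_K_M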

    def _get_base_quantisation_type(self, config_name: str) -> str:
        """Get base quantisation type for a config.

        Maps custom variants (Q3_K_L, Q3_K_XL) to their base types (Q3_K_M).

        Returns:
            Base quantisation type string.
        """
        # Mapping of custom variants to base types
        variant_mapping = {
            "Q3_K_L": "Q3_K_M",
            "Q3_K_XL": "Q3_K_M",
            "Q4_K_L": "Q4_K_M",
            "Q4_K_XL": "Q4_K_M",
            "Q5_K_L": "Q5_K_M",
            "Q5_K_XL": "Q5_K_M",
            "Q6_K_L": "Q6_K",
            "Q6_K_XL": "Q6_K",
        }
        return variant_mapping.get(config_name, config_name)
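
    # For example, a "Q4_K_L" config is quantised with the base "Q4_K_M" type;
    # names not present in the mapping (e.g. "Q4_K_M" itself) pass through
    # unchanged.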

    def _execute_command(self, cmd: list[str]) -> bool:
        """Execute command with real-time output streaming.

        Returns:
            True if successful, False otherwise.
        """
        logger.info(f"💻 Running: {' '.join(cmd)}")
        logger.info("⏳ Quantisation in progress... (this may take several minutes)")

        # Set LD_LIBRARY_PATH so the binary can find its shared libraries
        env = os.environ.copy()
        if platform.system() != "Windows":
            lib_path = str(self.binary_manager.BINARY_DIR)
            if "LD_LIBRARY_PATH" in env:
                env["LD_LIBRARY_PATH"] = f"{lib_path}:{env['LD_LIBRARY_PATH']}"
            else:
                env["LD_LIBRARY_PATH"] = lib_path

        # Track output for architecture detection
        output_lines = []
        architecture_error = False

        try:
            # Merge stderr into stdout and line-buffer so progress streams as it arrives
            process = subprocess.Popen(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                universal_newlines=True,
                bufsize=1,
                env=env,
            )

            # Stream output
            while True:
                if process.stdout is not None:
                    output = process.stdout.readline()
                else:
                    break
                if not output and process.poll() is not None:
                    break
                if output:
                    output_stripped = output.strip()
                    logger.info(f"📊 {output_stripped}")
                    output_lines.append(output_stripped)

                    # Check for architecture-related errors
                    if any(
                        phrase in output_stripped.lower()
                        for phrase in [
                            "unsupported architecture",
                            "unknown architecture",
                            "architecture not supported",
                            "model architecture",
                            "llama_model_load: error loading model",
                        ]
                    ):
                        architecture_error = True

            return_code = process.poll()
            if return_code == 0:
                logger.info("✅ Quantisation successful!")
                return True

            # Check if this was an architecture error
            if architecture_error or return_code == 1:
                # Look for architecture info in recent output
                for line in output_lines[-10:]:  # Check last 10 lines
                    if "architecture" in line.lower():
                        logger.error("❌ Architecture not supported by llama.cpp")
                        logger.error("   The model cannot be quantised with the current llama.cpp build,")
                        logger.error("   but the F16 GGUF file can still be used for inference where supported.")
                        # Store this for the orchestrator to detect
                        self.last_error = "unsupported_architecture"
                        return False

            logger.error(f"❌ Quantisation failed with return code {return_code}")
        except Exception as e:
            logger.error(f"❌ Quantisation failed with exception: {e}")
            return False
        else:
            return False


class IMatrixHandler:
    """Handles importance matrix file management.

    Manages detection and use of existing importance matrix files for
    quantisation guidance.
    """

    def __init__(self) -> None:
        """Initialise IMatrixHandler."""
        self.fs = FilesystemService()

    def find_imatrix(self, model_dir: Path) -> Path | None:
        """Find existing imatrix file in model directory.

        Returns:
            Path to imatrix file if found, None otherwise.
        """
        imatrix_path = model_dir / "imatrix.dat"

        if imatrix_path.exists():
            file_size = self.fs.get_file_size(imatrix_path)
            logger.info(f"Found existing imatrix: {imatrix_path.name} ({file_size})")
            return imatrix_path

        return None

    def prompt_for_user_imatrix(self, model_dir: Path) -> Path | None:
        """Prompt user for existing imatrix file.

        Returns:
            Path to user-provided imatrix, or None if not available.
        """
        imatrix_path = model_dir / "imatrix.dat"

        logger.info(f"Model directory: {model_dir}")
        logger.info(f"Looking for imatrix file at: {imatrix_path}")
        logger.info(
            "Tip: You can download pre-computed imatrix files from Bartowski's repositories!"
        )
        logger.info(
            "   Example: https://huggingface.co/bartowski/MODEL-NAME-GGUF/resolve/main/MODEL-NAME.imatrix"
        )

        response = (
            input("\n❓ Do you have an imatrix file to place in the model directory? (y/N): ")
            .strip()
            .lower()
        )
        if response != "y":
            return None

        logger.info(f"Please place your imatrix.dat file in: {model_dir}")
        input("⏳ Press Enter when you've placed the imatrix.dat file (or Ctrl+C to cancel)...")

        if imatrix_path.exists():
            file_size = self.fs.get_file_size(imatrix_path)
            logger.info(f"Found imatrix file! ({file_size})")
            return imatrix_path

        logger.warning("No imatrix.dat file found - continuing without imatrix")
        return None
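

# Illustrative usage sketch (not part of the original module). The surrounding
# orchestration and the file paths shown here are assumptions; only
# QuantisationExecutor, IMatrixHandler and their methods above are real.
#
#     from pathlib import Path
#
#     model_dir = Path("./models/example-model")   # hypothetical location
#     config = ...                                 # a QuantisationConfig, e.g. for Q4_K_L
#
#     executor = QuantisationExecutor()
#     imatrix = IMatrixHandler().find_imatrix(model_dir)
#     ok = executor.execute_quantisation(
#         input_path=model_dir / "model-f16.gguf",
#         output_path=model_dir / "model-Q4_K_L.gguf",
#         config=config,
#         imatrix_path=imatrix,
#     )
#     if not ok and executor.last_error == "unsupported_architecture":
#         ...  # e.g. fall back to shipping the F16 GGUF without quantisation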