"""llama.cpp environment and operations service.
Manages llama.cpp binary discovery, environment setup, and imatrix generation.
Provides consistent interface for interacting with llama.cpp tools across
different installation methods.
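
Typical usage (an illustrative sketch; ``work_dir``, ``f16_path`` and
``model_dir`` are assumed to be existing ``Path`` objects):

    env = EnvironmentManager(work_dir).setup()
    imatrix = IMatrixGenerator().generate_imatrix(f16_path, env, model_dir)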
"""

from __future__ import annotations

import subprocess
from pathlib import Path

from helpers.logger import logger
from helpers.models.quantisation import LlamaCppEnvironment
from helpers.services.filesystem import FilesystemService


class EnvironmentManager:
"""Manages llama.cpp environment setup and binary discovery.
Handles detection of local binaries, repository setup, and conversion
script location. Provides fallback strategies for different installation
scenarios including local builds and repository-based setups.
"""

    def __init__(self, work_dir: Path) -> None:
"""Initialise EnvironmentManager."""
self.work_dir = work_dir
self.llama_cpp_dir = work_dir / "llama.cpp"
self.fs = FilesystemService()

    def setup(self) -> LlamaCppEnvironment:
        """Set up llama.cpp environment with automatic detection.

        Checks for local llama.cpp binaries first, then falls back to
        repository-based setup if needed. Handles conversion script location,
        dependency installation, and path resolution.

        Returns:
            Configured LlamaCppEnvironment instance.
        """
        # Check for local binaries first
        local_env = self._check_local_binaries()
        if local_env:
            # Local binaries may still rely on the repository conversion
            # script; ensure the repository is present before returning
            if local_env.use_repo and not self.llama_cpp_dir.exists():
                return self.setup_repository()
            return local_env

        # Set up repository if needed
        return self.setup_repository()

    def _check_local_binaries(self) -> LlamaCppEnvironment | None:
        """Check for existing llama.cpp binaries in current directory.

        Searches for the quantise and CLI binaries in the current working
        directory and locates a conversion script alongside them.

        Returns:
            LlamaCppEnvironment if binaries found, None otherwise.
        """
quantise_bin = Path("./llama-quantize")
cli_bin = Path("./llama-cli")
if not (quantise_bin.exists() and cli_bin.exists()):
return None
logger.info("Found llama.cpp binaries in current directory")
# Check for conversion script
convert_script = self._find_convert_script()
if convert_script:
logger.info(f"Found conversion script: {convert_script}")
return LlamaCppEnvironment(
quantise_binary=quantise_bin.resolve(),
cli_binary=cli_bin.resolve(),
convert_script=convert_script,
use_repo=False,
)
logger.warning("No conversion script found in current directory")
logger.info("Will use llama.cpp repository method for conversion")
return LlamaCppEnvironment(
quantise_binary=quantise_bin.resolve(),
cli_binary=cli_bin.resolve(),
convert_script=f"python3 {self.llama_cpp_dir}/convert_hf_to_gguf.py",
use_repo=True,
)

    def _find_convert_script(self) -> str | None:
        """Find conversion script in current directory.

        Searches for various naming conventions of the HF to GGUF
        conversion script.

        Returns:
            Command to run conversion script, or None if not found.
        """
scripts = [
"./llama-convert-hf-to-gguf",
"python3 ./convert_hf_to_gguf.py",
"python3 ./convert-hf-to-gguf.py",
]
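        # Entries prefixed with "python3" are invoker + script; only the
        # script path component is tested for existence below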
for script in scripts:
if script.startswith("python3"):
script_path = script.split(" ", 1)[1]
if Path(script_path).exists():
return script
elif Path(script).exists():
return script
return None

    def setup_repository(self) -> LlamaCppEnvironment:
        """Set up llama.cpp repository for conversion scripts.

        Clones the llama.cpp repository if not present and installs
        Python dependencies for model conversion.

        Returns:
            LlamaCppEnvironment configured with repository paths.
        """
if not self.llama_cpp_dir.exists():
logger.info("Cloning llama.cpp for conversion script...")
subprocess.run(
[
"git",
"clone",
"https://github.com/ggerganov/llama.cpp.git",
str(self.llama_cpp_dir),
],
check=True,
)
# Install Python requirements
logger.info("Installing Python requirements...")
subprocess.run(
[
"pip3",
"install",
"-r",
"requirements.txt",
"--break-system-packages",
"--root-user-action=ignore",
],
cwd=self.llama_cpp_dir,
check=True,
)
# Install additional conversion dependencies
logger.info("Installing additional conversion dependencies...")
subprocess.run(
[
"pip3",
"install",
"transformers",
"sentencepiece",
"protobuf",
"--break-system-packages",
"--root-user-action=ignore",
],
check=True,
)
        else:
            logger.info("llama.cpp repository already exists")

        # Use local binaries (if present) with the repository conversion script
        quantise_bin = Path("./llama-quantize").resolve()
        cli_bin = Path("./llama-cli").resolve()
        if not (quantise_bin.exists() and cli_bin.exists()):
            logger.warning(
                "llama-quantize/llama-cli not found locally - quantisation "
                "will fail until the llama.cpp binaries are built or installed"
            )
        return LlamaCppEnvironment(
            quantise_binary=quantise_bin,
            cli_binary=cli_bin,
            convert_script=f"python3 {self.llama_cpp_dir}/convert_hf_to_gguf.py",
            use_repo=True,  # conversion script comes from the cloned repository
        )
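
# Illustrative resolution order for EnvironmentManager.setup() (assumes any
# llama.cpp binaries live in the current working directory):
#   1. ./llama-quantize + ./llama-cli + local convert script -> use local script
#   2. ./llama-quantize + ./llama-cli, no local script       -> clone llama.cpp
#      and use its convert_hf_to_gguf.py
#   3. no local binaries                                     -> repository setup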


class IMatrixGenerator:
    """Handles importance matrix generation for quantisation guidance.

    Generates or locates importance matrices that guide quantisation
    decisions, helping preserve model quality by identifying critical
    tensors requiring higher precision.
    """

    def __init__(self) -> None:
"""Initialise IMatrixGenerator."""
self.fs = FilesystemService()

    def generate_imatrix(
        self, f16_model_path: Path, llama_env: LlamaCppEnvironment, model_dir: Path
    ) -> Path | None:
        """Generate importance matrix for quantisation guidance.

        Searches for existing imatrix files first, provides interactive
        prompts for user-supplied matrices, then generates new matrices
        using calibration data if necessary.

        Returns:
            Path to imatrix file, or None if generation fails.
        """
imatrix_path = model_dir / "imatrix.dat"
# Check for existing imatrix
if imatrix_path.exists():
logger.info(f"Found existing imatrix: {imatrix_path.name}")
return imatrix_path
# Try user-provided imatrix
user_imatrix = self._prompt_for_user_imatrix(model_dir, imatrix_path)
if user_imatrix:
return user_imatrix
# Generate new imatrix
calibration_file = self._get_calibration_file()
if not calibration_file:
return None
return self._generate_new_imatrix(f16_model_path, llama_env, imatrix_path, calibration_file)

    def _prompt_for_user_imatrix(self, model_dir: Path, imatrix_path: Path) -> Path | None:
        """Prompt user for existing imatrix file.

        Returns:
            Path to user-provided imatrix, or None if not available.
        """
logger.info(f"Model directory: {model_dir}")
logger.info(f"Looking for imatrix file at: {imatrix_path}")
logger.info(
"Tip: You can download pre-computed imatrix files from Bartowski's repositories!"
)
logger.info(
" Example: https://huggingface.co/bartowski/MODEL-NAME-GGUF/resolve/main/MODEL-NAME.imatrix"
)
response = (
input("\n❓ Do you have an imatrix file to place in the model directory? (y/N): ")
.strip()
.lower()
)
if response != "y":
return None
logger.info(f"Please place your imatrix.dat file in: {model_dir}")
input("⏳ Press Enter when you've placed the imatrix.dat file (or Ctrl+C to cancel)...")
if imatrix_path.exists():
file_size = self.fs.get_file_size(imatrix_path)
logger.info(f"Found imatrix file! ({file_size})")
return imatrix_path
logger.warning("No imatrix.dat file found - continuing with automatic generation")
return None

    def _get_calibration_file(self) -> Path | None:
        """Get calibration data file for imatrix generation.

        Returns:
            Path to calibration file, or None if not found.
        """
calibration_file = Path(__file__).parent.parent.parent / "resources" / "imatrix_data.txt"
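        # The expected corpus is Bartowski's calibration_datav3 dataset (see
        # URL below): general text over which activation statistics are
        # collected to estimate per-tensor importance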
if not calibration_file.exists():
logger.warning("resources/imatrix_data.txt not found - skipping imatrix generation")
logger.info(
"Download from: https://gist.githubusercontent.com/bartowski1182/"
"eb213dccb3571f863da82e99418f81e8/raw/calibration_datav3.txt"
)
return None
return calibration_file

    def _generate_new_imatrix(
        self,
        f16_model_path: Path,
        llama_env: LlamaCppEnvironment,
        imatrix_path: Path,
        calibration_file: Path,
    ) -> Path | None:
        """Generate new importance matrix using calibration data.

        Returns:
            Path to generated imatrix, or None if generation fails.
        """
logger.info("Generating importance matrix (this may take 1-4 hours for large models)...")
logger.info(f"Model: {f16_model_path.name}")
logger.info(f"Calibration: {calibration_file}")
logger.info(f"Output: {imatrix_path}")
# Find imatrix binary
imatrix_binary = self._find_imatrix_binary(llama_env)
if not imatrix_binary:
logger.warning("llama-imatrix binary not found - skipping imatrix generation")
logger.info("Make sure llama-imatrix is in the same directory as llama-quantize")
return None
# Build and execute command
cmd = self._build_imatrix_command(
imatrix_binary, f16_model_path, calibration_file, imatrix_path
)
return self._execute_imatrix_generation(cmd, imatrix_path)

    def _build_imatrix_command(
        self, binary: Path, model_path: Path, calibration_file: Path, output_path: Path
    ) -> list[str]:
        """Build imatrix generation command.

        Returns:
            Command arguments as list.
        """
        return [
            str(binary),
            "-m",
            str(model_path),  # source model (F16 GGUF)
            "-f",
            str(calibration_file),  # calibration text corpus
            "-o",
            str(output_path),
            "--process-output",  # also collect statistics for the output tensor
            "--output-frequency",
            "10",  # write the interim imatrix every 10 chunks
            "--save-frequency",
            "50",  # keep a numbered snapshot every 50 chunks
            "-t",
            "8",  # worker threads
            "-c",
            "2048",  # context size per evaluation chunk
            "-b",
            "512",  # batch size
        ]

    def _execute_imatrix_generation(self, cmd: list[str], imatrix_path: Path) -> Path | None:
        """Execute imatrix generation command with real-time output.

        Returns:
            Path to generated imatrix file, or None if generation fails.
        """
logger.info(f"Running: {' '.join(cmd)}")
logger.info("Starting imatrix generation... (progress will be shown)")
        try:
            process = subprocess.Popen(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,  # merge stderr so progress lines stay in order
                universal_newlines=True,
                bufsize=1,  # line-buffered in text mode for real-time streaming
            )

            self._stream_imatrix_output(process)

            return_code = process.wait()
            if return_code == 0:
                return self._validate_imatrix_output(imatrix_path)
        except KeyboardInterrupt:
            logger.info("imatrix generation cancelled by user")
            process.terminate()
            process.wait()  # reap the child so it doesn't linger as a zombie
            return None
except Exception as e:
logger.error(f"imatrix generation failed with exception: {e}")
return None
else:
logger.error(f"imatrix generation failed with return code {return_code}")
return None

    def _stream_imatrix_output(self, process: subprocess.Popen) -> None:
        """Stream imatrix generation output in real-time."""
        if process.stdout is None:
            return
        while True:
            output = process.stdout.readline()
            if not output and process.poll() is not None:
                break
            if output:
                line = output.strip()
                if self._should_log_imatrix_line(line):
                    logger.info(line)

    def _should_log_imatrix_line(self, line: str) -> bool:
        """Determine if imatrix output line should be logged.

        Returns:
            True if line should be logged, False otherwise.
        """
keywords = ["Computing imatrix", "perplexity:", "save_imatrix", "entries =", "ETA"]
return any(keyword in line for keyword in keywords) or line.startswith("[")

    def _validate_imatrix_output(self, imatrix_path: Path) -> Path | None:
        """Validate generated imatrix file.

        Returns:
            Path to imatrix if valid, None otherwise.
        """
if imatrix_path.exists():
file_size = self.fs.get_file_size(imatrix_path)
logger.info(f"imatrix generation successful! ({file_size})")
return imatrix_path
logger.error("imatrix generation completed but file not found")
return None

    def _find_imatrix_binary(self, llama_env: LlamaCppEnvironment) -> Path | None:
        """Find llama-imatrix binary in common locations.

        Searches for the imatrix binary in the current directory and
        standard installation paths.

        Returns:
            Path to imatrix binary, or None if not found.
        """
candidates = [
Path("./llama-imatrix"),
llama_env.quantise_binary.parent / "llama-imatrix",
Path("/usr/local/bin/llama-imatrix"),
Path("/usr/bin/llama-imatrix"),
]
for candidate in candidates:
if candidate.exists() and candidate.is_file():
return candidate
return None