"""Quantisation operations service.
Provides modular quantisation engine, model management, and upload capabilities
for GGUF model processing. Consolidates quantisation logic from various tools
into reusable components following SOLID principles.
"""

from __future__ import annotations

import shutil
import subprocess
from typing import TYPE_CHECKING

from helpers.logger import logger
from helpers.models.quantisation import (
    ModelSource,
    QuantisationContext,
    QuantisationResult,
    QuantisationType,
)
from helpers.services.filesystem import FilesystemService

if TYPE_CHECKING:
    from pathlib import Path

    from helpers.models.quantisation import LlamaCppEnvironment
    from helpers.services.llama_cpp import EnvironmentManager


class QuantisationEngine:
    """Handles the actual quantisation process with configurable methods.

    Provides flexible quantisation execution supporting multiple tensor
    precision configurations, importance matrices, and fallback strategies.
    Encapsulates llama-quantize binary interactions with real-time output.
    """

    def __init__(self) -> None:
        """Initialise quantisation engine."""
        self.fs = FilesystemService()

    def quantise(self, context: QuantisationContext) -> QuantisationResult:
        """Perform quantisation using the specified configuration.

        Executes quantisation with primary and fallback methods, handling
        tensor-specific precision overrides and importance matrix guidance.

        Returns:
            QuantisationResult with success status and file information.
        """
        logger.info(
            f"⚙️ Creating {context.config.name} quantisation ({context.config.description})..."
        )
        output_path = context.get_output_path()

        logger.info(f"🎯 Attempting {context.config.name} quantisation...")
        logger.info(f"📝 Source: {context.f16_model_path}")
        logger.info(f"📝 Target: {output_path}")

        # Try primary method
        if self._try_quantisation_method(
            context, output_path, context.config.tensor_types, "method 1"
        ):
            return self._create_success_result(context.config.name, output_path, "method 1")

        # Try fallback methods
        for i, fallback_method in enumerate(context.config.fallback_methods, 2):
            method_name = f"method {i}"
            if self._try_quantisation_method(context, output_path, fallback_method, method_name):
                return self._create_success_result(context.config.name, output_path, method_name)

        logger.error(f"All {context.config.name} quantisation methods failed")
        return QuantisationResult(
            quantisation_type=QuantisationType(context.config.name),
            success=False,
            error_message="All quantisation methods failed",
        )

    def _try_quantisation_method(
        self,
        context: QuantisationContext,
        output_path: Path,
        tensor_config: dict[str, str],
        method_name: str,
    ) -> bool:
        """Try a specific quantisation method with real-time output.

        Builds and executes the llama-quantize command with appropriate
        parameters, streaming output for progress monitoring.

        Returns:
            True if quantisation successful, False otherwise.
        """
        logger.info(f"🔍 Trying {method_name}...")
        cmd = self._build_quantisation_command(context, output_path, tensor_config)
        return self._execute_quantisation_command(cmd, method_name)

    def _build_quantisation_command(
        self, context: QuantisationContext, output_path: Path, tensor_config: dict[str, str]
    ) -> list[str]:
        """Build quantisation command with all required parameters.

        Returns:
            List of command arguments.
        """
        cmd = [str(context.llama_env.quantise_binary)]

        # Add imatrix if available
        if context.imatrix_path and context.imatrix_path.exists():
            cmd.extend(["--imatrix", str(context.imatrix_path)])
            logger.info(f"🧮 Using imatrix: {context.imatrix_path.name}")

        # Add tensor type arguments
        self._add_tensor_type_arguments(cmd, tensor_config)

        cmd.extend([str(context.f16_model_path), str(output_path), context.base_quant])
        return cmd
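
    # For illustration, a command assembled above might look like the following
    # (binary path, tensor overrides, and base type are hypothetical):
    #
    #     llama-quantize --imatrix imatrix.dat \
    #         --output-tensor-type Q8_0 --tensor-type attn_v=Q6_K \
    #         model-f16.gguf model-Q4_K_M.gguf Q4_K_M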

    def _add_tensor_type_arguments(self, cmd: list[str], tensor_config: dict[str, str]) -> None:
        """Add tensor type arguments to command."""
        if not tensor_config:
            return

        for tensor_name, quant_type in tensor_config.items():
            if tensor_name.startswith(("token-embedding-type", "output-tensor-type")):
                cmd.extend([f"--{tensor_name}", quant_type])
            else:
                cmd.extend(["--tensor-type", f"{tensor_name}={quant_type}"])

    def _execute_quantisation_command(self, cmd: list[str], method_name: str) -> bool:
        """Execute quantisation command with real-time output.

        Returns:
            True if quantisation successful, False otherwise.
        """
        logger.info(f"💻 Running: {' '.join(cmd)}")
        logger.info("⏳ Quantisation in progress... (this may take several minutes)")

        try:
            process = subprocess.Popen(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                universal_newlines=True,
                bufsize=1,
            )
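            # Text mode (universal_newlines) with bufsize=1 gives line-buffered
            # output, so progress lines can be logged as they arrive.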
            self._stream_quantisation_output(process)

            # Wait for the process to finish so the exit code is definitive
            # (poll() can return None if the process has not yet terminated)
            return_code = process.wait()
            if return_code == 0:
                logger.info(f"{method_name} quantisation successful!")
                return True
        except Exception as e:
            logger.info(f"{method_name} failed with exception: {e}")
            return False
        else:
            logger.info(f"{method_name} failed with return code {return_code}")
            return False

    def _stream_quantisation_output(self, process: subprocess.Popen) -> None:
        """Stream quantisation output in real-time."""
        if process.stdout is None:
            return
        while True:
            output = process.stdout.readline()
            if not output and process.poll() is not None:
                break
            if output:
                logger.info(f"📊 {output.strip()}")

    def _create_success_result(
        self, quant_type: str, output_path: Path, method_used: str
    ) -> QuantisationResult:
        """Create successful quantisation result with file metadata.

        Returns:
            QuantisationResult with file path and size information.
        """
        file_size = self.fs.get_file_size(output_path)
        return QuantisationResult(
            quantisation_type=QuantisationType(quant_type),
            success=True,
            file_path=output_path,
            file_size=file_size,
            method_used=method_used,
        )


class ModelManager:
    """Handles model downloading and preparation for quantisation.

    Manages both GGUF repository downloads and HuggingFace model conversions,
    providing a unified interface for model acquisition and preparation.
    """

    def __init__(self, models_dir: Path, environment_manager: EnvironmentManager) -> None:
        """Initialise model manager with storage and environment configuration.

        Sets up model storage directory and links to environment manager for
        conversion script access and llama.cpp tool discovery.
        """
        self.models_dir = models_dir
        self.environment_manager = environment_manager
        self.fs = FilesystemService()

    def prepare_model(self, model_source: ModelSource, llama_env: LlamaCppEnvironment) -> Path:
        """Prepare model for quantisation and return F16 model path.

        Handles both GGUF repository downloads and regular HuggingFace model
        conversion workflows with automatic format detection.

        Returns:
            Path to F16 GGUF model ready for quantisation.
        """
        model_dir = self.models_dir / model_source.model_name

        if model_source.is_gguf_repo:
            return self._handle_gguf_repo(model_source, model_dir)
        return self._handle_regular_repo(model_source, model_dir, llama_env)

    def _handle_gguf_repo(self, model_source: ModelSource, model_dir: Path) -> Path:
        """Handle GGUF repository download with pattern matching.

        Downloads GGUF files matching specified patterns, prioritising
        multi-part files and F16 variants.

        Returns:
            Path to downloaded or existing GGUF file.
        """
        logger.info(f"⬇️ Downloading GGUF file from repository: {model_source.source_model}")
        logger.info(f"🔍 Looking for file pattern: *{model_source.gguf_file_pattern}*")

        f16_model = model_dir / f"{model_source.model_name}-f16.gguf"
        if f16_model.exists():
            logger.info(f"✅ Found existing F16 file: {f16_model.name}")
            return f16_model

        # Check for existing GGUF files
        model_dir.mkdir(parents=True, exist_ok=True)
        existing_gguf = self.fs.find_gguf_files(model_dir)
        if existing_gguf:
            logger.info(f"✅ Found existing GGUF file: {existing_gguf[0].name}")
            return existing_gguf[0]

        # Download with patterns
        downloaded_file = self._download_gguf_with_patterns(
            model_source.source_model, model_source.gguf_file_pattern, model_dir
        )
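        # Multi-part GGUFs follow an "-NNNNN-of-NNNNN" shard naming scheme;
        # llama.cpp only needs the first shard and locates the rest itself.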
        if downloaded_file:
            # Handle multi-part files
            if "00001-of-" in downloaded_file.name:
                return downloaded_file
            if "-00002-of-" in downloaded_file.name or "-00003-of-" in downloaded_file.name:
                base_name = downloaded_file.name.replace("-00002-of-", "-00001-of-").replace(
                    "-00003-of-", "-00001-of-"
                )
                first_part = downloaded_file.parent / base_name
                if first_part.exists():
                    logger.info(f"🔄 Using first part: {first_part.name}")
                    return first_part

            # Rename single file to standard name
            downloaded_file.rename(f16_model)
            return f16_model

        # Fallback to regular conversion
        logger.info("💡 Falling back to downloading full repository and converting...")
        return self._handle_regular_repo(
            ModelSource(**{**model_source.dict(), "is_gguf_repo": False}),
            model_dir,
            None,
        )

    def _download_gguf_with_patterns(
        self, source_model: str, pattern: str | None, model_dir: Path
    ) -> Path | None:
        """Download GGUF file using various pattern strategies.

        Tries multiple pattern variations to find and download appropriate
        GGUF files, handling timeouts and temporary directories.

        Returns:
            Path to downloaded file, or None if all patterns fail.
        """
        if pattern:
            patterns = [
                f"*{pattern}*",
                f"*{pattern.lower()}*",
                f"*{pattern.upper()}*",
                "*f16*",
                "*F16*",
                "*fp16*",
            ]
        else:
            patterns = ["*f16*", "*F16*", "*fp16*"]

        temp_dir = model_dir / "gguf_temp"

        for search_pattern in patterns:
            logger.info(f"🔍 Trying pattern: {search_pattern}")
            temp_dir.mkdir(exist_ok=True)

            try:
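                # The coreutils `timeout` wrapper caps each download attempt at
                # 300 seconds, so a stalled pattern cannot block the whole loop.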
                subprocess.run(
                    [
                        "timeout",
                        "300",
                        "huggingface-cli",
                        "download",
                        source_model,
                        "--include",
                        search_pattern,
                        "--local-dir",
                        str(temp_dir),
                    ],
                    check=True,
                    capture_output=True,
                )

                # Find downloaded GGUF files
                gguf_files = self.fs.find_gguf_files(temp_dir, pattern)
                if gguf_files:
                    found_file = gguf_files[0]
                    logger.info(f"✅ Found GGUF file: {found_file.name}")

                    # Move to parent directory
                    final_path = model_dir / found_file.name
                    shutil.move(str(found_file), str(final_path))
                    shutil.rmtree(temp_dir)
                    return final_path
            except subprocess.CalledProcessError:
                logger.info(f"⚠️ Pattern {search_pattern} failed or timed out")
                continue
            finally:
                if temp_dir.exists():
                    shutil.rmtree(temp_dir, ignore_errors=True)

        return None

    def _handle_regular_repo(
        self,
        model_source: ModelSource,
        model_dir: Path,
        llama_env: LlamaCppEnvironment | None,
    ) -> Path:
        """Handle regular HuggingFace repository conversion.

        Downloads full model repository and converts to F16 GGUF format
        using llama.cpp conversion scripts.

        Returns:
            Path to converted F16 GGUF model.
        """
        logger.info(f"⬇️ Downloading source model: {model_source.source_model}")

        if not model_dir.exists():
            subprocess.run(
                [
                    "huggingface-cli",
                    "download",
                    model_source.source_model,
                    "--local-dir",
                    str(model_dir),
                ],
                check=True,
            )
        else:
            logger.info("✅ Model already downloaded")
logger.info("🔄 Converting to GGUF F16 format...")
f16_model = model_dir / f"{model_source.model_name}-f16.gguf"
if not f16_model.exists():
if not llama_env:
llama_env = self.environment_manager.setup()
# Ensure conversion script is available
if llama_env.use_repo or not self.environment_manager.llama_cpp_dir.exists():
logger.info("Getting conversion script from llama.cpp repository...")
llama_env = self.environment_manager.setup_repository()
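
            # convert_script may be a multi-word command (e.g. an interpreter
            # plus a script path), hence the split into argv components.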
            subprocess.run(
                [
                    *llama_env.convert_script.split(),
                    str(model_dir),
                    "--outtype",
                    "f16",
                    "--outfile",
                    str(f16_model),
                ],
                check=True,
            )
        else:
            logger.info("✅ F16 model already exists")

        return f16_model


class HuggingFaceUploader:
    """Handles uploading models and documentation to HuggingFace.

    Provides methods for repository creation, file uploads, and README
    updates with proper error handling and retry logic.
    """

    @staticmethod
    def get_username() -> str:
        """Get authenticated HuggingFace username.

        Returns:
            HuggingFace username from CLI authentication.

        Raises:
            RuntimeError: If not authenticated.
        """
        try:
            result = subprocess.run(
                ["huggingface-cli", "whoami"],
                capture_output=True,
                text=True,
                check=True,
            )
            return result.stdout.strip()
        except (subprocess.CalledProcessError, FileNotFoundError) as err:
            msg = "Please log in to HuggingFace first: huggingface-cli login"
            raise RuntimeError(msg) from err

    def upload_readme(self, output_repo: str, readme_path: Path) -> None:
        """Upload or update README file to repository.

        Creates repository if needed, handles existing repository updates.
        """
        logger.info("Uploading README...")
        try:
            subprocess.run(
                [
                    "huggingface-cli",
                    "upload",
                    output_repo,
                    str(readme_path),
                    "README.md",
                    "--create",
                ],
                check=True,
                capture_output=True,
            )
            logger.info("README uploaded")
        except subprocess.CalledProcessError:
            # Repository exists, update without --create
            subprocess.run(
                [
                    "huggingface-cli",
                    "upload",
                    output_repo,
                    str(readme_path),
                    "README.md",
                ],
                check=True,
            )
            logger.info("README updated")

    def upload_model_file(self, output_repo: str, model_path: Path) -> None:
        """Upload model file to repository.

        Uploads GGUF model file to specified repository path.
        """
        logger.info(f"Uploading {model_path.name}...")
        subprocess.run(
            [
                "huggingface-cli",
                "upload",
                output_repo,
                str(model_path),
                model_path.name,
            ],
            check=True,
        )
        logger.info(f"{model_path.name} uploaded")