"""GGUF file operations service.
Provides unified interface for creating, writing, and manipulating GGUF files.
Consolidates GGUF-specific operations from conversion and quantisation workflows.
Uses UK English spelling conventions throughout.
"""

from __future__ import annotations

import gc
import json
import traceback
from pathlib import Path
from typing import TYPE_CHECKING, Any, Protocol

import gguf
import torch
from safetensors import safe_open

from helpers.logger import logger
from helpers.services.filesystem import FilesystemService
from helpers.utils.config_parser import ConfigParser


class VisionConfig(Protocol):
    """Protocol for vision model configuration."""

    hidden_size: int
    num_hidden_layers: int
    num_attention_heads: int
    intermediate_size: int
    patch_size: int
    spatial_merge_size: int


class TensorMapper(Protocol):
    """Protocol for tensor name mapping."""

    def map_tensor_name(self, name: str) -> str | None:
        """Map a tensor name to its GGUF equivalent."""
        ...


if TYPE_CHECKING:
    import numpy as np

    from helpers.models.conversion import ModelConfig


class GGUFWriter:
    """Manages GGUF file creation and metadata writing.

    Provides high-level interface for GGUF file operations including metadata
    configuration, tensor addition, and tokeniser integration. Encapsulates
    low-level GGUF library interactions for consistent error handling.
    """

    def __init__(self, output_path: Path, architecture: str) -> None:
        """Initialise GGUF writer with output path and architecture.

        Creates the underlying GGUF writer instance and prepares for metadata
        and tensor addition. Sets up the file structure for the specified
        model architecture.
        """
self.output_path = output_path
self.architecture = architecture
self.writer = gguf.GGUFWriter(str(output_path), architecture)
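        # The underlying gguf-py writer defers all file output until the
        # write_*_to_file calls in finalise(); the add_* methods below only
        # stage data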
logger.info(f"Created GGUF writer for {architecture} architecture")

    def add_metadata(self, model_config: ModelConfig, model_name: str) -> None:
        """Add comprehensive metadata from model configuration.

        Writes general model information, architectural parameters, and
        quantisation settings to the GGUF file header. Handles both standard
        and vision model configurations with appropriate parameter mapping.
        """
# General metadata
self.writer.add_name(model_name)
self.writer.add_description(f"Converted from {model_config.architectures[0]}")
self.writer.add_file_type(gguf.LlamaFileType.ALL_F32)
# Log architecture being used
logger.info(f"Setting GGUF architecture: {self.architecture}")
if self.architecture not in {"llama", "qwen2", "gemma", "phi3", "falcon", "gpt2"}:
logger.warning(f"Architecture '{self.architecture}' may not be supported by llama.cpp")
# Model parameters from config
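        # These writer calls populate architecture-scoped GGUF keys such as
        # "llama.context_length" and "llama.embedding_length" (key naming
        # follows the gguf-py writer; noted here for orientation)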
params = model_config.to_gguf_params()
self.writer.add_context_length(params.context_length)
self.writer.add_embedding_length(params.embedding_length)
self.writer.add_block_count(params.block_count)
self.writer.add_feed_forward_length(params.feed_forward_length)
self.writer.add_head_count(params.attention_head_count)
self.writer.add_head_count_kv(params.attention_head_count_kv)
self.writer.add_layer_norm_rms_eps(params.attention_layer_norm_rms_epsilon)
self.writer.add_rope_freq_base(params.rope_freq_base)
self.writer.add_rope_dimension_count(params.rope_dimension_count)
logger.info(f"Added metadata: {params.block_count} layers, {params.context_length} context")

    def add_vision_metadata(self, vision_config: VisionConfig | None) -> None:
        """Add vision model parameters to GGUF metadata.

        Configures vision-specific parameters for multimodal models including
        embedding dimensions, attention heads, and spatial processing settings.
        """
if not vision_config:
return
logger.info("Adding vision model parameters...")
self.writer.add_vision_embedding_length(vision_config.hidden_size)
self.writer.add_vision_block_count(vision_config.num_hidden_layers)
self.writer.add_vision_head_count(vision_config.num_attention_heads)
self.writer.add_vision_feed_forward_length(vision_config.intermediate_size)
self.writer.add_vision_patch_size(vision_config.patch_size)
self.writer.add_vision_spatial_merge_size(vision_config.spatial_merge_size)
if hasattr(vision_config, "rms_norm_eps") and vision_config.rms_norm_eps:
self.writer.add_vision_attention_layernorm_eps(vision_config.rms_norm_eps)

    def add_tokeniser(self, tokeniser_config: dict[str, Any]) -> None:
        """Add tokeniser metadata to GGUF file.

        Writes special token IDs and tokeniser model type to enable proper
        text processing during inference. Uses sensible defaults for missing
        configuration values.
        """
self.writer.add_bos_token_id(tokeniser_config.get("bos_token_id", 1))
self.writer.add_eos_token_id(tokeniser_config.get("eos_token_id", 2))
self.writer.add_unk_token_id(tokeniser_config.get("unk_token_id", 0))
self.writer.add_pad_token_id(tokeniser_config.get("pad_token_id", 0))
# Add BOS/EOS token addition flags if available
if "add_bos_token" in tokeniser_config:
self.writer.add_add_bos_token(tokeniser_config["add_bos_token"])
if "add_eos_token" in tokeniser_config:
self.writer.add_add_eos_token(tokeniser_config["add_eos_token"])
# Note: tokenizer_model is set by add_tokeniser_vocabulary based on actual tokenizer type
logger.info("Added tokeniser configuration")

    def add_tokeniser_vocabulary(self, model_path: Path) -> None:
        """Add full tokeniser vocabulary to GGUF file.

        Loads and embeds the complete tokeniser vocabulary including tokens,
        merges, and scores to enable standalone model usage without external
        tokeniser files. Supports BPE, Unigram, and WordPiece tokenisers.
        """
tokenizer_path = model_path / "tokenizer.json"
if not tokenizer_path.exists():
logger.warning("tokenizer.json not found, skipping vocabulary embedding")
return
try:
            with tokenizer_path.open(encoding="utf-8") as f:
tokenizer_data = json.load(f)
model_data = tokenizer_data.get("model", {})
model_type = model_data.get("type", "")
# Get pre-tokenizer information
pre_tokenizer = tokenizer_data.get("pre_tokenizer", {})
pre_tokenizer_type = self._get_pre_tokenizer_type(pre_tokenizer)
# Get added tokens
added_tokens = tokenizer_data.get("added_tokens", [])
if model_type == "BPE":
self._add_bpe_tokenizer(model_data, added_tokens, pre_tokenizer_type)
elif model_type == "Unigram":
self._add_unigram_tokenizer(model_data, added_tokens)
elif model_type == "WordPiece":
self._add_wordpiece_tokenizer(model_data, added_tokens)
else:
logger.warning(f"Unsupported tokenizer type: {model_type}")
# Try to add as generic tokenizer
self._add_generic_tokenizer(model_data, tokenizer_data)
except Exception as e:
logger.error(f"Failed to load tokeniser vocabulary: {e}")
logger.error(traceback.format_exc())

    def _get_pre_tokenizer_type(self, pre_tokenizer: dict[str, Any]) -> str:
        """Determine pre-tokenizer type from configuration.

        Returns:
            GGUF pre-tokeniser identifier: "llama3" for ByteLevel
            pre-tokenisation, otherwise "default".
        """
if not pre_tokenizer:
return "default"
# Check for various pre-tokenizer types
pre_type = pre_tokenizer.get("type", "")
if "ByteLevel" in str(pre_type):
return "llama3"
if "Metaspace" in str(pre_type):
return "default"
return "default"

    def _add_bpe_tokenizer(
        self, model_data: dict[str, Any], added_tokens: list[dict[str, Any]], pre_type: str
    ) -> None:
        """Add BPE tokenizer vocabulary to GGUF."""
vocab = model_data.get("vocab", {})
merges = model_data.get("merges", [])
if not vocab:
logger.warning("No vocabulary found in BPE tokenizer")
return
# Create token list sorted by index
max_idx = max(vocab.values()) if vocab else 0
tokens = [""] * (max_idx + 1)
for token, idx in vocab.items():
if 0 <= idx < len(tokens):
tokens[idx] = token
# Handle added tokens
for added_token in added_tokens:
token_id = added_token.get("id")
content = added_token.get("content")
if token_id is not None and content is not None:
if token_id >= len(tokens):
tokens.extend([""] * (token_id - len(tokens) + 1))
tokens[token_id] = content
        # Prepare token types: added tokens flagged "special" become CONTROL,
        # all others NORMAL
        special_ids = {
            added_token["id"]
            for added_token in added_tokens
            if added_token.get("special", False) and added_token.get("id") is not None
        }
        token_types = [
            gguf.TokenType.CONTROL if i in special_ids else gguf.TokenType.NORMAL
            for i in range(len(tokens))
        ]
# Add to GGUF
self.writer.add_tokenizer_model("gpt2")
self.writer.add_tokenizer_pre(pre_type)
self.writer.add_token_list(tokens)
self.writer.add_token_scores([0.0] * len(tokens))
self.writer.add_token_types(token_types)
if merges:
self.writer.add_token_merges(merges)
logger.info(f"Added {len(merges)} BPE merges")
logger.info(f"Successfully embedded BPE tokeniser ({len(tokens)} tokens)")

    def _add_unigram_tokenizer(
        self,
        model_data: dict[str, Any],
        added_tokens: list[dict[str, Any]],  # noqa: ARG002
    ) -> None:
        """Add Unigram/SentencePiece tokenizer to GGUF."""
vocab = model_data.get("vocab", [])
if not vocab:
logger.warning("No vocabulary found in Unigram tokenizer")
return
tokens = []
scores = []
token_types = []
# Process regular vocabulary
        for item in vocab:
            if isinstance(item, list) and len(item) >= 2:
                token = item[0]
                score = float(item[1])
                tokens.append(token)
                scores.append(score)
                # Determine token type; byte tokens ("<0xNN>", exactly six
                # characters) must be checked before the generic angle-bracket
                # control pattern, which would otherwise match them first
                if len(token) == 6 and token.startswith("<0x") and token.endswith(">"):
                    token_types.append(gguf.TokenType.BYTE)
                elif token.startswith("<") and token.endswith(">"):
                    token_types.append(gguf.TokenType.CONTROL)
                else:
                    token_types.append(gguf.TokenType.NORMAL)
# Add to GGUF
self.writer.add_tokenizer_model("llama")
self.writer.add_tokenizer_pre("default")
self.writer.add_token_list(tokens)
self.writer.add_token_scores(scores)
self.writer.add_token_types(token_types)
logger.info(f"Successfully embedded Unigram tokeniser ({len(tokens)} tokens)")

    def _add_wordpiece_tokenizer(
        self,
        model_data: dict[str, Any],
        added_tokens: list[dict[str, Any]],  # noqa: ARG002
    ) -> None:
        """Add WordPiece tokenizer to GGUF."""
vocab = model_data.get("vocab", {})
if not vocab:
logger.warning("No vocabulary found in WordPiece tokenizer")
return
# Create token list sorted by index
max_idx = max(vocab.values()) if vocab else 0
tokens = [""] * (max_idx + 1)
for token, idx in vocab.items():
if 0 <= idx < len(tokens):
tokens[idx] = token
# Token types (all normal for WordPiece)
token_types = [gguf.TokenType.NORMAL] * len(tokens)
# Add to GGUF
self.writer.add_tokenizer_model("bert")
self.writer.add_tokenizer_pre("default")
self.writer.add_token_list(tokens)
self.writer.add_token_scores([0.0] * len(tokens))
self.writer.add_token_types(token_types)
logger.info(f"Successfully embedded WordPiece tokeniser ({len(tokens)} tokens)")

    def _add_generic_tokenizer(
        self,
        model_data: dict[str, Any],
        tokenizer_data: dict[str, Any],  # noqa: ARG002
    ) -> None:
        """Try to add a generic tokenizer based on available data."""
vocab = model_data.get("vocab")
if not vocab:
logger.warning("Cannot extract vocabulary from unknown tokenizer type")
return
# Try to extract tokens in a generic way
tokens = []
if isinstance(vocab, dict):
# Dictionary-style vocab
max_idx = max(vocab.values()) if vocab else 0
tokens = [""] * (max_idx + 1)
for token, idx in vocab.items():
if 0 <= idx < len(tokens):
tokens[idx] = token
elif isinstance(vocab, list):
# List-style vocab
for item in vocab:
if isinstance(item, str):
tokens.append(item)
elif isinstance(item, list) and len(item) > 0:
tokens.append(item[0])
if tokens:
self.writer.add_tokenizer_model("llama") # Default to llama
self.writer.add_tokenizer_pre("default")
self.writer.add_token_list(tokens)
self.writer.add_token_scores([0.0] * len(tokens))
self.writer.add_token_types([gguf.TokenType.NORMAL] * len(tokens))
logger.info(f"Added generic tokeniser ({len(tokens)} tokens)")
else:
logger.warning("Could not extract tokens from unknown tokenizer format")

    def add_tensor(self, name: str, data: np.ndarray) -> None:
        """Add a tensor to the GGUF file.

        Writes tensor data with the specified name, delegating data layout
        and dtype handling to the underlying gguf writer.
        """
self.writer.add_tensor(name, data)

    def finalise(self) -> None:
        """Write all data to file and close writer.

        Completes the GGUF file creation by writing headers, key-value data,
        and tensor data in the correct order. Ensures proper file closure.
        """
logger.info(f"Writing GGUF file to {self.output_path}")
self.writer.write_header_to_file()
self.writer.write_kv_data_to_file()
self.writer.write_tensors_to_file()
self.writer.close()
logger.info("GGUF file written successfully")


class GGUFConverter:
    """High-level GGUF conversion orchestrator.

    Coordinates the complete conversion workflow from source models to GGUF
    format, managing metadata extraction, tensor mapping, and file writing.
    """

    @staticmethod
    def convert_safetensors(
        model_path: Path,
        output_path: Path,
        model_config: ModelConfig,
        architecture: str,
        tensor_mapper: TensorMapper,
    ) -> bool:
        """Convert SafeTensors model to GGUF format.

        Orchestrates the conversion process including metadata setup, tensor
        loading with BFloat16 support, name mapping, and tokeniser integration.

        Returns:
            True once conversion completes; unrecoverable errors propagate as
            exceptions rather than a False return.
        """
logger.info(f"Converting {model_path.name} to GGUF...")
# Create writer
writer_wrapper = GGUFWriter(output_path, architecture)
# Add metadata
writer_wrapper.add_metadata(model_config, model_path.name)
# Add vision metadata if present
if model_config.vision_config:
writer_wrapper.add_vision_metadata(model_config.vision_config)
# Load and add tensors
fs = FilesystemService()
tensor_files = fs.find_safetensor_files(model_path)
logger.info(f"Found {len(tensor_files)} tensor file(s)")
tensor_count = 0
for tensor_file in tensor_files:
logger.info(f"Loading {tensor_file.name}...")
with safe_open(tensor_file, framework="pt") as f:
for tensor_name in f.keys(): # noqa: SIM118
tensor_data = f.get_tensor(tensor_name)
                    # NumPy has no native bfloat16 dtype, so upcast to float32
                    # before converting the torch tensor to a numpy array
                    if hasattr(tensor_data, "numpy"):
                        if tensor_data.dtype == torch.bfloat16:
                            tensor_data = tensor_data.float()
                        tensor_data = tensor_data.numpy()
# Map tensor name
gguf_name = tensor_mapper.map_tensor_name(tensor_name)
if gguf_name:
writer_wrapper.add_tensor(gguf_name, tensor_data)
tensor_count += 1
if tensor_count % 100 == 0:
logger.info(f" Processed {tensor_count} tensors...")
# Free memory after processing each tensor
del tensor_data
# Force garbage collection after processing each file
gc.collect()
logger.info(f"Total tensors processed: {tensor_count}")
# Add tokeniser configuration
try:
tok_config = ConfigParser.load_tokeniser_config(model_path)
writer_wrapper.add_tokeniser(tok_config)
logger.info("Tokeniser configuration added")
except Exception as e:
logger.warning(f"Could not add tokeniser configuration: {e}")
# Add tokeniser vocabulary (critical for standalone usage)
try:
writer_wrapper.add_tokeniser_vocabulary(model_path)
except Exception as e:
logger.error(f"Failed to embed tokeniser vocabulary: {e}")
logger.error("Model will not work without external tokeniser files!")
# Finalise file
writer_wrapper.finalise()
file_size = fs.get_file_size(output_path)
logger.info(f"Conversion complete! Output: {output_path} ({file_size})")
return True