# llm-gguf-tools/helpers/ggml/quantiser.py
"""GGML block quantisation for unsupported architectures.
Implements proper GGML quantisation formats (Q4_0, Q5_0, Q8_0) using numpy,
following the exact specifications from ggml. This allows quantisation of
models with architectures not yet supported by llama.cpp.
"""
from __future__ import annotations
import struct
import traceback
from typing import TYPE_CHECKING, Any
import gguf
import numpy as np
from helpers.filesystem import FilesystemService
from helpers.logger import logger
if TYPE_CHECKING:
from pathlib import Path
# GGML block sizes for different quantisation types
QK4_0 = 32 # Block size for Q4_0
QK5_0 = 32 # Block size for Q5_0
QK5_1 = 32 # Block size for Q5_1
QK8_0 = 32 # Block size for Q8_0
class GGMLQuantiser:
"""Implements GGML quantisation formats for architecture-agnostic models.
Provides proper GGML block quantisation using numpy, following the exact
format specifications. This enables Q4_0, Q5_0, and Q8_0 quantisation
for models with unsupported architectures.
"""
def __init__(self) -> None:
"""Initialise GGML quantiser."""
self.fs = FilesystemService()
def get_supported_types(self) -> list[str]:
"""Get supported basic quantisation types.
Returns:
List of supported quantisation type strings.
"""
return ["Q4_0", "Q5_0", "Q6_0", "Q8_0"]
def _extract_architecture_string(self, arch_field: Any) -> str:
"""Extract architecture string from GGUF field data.
Handles various formats of architecture field storage in GGUF files.
Returns:
Architecture string or 'unknown' if extraction fails.
"""
if not arch_field:
return "unknown"
if hasattr(arch_field, "parts") and arch_field.parts:
return self._extract_from_parts_array(arch_field)
if hasattr(arch_field, "data"):
return self._extract_from_data_field(arch_field.data)
return "unknown"
def _extract_from_parts_array(self, arch_field: Any) -> str:
"""Extract architecture from GGUF parts array format.
Returns:
Architecture string or 'unknown' if extraction fails.
"""
if len(arch_field.data) == 0:
return "unknown"
idx = arch_field.data[0] if isinstance(arch_field.data, (list, tuple)) else arch_field.data
if idx >= len(arch_field.parts):
return "unknown"
return self._decode_part(arch_field.parts[idx])
def _decode_part(self, arch_part: Any) -> str:
"""Decode architecture part to string.
Returns:
Decoded string representation.
"""
if isinstance(arch_part, bytes):
return arch_part.decode("utf-8")
if isinstance(arch_part, str):
return arch_part
if isinstance(arch_part, (list, tuple)) and len(arch_part) > 0:
# Handle nested format
if isinstance(arch_part[0], bytes):
return arch_part[0].decode("utf-8")
return str(arch_part[0])
return str(arch_part)
def _extract_from_data_field(self, data: Any) -> str:
"""Extract architecture from GGUF data field.
Returns:
Architecture string or 'unknown' if extraction fails.
"""
if isinstance(data, np.ndarray):
# It's a numpy array of bytes - convert to string
try:
return bytes(data).decode("utf-8")
except (UnicodeDecodeError, ValueError):
# If that fails, try converting as ASCII values
return "".join(chr(c) for c in data if c < 128)
elif isinstance(data, bytes):
return data.decode("utf-8")
elif isinstance(data, str):
return data
else:
return str(data)
def _copy_metadata_fields(self, reader: gguf.GGUFReader, writer: gguf.GGUFWriter) -> None:
"""Copy metadata fields from reader to writer, excluding file type."""
logger.info("📋 Copying metadata...")
for key, field in reader.fields.items():
# Skip the file type field - we'll set our own
if key == "general.file_type":
continue
# Handle different field types
if field.types:
field_type = field.types[0]
field_data = field.parts[field.data[0]] if field.parts else field.data
self._copy_field_by_type(writer, key, field_type, field_data, field)
def _copy_field_by_type(
self,
writer: gguf.GGUFWriter,
key: str,
field_type: gguf.GGUFValueType,
field_data: Any,
field: Any,
) -> None:
"""Copy a single field based on its type."""
        if field_type == gguf.GGUFValueType.STRING:
            # field_data holds the raw string bytes; decode the whole buffer
            # rather than indexing into it
            if isinstance(field_data, np.ndarray):
                string_val = bytes(field_data).decode("utf-8")
            elif isinstance(field_data, bytes):
                string_val = field_data.decode("utf-8")
            else:
                string_val = str(field_data)
            writer.add_string(key, string_val)
        elif field_type == gguf.GGUFValueType.UINT32:
            writer.add_uint32(key, int(field_data[0]))
        elif field_type == gguf.GGUFValueType.FLOAT32:
            writer.add_float32(key, float(field_data[0]))
        elif field_type == gguf.GGUFValueType.BOOL:
            writer.add_bool(key, bool(field_data[0]))
        elif field_type == gguf.GGUFValueType.ARRAY:
            # field.data holds indices into field.parts, not the values themselves;
            # rebuild the array before writing (string elements need decoding)
            if len(field.types) > 1 and field.types[1] == gguf.GGUFValueType.STRING:
                array_values = [bytes(field.parts[idx]).decode("utf-8") for idx in field.data]
            else:
                array_values = [field.parts[idx][0].item() for idx in field.data]
            writer.add_array(key, array_values)
else:
# Skip unsupported field types for now
# Future enhancement: Handle additional GGUF field types as needed
pass
def _get_file_type_mapping(self) -> dict[str, gguf.GGMLQuantizationType]:
"""Get mapping from quantisation type strings to GGML enums.
Returns:
Mapping from quantisation type strings to GGML enums.
"""
return {
"Q4_0": gguf.GGMLQuantizationType.Q4_0,
"Q5_0": gguf.GGMLQuantizationType.Q5_0,
"Q6_0": gguf.GGMLQuantizationType.Q6_K, # Q6_0 uses Q6_K enum
"Q8_0": gguf.GGMLQuantizationType.Q8_0,
}
def _process_tensor_list(
self, reader: gguf.GGUFReader, writer: gguf.GGUFWriter, quant_type: str
) -> None:
"""Process all tensors for quantisation."""
logger.info(f"🔄 Quantising {len(reader.tensors)} tensors to {quant_type}...")
for i, tensor in enumerate(reader.tensors):
if i % 50 == 0:
logger.info(f" Processing tensor {i}/{len(reader.tensors)}...")
self._process_single_tensor(tensor, writer, quant_type)
def _process_single_tensor(self, tensor: Any, writer: gguf.GGUFWriter, quant_type: str) -> None:
"""Process a single tensor for quantisation or preserve as-is."""
# Get tensor info
name = tensor.name
shape = list(tensor.shape)
data = tensor.data
# Determine if this tensor should be quantised
should_quantise = self._should_quantise_tensor(name)
if not should_quantise:
# Keep original format
writer.add_tensor(name, data, raw_shape=shape, raw_dtype=tensor.tensor_type)
else:
# Quantise the tensor
try:
quantised_data, quant_dtype = self._quantise_tensor(
data, tensor.tensor_type, shape, quant_type
)
writer.add_tensor(name, quantised_data, raw_shape=shape, raw_dtype=quant_dtype)
except ValueError as e:
                # If quantisation fails due to shape issues, keep the original
logger.warning(f" ⚠️ Cannot quantise {name}: {e}")
logger.warning(" Keeping in original format")
writer.add_tensor(name, data, raw_shape=shape, raw_dtype=tensor.tensor_type)
def _write_output_file(self, writer: gguf.GGUFWriter, output_path: Path) -> bool:
"""Write the final GGUF file and verify creation.
Returns:
True if successful, False otherwise
"""
logger.info(f"💾 Writing {output_path.name}...")
writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.write_tensors_to_file()
writer.close()
if output_path.exists():
file_size = self.fs.get_file_size(output_path)
logger.info(f"✅ GGML quantisation complete: {file_size}")
return True
logger.error("❌ Output file was not created")
return False
def quantise_basic(
self,
input_path: Path,
output_path: Path,
quant_type: str,
) -> bool:
"""Perform GGML block quantisation on a GGUF file.
Reads a GGUF file, quantises all tensors using the specified
quantisation type, and writes a new GGUF file. Implements proper
GGML block formats for architecture-agnostic quantisation.
Returns:
True if successful, False otherwise
"""
if quant_type not in self.get_supported_types():
logger.error(f"Unsupported quantisation type: {quant_type}")
return False
logger.info(f"🔧 Starting GGML {quant_type} quantisation...")
logger.info("📝 This uses numpy-based block quantisation")
try:
# Read input GGUF
logger.info(f"📖 Reading {input_path.name}...")
reader = gguf.GGUFReader(str(input_path))
# Create output writer with same architecture
arch_field = reader.fields.get("general.architecture")
arch_str = self._extract_architecture_string(arch_field)
logger.info(f"📝 Architecture: {arch_str}")
writer = gguf.GGUFWriter(str(output_path), arch_str)
# Copy all metadata
self._copy_metadata_fields(reader, writer)
# Set file type based on quantisation
file_type_map = self._get_file_type_mapping()
writer.add_file_type(file_type_map[quant_type])
# Process tensors
self._process_tensor_list(reader, writer, quant_type)
# Write the output file
return self._write_output_file(writer, output_path)
except Exception as e:
logger.error(f"❌ GGML quantisation failed: {e}\n{traceback.format_exc()}")
return False
def _should_quantise_tensor(self, tensor_name: str) -> bool:
"""Determine if a tensor should be quantised.
Some tensors like token embeddings should typically remain in
higher precision for quality.
Returns:
True if the tensor should be quantised, False otherwise
"""
# Keep token embeddings and output layers in original precision
# These patterns cover most architectures
keep_original = [
"token_embd",
"output.weight",
"lm_head",
"embed_tokens",
"word_embeddings",
]
for pattern in keep_original:
if pattern in tensor_name:
logger.debug(f" Keeping {tensor_name} in original format")
return False
return True
def _quantise_tensor(
self,
data: np.ndarray,
dtype: gguf.GGMLQuantizationType,
shape: list[int],
quant_type: str,
) -> tuple[np.ndarray, gguf.GGMLQuantizationType]:
"""Quantise a tensor using GGML block quantisation.
Returns:
Tuple of (quantised_data, new_dtype)
"""
# Work directly with numpy array - convert to float32 if needed
if dtype in {gguf.GGMLQuantizationType.F32, gguf.GGMLQuantizationType.F16}:
arr = data.astype(np.float32)
else:
# Already quantised or unknown type - return as-is
return data, dtype
# Reshape to original shape
arr = arr.reshape(shape)
# Flatten for processing
arr_flat = arr.flatten()
# Apply quantisation
if quant_type == "Q8_0":
quantised = self._quantise_q8_0(arr_flat)
new_dtype = gguf.GGMLQuantizationType.Q8_0
elif quant_type == "Q6_0":
quantised = self._quantise_q6_0(arr_flat)
new_dtype = gguf.GGMLQuantizationType.Q6_K # Q6_0 uses Q6_K enum
elif quant_type == "Q5_0":
quantised = self._quantise_q5_0(arr_flat)
new_dtype = gguf.GGMLQuantizationType.Q5_0
elif quant_type == "Q4_0":
quantised = self._quantise_q4_0(arr_flat)
new_dtype = gguf.GGMLQuantizationType.Q4_0
else:
# Unsupported - return original
return data, dtype
# Convert bytes back to numpy array for gguf writer
return np.frombuffer(quantised, dtype=np.uint8), new_dtype
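    # Illustrative helper (not part of the original module): expected on-disk size
    # of the blobs returned by _quantise_tensor, useful as a sanity check. Sizes
    # assume the block layouts documented in the _quantise_* methods below.
    @staticmethod
    def _expected_quantised_bytes(quant_type: str, n_elements: int) -> int:
        """Estimate the byte size of a tensor quantised by this module.

        Returns:
            Expected number of bytes, or -1 for unknown types.
        """
        bytes_per_block = {"Q4_0": 18, "Q5_0": 22, "Q6_0": 28, "Q8_0": 34}
        if quant_type not in bytes_per_block:
            return -1
        n_blocks = (n_elements + QK8_0 - 1) // QK8_0  # All formats use 32-value blocks
        return n_blocks * bytes_per_block[quant_type]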
def _quantise_q8_0(self, arr: np.ndarray) -> bytes:
"""Quantise to Q8_0 format.
Q8_0: Blocks of 32 values, each block has:
- 1 float16 scale factor (2 bytes)
- 32 int8 values (32 bytes)
Total: 34 bytes per 32 values
Returns:
Bytes of the quantised data
"""
n = len(arr)
nb = (n + QK8_0 - 1) // QK8_0 # Number of blocks
output = bytearray()
for i in range(nb):
# Get block of values
start = i * QK8_0
end = min(start + QK8_0, n)
block = arr[start:end]
# Pad if needed
if len(block) < QK8_0:
block = np.pad(block, (0, QK8_0 - len(block)), mode="constant")
# Calculate scale
amax = np.abs(block).max()
scale = amax / 127.0 if amax > 0 else 1.0
# Quantise
quantised = np.round(block / scale).astype(np.int8)
quantised = np.clip(quantised, -128, 127)
output.extend(struct.pack("e", scale)) # 'e' is float16
output.extend(quantised.tobytes())
return bytes(output)
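    # Illustrative counterpart (not part of the original module): decode a single
    # Q8_0 block produced by _quantise_q8_0 above, e.g. to spot-check round-trip
    # error during development. Layout: float16 scale followed by 32 int8 values.
    @staticmethod
    def _dequantise_q8_0_block(block: bytes) -> np.ndarray:
        """Dequantise one 34-byte Q8_0 block back to float32 values.

        Returns:
            Array of 32 reconstructed float32 values.
        """
        scale = np.frombuffer(block[:2], dtype=np.float16)[0]
        values = np.frombuffer(block[2:34], dtype=np.int8)
        return values.astype(np.float32) * np.float32(scale)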
    def _quantise_q6_0(self, arr: np.ndarray) -> bytes:
        """Quantise to the simplified Q6_0 format used by this module.

        Q6_0: Blocks of 32 values with 6-bit quantisation
        - 1 float16 scale (2 bytes)
        - 1 float16 min value (2 bytes)
        - 24 bytes of packed 6-bit values (32 values * 6 bits = 192 bits = 24 bytes)
        Total: 28 bytes per 32 values

        Note: this is not a standard ggml block format; it is a best-effort
        fallback rather than an exact llama.cpp-compatible layout.

        Returns:
            Bytes of the quantised data
        """
        n = len(arr)
        nb = (n + QK8_0 - 1) // QK8_0  # Use same block size as Q8_0
        output = bytearray()
        for i in range(nb):
            # Get block
            start = i * QK8_0
            end = min(start + QK8_0, n)
            block = arr[start:end]
            # Pad if needed
            if len(block) < QK8_0:
                block = np.pad(block, (0, QK8_0 - len(block)), mode="constant")
            # Calculate scale and min
            vmin = block.min()
            vmax = block.max()
            scale = (vmax - vmin) / 63.0 if vmax > vmin else 1.0
            # Quantise to 6-bit (0-63)
            quantised = np.clip(np.round((block - vmin) / scale), 0, 63).astype(np.uint8)
            # Pack scale and min
            output.extend(struct.pack("e", scale))
            output.extend(struct.pack("e", vmin))
            # Pack 6-bit values: 4 values (24 bits) fit exactly into 3 bytes
            for j in range(0, QK8_0, 4):
                q0, q1, q2, q3 = (int(q) for q in quantised[j : j + 4])
                output.append(q0 | ((q1 & 0x03) << 6))
                output.append((q1 >> 2) | ((q2 & 0x0F) << 4))
                output.append((q2 >> 4) | (q3 << 2))
        return bytes(output)
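    # Illustrative counterpart (not part of the original module): unpack one 28-byte
    # block written by the simplified _quantise_q6_0 above. Assumes the layout used
    # there: float16 scale, float16 min, then 24 bytes packing 32 six-bit values.
    @staticmethod
    def _dequantise_q6_0_block(block: bytes) -> np.ndarray:
        """Dequantise one 28-byte simplified Q6_0 block back to float32 values.

        Returns:
            Array of 32 reconstructed float32 values.
        """
        scale = float(np.frombuffer(block[0:2], dtype=np.float16)[0])
        vmin = float(np.frombuffer(block[2:4], dtype=np.float16)[0])
        packed = block[4:28]
        values = np.empty(32, dtype=np.float32)
        for group in range(8):  # 8 groups of 4 values in 3 bytes each
            b0, b1, b2 = packed[group * 3 : group * 3 + 3]
            quants = (
                b0 & 0x3F,
                ((b0 >> 6) | (b1 << 2)) & 0x3F,
                ((b1 >> 4) | (b2 << 4)) & 0x3F,
                (b2 >> 2) & 0x3F,
            )
            for k, q in enumerate(quants):
                values[group * 4 + k] = q * scale + vmin
        return values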
    def _quantise_q5_0(self, arr: np.ndarray) -> bytes:
        """Quantise to Q5_0 format.

        Q5_0: Blocks of 32 values with 5-bit quantisation
        - 1 float16 scale (2 bytes)
        - 4 bytes holding the fifth (high) bit of each value
        - 16 bytes of packed low nibbles (2 values per byte)
        Total: 22 bytes per 32 values

        Returns:
            Bytes of the quantised data
        """
        n = len(arr)
        nb = (n + QK5_0 - 1) // QK5_0
        output = bytearray()
        half = QK5_0 // 2
        for i in range(nb):
            # Get block
            start = i * QK5_0
            end = min(start + QK5_0, n)
            block = arr[start:end]
            # Pad if needed
            if len(block) < QK5_0:
                block = np.pad(block, (0, QK5_0 - len(block)), mode="constant")
            # Scale comes from the value with the largest magnitude (sign preserved)
            vmax = block[np.argmax(np.abs(block))]
            scale = vmax / -16.0 if vmax != 0 else 1.0
            # Quantise to 5-bit (0-31), centred on 16
            quantised = np.clip(np.round(block / scale) + 16, 0, 31).astype(np.uint8)
            # Pack scale
            output.extend(struct.pack("e", scale))
            # Collect the fifth bit of each value into a 32-bit mask
            qh = 0
            for j in range(half):
                qh |= ((int(quantised[j]) & 0x10) >> 4) << j
                qh |= ((int(quantised[j + half]) & 0x10) >> 4) << (j + half)
            output.extend(struct.pack("<I", qh))
            # Pack low nibbles: value j in the low nibble, value j+16 in the high nibble
            for j in range(half):
                packed = (int(quantised[j]) & 0x0F) | ((int(quantised[j + half]) & 0x0F) << 4)
                output.append(packed)
        return bytes(output)
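    # Illustrative counterpart (not part of the original module): decode one 22-byte
    # Q5_0 block as written by _quantise_q5_0 above (float16 scale, 4 bytes of high
    # bits, 16 bytes of low nibbles), e.g. for round-trip checks during development.
    @staticmethod
    def _dequantise_q5_0_block(block: bytes) -> np.ndarray:
        """Dequantise one 22-byte Q5_0 block back to float32 values.

        Returns:
            Array of 32 reconstructed float32 values.
        """
        scale = float(np.frombuffer(block[0:2], dtype=np.float16)[0])
        (qh,) = struct.unpack("<I", block[2:6])
        qs = block[6:22]
        values = np.empty(32, dtype=np.float32)
        for j in range(16):
            high0 = (qh >> j) & 0x1
            high1 = (qh >> (j + 16)) & 0x1
            q0 = (qs[j] & 0x0F) | (high0 << 4)
            q1 = (qs[j] >> 4) | (high1 << 4)
            values[j] = (q0 - 16) * scale
            values[j + 16] = (q1 - 16) * scale
        return values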
    def _quantise_q4_0(self, arr: np.ndarray) -> bytes:
        """Quantise to Q4_0 format.

        Q4_0: Blocks of 32 values with 4-bit quantisation
        - 1 float16 scale (2 bytes)
        - 16 bytes of packed 4-bit values (2 values per byte)
        Total: 18 bytes per 32 values

        Returns:
            Bytes of the quantised data
        """
        n = len(arr)
        nb = (n + QK4_0 - 1) // QK4_0
        output = bytearray()
        half = QK4_0 // 2
        for i in range(nb):
            # Get block
            start = i * QK4_0
            end = min(start + QK4_0, n)
            block = arr[start:end]
            # Pad if needed
            if len(block) < QK4_0:
                block = np.pad(block, (0, QK4_0 - len(block)), mode="constant")
            # Scale comes from the value with the largest magnitude (sign preserved)
            vmax = block[np.argmax(np.abs(block))]
            scale = vmax / -8.0 if vmax != 0 else 1.0
            # Quantise to 4-bit (0-15), centred on 8
            quantised = np.clip(np.round(block / scale) + 8, 0, 15).astype(np.uint8)
            # Pack scale
            output.extend(struct.pack("e", scale))
            # Pack 4-bit values: value j in the low nibble, value j+16 in the high nibble
            for j in range(half):
                packed = (int(quantised[j]) & 0x0F) | ((int(quantised[j + half]) & 0x0F) << 4)
                output.append(packed)
        return bytes(output)
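    # Illustrative counterpart (not part of the original module): decode one 18-byte
    # Q4_0 block as written by _quantise_q4_0 above (float16 scale plus 16 bytes of
    # packed nibbles), handy for verifying the nibble ordering and the +8 offset.
    @staticmethod
    def _dequantise_q4_0_block(block: bytes) -> np.ndarray:
        """Dequantise one 18-byte Q4_0 block back to float32 values.

        Returns:
            Array of 32 reconstructed float32 values.
        """
        scale = float(np.frombuffer(block[0:2], dtype=np.float16)[0])
        qs = block[2:18]
        values = np.empty(32, dtype=np.float32)
        for j in range(16):
            values[j] = ((qs[j] & 0x0F) - 8) * scale
            values[j + 16] = ((qs[j] >> 4) - 8) * scale
        return values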
def try_alternative_quantisation(
self,
input_path: Path,
output_path: Path,
target_type: str,
) -> bool:
"""Try basic quantisation for unsupported architectures.
For architectures not supported by llama.cpp, uses GGML implementation
to provide basic quantisation formats as fallback. Handles only basic
types that can be generated with numpy-based GGML quantisation.
Returns:
True if successful, False otherwise
"""
# Only handle basic types that we can generate with GGML
basic_types = ["Q4_0", "Q5_0", "Q6_0", "Q8_0"]
if target_type in basic_types:
logger.info(f"📝 Using GGML numpy implementation for {target_type}")
return self.quantise_basic(input_path, output_path, target_type)
# For K-quants on unsupported architectures, we can't provide a direct equivalent
logger.error(f"❌ Cannot quantise {target_type} for unsupported architecture")
logger.info("💡 Consider using Q4_0, Q5_0, Q6_0, or Q8_0 instead")
return False
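

# Minimal usage sketch (not part of the original module): quantise an existing GGUF
# file with the fallback quantiser. The file paths below are placeholders for
# illustration only.
if __name__ == "__main__":
    from pathlib import Path

    quantiser = GGMLQuantiser()
    source = Path("model-f16.gguf")  # hypothetical input GGUF
    target = Path("model-Q4_0.gguf")  # hypothetical output path
    if quantiser.try_alternative_quantisation(source, target, "Q4_0"):
        logger.info(f"Wrote {target}")
    else:
        logger.error("Quantisation failed")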