llm-gguf-tools/helpers/llama_cpp/architecture.py
"""Architecture detection and support checking.
Determines whether model architectures are supported by llama.cpp
and provides fallback strategies for unsupported architectures.
"""
from __future__ import annotations
import subprocess
from typing import TYPE_CHECKING
from helpers.logger import logger
if TYPE_CHECKING:
from pathlib import Path
class ArchitectureDetector:
"""Detects and validates model architecture support.
Checks whether model architectures are supported by llama.cpp
for K-quant generation and determines appropriate quantisation
strategies for unsupported architectures.
"""
@staticmethod
def check_architecture_support(f16_model_path: Path) -> bool:
"""Check if the model architecture is supported by llama.cpp.
Tests the model's compatibility by attempting a quantisation with
llama.cpp. Returns true if the architecture is unsupported, indicating
that K-quants should be skipped.
Returns:
True if architecture is NOT supported (K-quants should be skipped)
"""
try:
# Attempt a quick quantisation with llama.cpp to probe architecture support
result = subprocess.run(
[
".cache/llm-gguf-tools/binaries/llama-quantize",
str(f16_model_path),
"/dev/null",
"Q4_K_M",
],
check=False,
capture_output=True,
text=True,
timeout=5,
)
# Check if it failed due to unknown architecture
return bool(result.stderr and "unknown model architecture" in result.stderr.lower())
except Exception:
# TimeoutExpired lands here too: a supported architecture usually outlives
# the 5-second probe, so when support cannot be determined, assume it works
return False
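# Illustrative usage (hypothetical model filename; assumes the llama-quantize
# binary referenced above has already been fetched):
#
#   from pathlib import Path
#   if ArchitectureDetector.check_architecture_support(Path("model-f16.gguf")):
#       logger.warning("Architecture unsupported by llama.cpp - skipping K-quants")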
@staticmethod
def get_supported_architectures() -> list[str]:
"""Get list of architectures known to be supported by llama.cpp.
Returns:
List of supported architecture names.
"""
return [
"llama",
"llama2",
"llama3",
"mistral",
"mixtral",
"qwen",
"qwen2",
"gemma",
"gemma2",
"phi",
"phi2",
"phi3",
"falcon",
"gpt2",
"gptj",
"gptneox",
"mpt",
"starcoder",
"starcoder2",
"baichuan",
"bert",
"bloom",
"deepseek",
"deepseek2",
"chatglm",
"orion",
"internlm2",
"minicpm",
"stablelm",
"cohere",
"dbrx",
"olmo",
"arctic",
"rwkv",
]
@staticmethod
def map_architecture(model_type: str, arch_name: str) -> str:
"""Map model architecture to GGUF architecture string.
Translates model type and architecture names from HuggingFace config
to GGUF-compatible architecture identifiers. Handles special cases like
"gpt-oss" to "gptoss" conversion and provides fallback mapping.
Returns:
GGUF architecture string to use.
"""
# Direct mappings from model_type
type_mappings = {
"llama": "llama",
"mistral": "llama", # Mistral uses llama architecture
"mixtral": "llama",
"qwen": "qwen",
"qwen2": "qwen2",
"gemma": "gemma",
"gemma2": "gemma2",
"phi": "phi2",
"phi3": "phi3",
"phi-msft": "phi2",
"falcon": "falcon",
"gpt2": "gpt2",
"gptj": "gptj",
"gpt_neox": "gptneox",
"gpt-oss": "gptoss",
"mpt": "mpt",
"starcoder": "starcoder",
"starcoder2": "starcoder2",
"baichuan": "baichuan",
"bloom": "bloom",
"chatglm": "chatglm",
"deepseek": "llama", # DeepSeek uses llama architecture
"stablelm": "stablelm",
"cohere": "cohere",
"dbrx": "dbrx",
"olmo": "olmo",
"arctic": "arctic",
}
# Check model_type first
if model_type in type_mappings:
return type_mappings[model_type]
# Architecture name mappings as fallback
arch_mappings = {
"LlamaForCausalLM": "llama",
"MistralForCausalLM": "llama",
"MixtralForCausalLM": "llama",
"Qwen2ForCausalLM": "qwen2",
"QwenForCausalLM": "qwen",
"GemmaForCausalLM": "gemma",
"Gemma2ForCausalLM": "gemma2",
"GptOssForCausalLM": "gptoss",
"PhiForCausalLM": "phi2",
"Phi3ForCausalLM": "phi3",
"FalconForCausalLM": "falcon",
"GPT2LMHeadModel": "gpt2",
"GPTJForCausalLM": "gptj",
"GPTNeoXForCausalLM": "gptneox",
"MPTForCausalLM": "mpt",
"BloomForCausalLM": "bloom",
"ChatGLMForCausalLM": "chatglm",
"StableLmForCausalLM": "stablelm",
"CohereForCausalLM": "cohere",
}
if arch_name in arch_mappings:
return arch_mappings[arch_name]
# Default fallback
logger.warning(f"Unknown architecture: {arch_name} (type: {model_type})")
logger.warning("Defaulting to 'llama' architecture - may not work correctly")
return "llama"
@staticmethod
def get_quantisation_support(architecture: str) -> dict[str, bool]:
"""Determine which quantisation types are supported for an architecture.
Evaluates architecture compatibility with different quantisation methods.
Basic quantisations are always supported via GGML, while K-quants and
imatrix require specific llama.cpp support.
Returns:
Dictionary mapping quantisation type categories to support status.
"""
# Known unsupported architectures for K-quants
unsupported_kquants = [
"bert",
"dotsocr", # Custom/unknown architectures
]
is_supported = architecture not in unsupported_kquants
return {
"basic": True, # Q4_0, Q5_0, Q6_0, Q8_0 always supported via GGML
"k_quants": is_supported, # K-quants require llama.cpp support
"imatrix": is_supported, # imatrix requires llama.cpp support
}
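# For instance, get_quantisation_support("bert") yields
# {"basic": True, "k_quants": False, "imatrix": False}, while a supported
# architecture such as "llama" yields True for all three categories.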
@staticmethod
def filter_quantisation_types(
architecture: str,
requested_types: list[str],
) -> tuple[list[str], list[str]]:
"""Filter quantisation types based on architecture support.
Separates requested quantisation types into supported and unsupported
based on the model's architecture capabilities. Basic types are always
supported, while K-quants depend on architecture compatibility.
Returns:
Tuple of (supported_types, skipped_types).
"""
support = ArchitectureDetector.get_quantisation_support(architecture)
basic_types = {"Q4_0", "Q5_0", "Q6_0", "Q8_0"}
supported = []
skipped = []
for quant_type in requested_types:
if quant_type in basic_types:
# Basic types always supported
supported.append(quant_type)
elif support["k_quants"]:
# K-quants supported for this architecture
supported.append(quant_type)
else:
# K-quants not supported
skipped.append(quant_type)
return supported, skipped
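
# Minimal end-to-end sketch (illustrative only: the model type, architecture
# name, and requested quantisation types below are hypothetical inputs rather
# than values taken from a real conversion run).
if __name__ == "__main__":
    arch = ArchitectureDetector.map_architecture("gpt-oss", "GptOssForCausalLM")
    support = ArchitectureDetector.get_quantisation_support(arch)
    keep, skip = ArchitectureDetector.filter_quantisation_types(
        arch,
        ["Q4_0", "Q8_0", "Q4_K_M", "Q5_K_M"],
    )
    logger.info(f"GGUF architecture: {arch}")
    logger.info(f"Quantisation support: {support}")
    logger.info(f"Generating: {keep}; skipping: {skip}")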