llm-gguf-tools/helpers/llama_cpp/architecture.py
"""Architecture detection and support checking.
Determines whether model architectures are supported by llama.cpp
and provides fallback strategies for unsupported architectures.
"""
from __future__ import annotations
import subprocess
from typing import TYPE_CHECKING
from helpers.logger import logger
if TYPE_CHECKING:
from pathlib import Path
class ArchitectureDetector:
"""Detects and validates model architecture support.
Checks whether model architectures are supported by llama.cpp
for K-quant generation and determines appropriate quantisation
strategies for unsupported architectures.
"""
@staticmethod
def check_architecture_support(f16_model_path: Path) -> bool:
"""Check if the model architecture is supported by llama.cpp.
Tests the model's compatibility by attempting a quantisation with
llama.cpp. Returns true if the architecture is unsupported, indicating
that K-quants should be skipped.
Returns:
True if architecture is NOT supported (K-quants should be skipped)
"""
try:
# Attempt a quick quantisation with llama.cpp to probe architecture support
result = subprocess.run(
[
".cache/llm-gguf-tools/binaries/llama-quantize",
str(f16_model_path),
"/dev/null",
"Q4_K_M",
],
check=False,
capture_output=True,
text=True,
timeout=5,
)
# Check if it failed due to unknown architecture
return bool(result.stderr and "unknown model architecture" in result.stderr.lower())
except Exception:
# TimeoutExpired lands here too: a supported architecture usually outlives
# the 5-second probe, so when support cannot be determined, assume it works
return False
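# Illustrative usage (hypothetical model filename; assumes the llama-quantize
# binary referenced above has already been fetched):
#
#   from pathlib import Path
#   if ArchitectureDetector.check_architecture_support(Path("model-f16.gguf")):
#       logger.warning("Architecture unsupported by llama.cpp - skipping K-quants")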
@staticmethod
def get_supported_architectures() -> list[str]:
"""Get list of architectures known to be supported by llama.cpp.
Returns:
List of supported architecture names.
"""
return [
"llama",
"llama2",
"llama3",
"mistral",
"mixtral",
"qwen",
"qwen2",
"gemma",
"gemma2",
"phi",
"phi2",
"phi3",
"falcon",
"gpt2",
"gptj",
"gptneox",
"mpt",
"starcoder",
"starcoder2",
"baichuan",
"bert",
"bloom",
"deepseek",
"deepseek2",
"chatglm",
"orion",
"internlm2",
"minicpm",
"stablelm",
"cohere",
"dbrx",
"olmo",
"arctic",
"rwkv",
]
@staticmethod
def map_architecture(model_type: str, arch_name: str) -> str:
"""Map model architecture to GGUF architecture string.
Translates model type and architecture names from HuggingFace config
to GGUF-compatible architecture identifiers. Handles special cases like
"gpt-oss" to "gptoss" conversion and provides fallback mapping.
Returns:
GGUF architecture string to use.
"""
# Direct mappings from model_type
type_mappings = {
"llama": "llama",
"mistral": "llama", # Mistral uses llama architecture
"mixtral": "llama",
"qwen": "qwen",
"qwen2": "qwen2",
"gemma": "gemma",
"gemma2": "gemma2",
"phi": "phi2",
"phi3": "phi3",
"phi-msft": "phi2",
"falcon": "falcon",
"gpt2": "gpt2",
"gptj": "gptj",
"gpt_neox": "gptneox",
"gpt-oss": "gptoss",
"mpt": "mpt",
"starcoder": "starcoder",
"starcoder2": "starcoder2",
"baichuan": "baichuan",
"bloom": "bloom",
"chatglm": "chatglm",
"deepseek": "llama", # DeepSeek uses llama architecture
"stablelm": "stablelm",
"cohere": "cohere",
"dbrx": "dbrx",
"olmo": "olmo",
"arctic": "arctic",
}
# Check model_type first
if model_type in type_mappings:
return type_mappings[model_type]
# Architecture name mappings as fallback
arch_mappings = {
"LlamaForCausalLM": "llama",
"MistralForCausalLM": "llama",
"MixtralForCausalLM": "llama",
"Qwen2ForCausalLM": "qwen2",
"QwenForCausalLM": "qwen",
"GemmaForCausalLM": "gemma",
"Gemma2ForCausalLM": "gemma2",
"GptOssForCausalLM": "gptoss",
"PhiForCausalLM": "phi2",
"Phi3ForCausalLM": "phi3",
"FalconForCausalLM": "falcon",
"GPT2LMHeadModel": "gpt2",
"GPTJForCausalLM": "gptj",
"GPTNeoXForCausalLM": "gptneox",
"MPTForCausalLM": "mpt",
"BloomForCausalLM": "bloom",
"ChatGLMForCausalLM": "chatglm",
"StableLmForCausalLM": "stablelm",
"CohereForCausalLM": "cohere",
}
if arch_name in arch_mappings:
return arch_mappings[arch_name]
# Default fallback
logger.warning(f"Unknown architecture: {arch_name} (type: {model_type})")
logger.warning("Defaulting to 'llama' architecture - may not work correctly")
return "llama"
@staticmethod
def get_quantisation_support(architecture: str) -> dict[str, bool]:
"""Determine which quantisation types are supported for an architecture.
Evaluates architecture compatibility with different quantisation methods.
Basic quantisations are always supported via GGML, while K-quants and
imatrix require specific llama.cpp support.
Returns:
Dictionary mapping quantisation type categories to support status.
"""
# Known unsupported architectures for K-quants
unsupported_kquants = [
"bert",
"dotsocr", # Custom/unknown architectures
]
is_supported = architecture not in unsupported_kquants
return {
"basic": True, # Q4_0, Q5_0, Q6_0, Q8_0 always supported via GGML
"k_quants": is_supported, # K-quants require llama.cpp support
"imatrix": is_supported, # imatrix requires llama.cpp support
}
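# For instance, get_quantisation_support("bert") yields
# {"basic": True, "k_quants": False, "imatrix": False}, while a supported
# architecture such as "llama" yields True for all three categories.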
@staticmethod
def filter_quantisation_types(
architecture: str,
requested_types: list[str],
) -> tuple[list[str], list[str]]:
"""Filter quantisation types based on architecture support.
Separates requested quantisation types into supported and unsupported
based on the model's architecture capabilities. Basic types are always
supported, while K-quants depend on architecture compatibility.
Returns:
Tuple of (supported_types, skipped_types).
"""
support = ArchitectureDetector.get_quantisation_support(architecture)
basic_types = {"Q4_0", "Q5_0", "Q6_0", "Q8_0"}
supported = []
skipped = []
for quant_type in requested_types:
if quant_type in basic_types:
# Basic types always supported
supported.append(quant_type)
elif support["k_quants"]:
# K-quants supported for this architecture
supported.append(quant_type)
else:
# K-quants not supported
skipped.append(quant_type)
return supported, skipped
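
# Minimal end-to-end sketch (illustrative only: the model type, architecture
# name, and requested quantisation types below are hypothetical inputs rather
# than values taken from a real conversion run).
if __name__ == "__main__":
    arch = ArchitectureDetector.map_architecture("gpt-oss", "GptOssForCausalLM")
    support = ArchitectureDetector.get_quantisation_support(arch)
    keep, skip = ArchitectureDetector.filter_quantisation_types(
        arch,
        ["Q4_0", "Q8_0", "Q4_K_M", "Q5_K_M"],
    )
    logger.info(f"GGUF architecture: {arch}")
    logger.info(f"Quantisation support: {support}")
    logger.info(f"Generating: {keep}; skipping: {skip}")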