"""Architecture detection and support checking.
|
|
|
|
Determines whether model architectures are supported by llama.cpp
|
|
and provides fallback strategies for unsupported architectures.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import subprocess
|
|
from typing import TYPE_CHECKING
|
|
|
|
from helpers.logger import logger
|
|
|
|
if TYPE_CHECKING:
|
|
from pathlib import Path
|
|
|
|
|
|
class ArchitectureDetector:
    """Detects and validates model architecture support.

    Checks whether model architectures are supported by llama.cpp
    for K-quant generation and determines appropriate quantisation
    strategies for unsupported architectures.
    """

    @staticmethod
    def check_architecture_support(f16_model_path: Path) -> bool:
        """Check if the model architecture is supported by llama.cpp.

        Tests the model's compatibility by attempting a quantisation with
        llama.cpp. Returns True if the architecture is unsupported, indicating
        that K-quants should be skipped.

        Returns:
            True if the architecture is NOT supported (K-quants should be skipped).
        """
        try:
            # Attempt a trial quantisation with llama.cpp to check support
            result = subprocess.run(
                [
                    ".cache/llm-gguf-tools/binaries/llama-quantize",
                    str(f16_model_path),
                    "/dev/null",
                    "Q4_K_M",
                ],
                check=False,
                capture_output=True,
                text=True,
                timeout=5,
            )

            # Check whether it failed due to an unknown architecture
            return bool(result.stderr and "unknown model architecture" in result.stderr.lower())
        except Exception:
            # If support cannot be determined, assume the architecture might work
            return False

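    # Hedged usage sketch (the F16 model path below is hypothetical; the
    # llama-quantize binary path is the one hard-coded above). Note the
    # inverted return value: True means K-quants should be SKIPPED.
    #
    #     from pathlib import Path
    #
    #     f16 = Path("./work/model-f16.gguf")  # hypothetical path
    #     if ArchitectureDetector.check_architecture_support(f16):
    #         logger.warning("Unsupported architecture - skipping K-quants")
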
    @staticmethod
    def get_supported_architectures() -> list[str]:
        """Get list of architectures known to be supported by llama.cpp.

        Returns:
            List of supported architecture names.
        """
        return [
            "llama",
            "llama2",
            "llama3",
            "mistral",
            "mixtral",
            "qwen",
            "qwen2",
            "gemma",
            "gemma2",
            "phi",
            "phi2",
            "phi3",
            "falcon",
            "gpt2",
            "gptj",
            "gptneox",
            "mpt",
            "starcoder",
            "starcoder2",
            "baichuan",
            "bert",
            "bloom",
            "deepseek",
            "deepseek2",
            "chatglm",
            "orion",
            "internlm2",
            "minicpm",
            "stablelm",
            "cohere",
            "dbrx",
            "olmo",
            "arctic",
            "rwkv",
        ]

    @staticmethod
    def map_architecture(model_type: str, arch_name: str) -> str:
        """Map model architecture to GGUF architecture string.

        Translates model type and architecture names from the HuggingFace
        config to GGUF-compatible architecture identifiers. Handles special
        cases such as the "gpt-oss" to "gptoss" conversion and provides a
        fallback mapping.

        Returns:
            GGUF architecture string to use.
        """
        # Direct mappings from model_type
        type_mappings = {
            "llama": "llama",
            "mistral": "llama",  # Mistral uses the llama architecture
            "mixtral": "llama",
            "qwen": "qwen",
            "qwen2": "qwen2",
            "gemma": "gemma",
            "gemma2": "gemma2",
            "phi": "phi2",
            "phi3": "phi3",
            "phi-msft": "phi2",
            "falcon": "falcon",
            "gpt2": "gpt2",
            "gptj": "gptj",
            "gpt_neox": "gptneox",
            "gpt-oss": "gptoss",
            "mpt": "mpt",
            "starcoder": "starcoder",
            "starcoder2": "starcoder2",
            "baichuan": "baichuan",
            "bloom": "bloom",
            "chatglm": "chatglm",
            "deepseek": "llama",  # DeepSeek uses the llama architecture
            "stablelm": "stablelm",
            "cohere": "cohere",
            "dbrx": "dbrx",
            "olmo": "olmo",
            "arctic": "arctic",
        }

        # Check model_type first
        if model_type in type_mappings:
            return type_mappings[model_type]

        # Architecture name mappings as a fallback
        arch_mappings = {
            "LlamaForCausalLM": "llama",
            "MistralForCausalLM": "llama",
            "MixtralForCausalLM": "llama",
            "Qwen2ForCausalLM": "qwen2",
            "QwenForCausalLM": "qwen",
            "GemmaForCausalLM": "gemma",
            "Gemma2ForCausalLM": "gemma2",
            "GptOssForCausalLM": "gptoss",
            "PhiForCausalLM": "phi2",
            "Phi3ForCausalLM": "phi3",
            "FalconForCausalLM": "falcon",
            "GPT2LMHeadModel": "gpt2",
            "GPTJForCausalLM": "gptj",
            "GPTNeoXForCausalLM": "gptneox",
            "MPTForCausalLM": "mpt",
            "BloomForCausalLM": "bloom",
            "ChatGLMForCausalLM": "chatglm",
            "StableLmForCausalLM": "stablelm",
            "CohereForCausalLM": "cohere",
        }

        if arch_name in arch_mappings:
            return arch_mappings[arch_name]

        # Default fallback
        logger.warning(f"Unknown architecture: {arch_name} (type: {model_type})")
        logger.warning("Defaulting to 'llama' architecture - may not work correctly")
        return "llama"

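    # Illustrative calls (derived from the tables above): model_type wins
    # over arch_name, and unknown combinations fall back to "llama" with a
    # logged warning.
    #
    #     ArchitectureDetector.map_architecture("mistral", "MistralForCausalLM")
    #     # -> "llama" (Mistral reuses the llama graph)
    #     ArchitectureDetector.map_architecture("unknown", "NewForCausalLM")
    #     # -> "llama" (default fallback, with warnings)
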
    @staticmethod
    def get_quantisation_support(architecture: str) -> dict[str, bool]:
        """Determine which quantisation types are supported for an architecture.

        Evaluates architecture compatibility with different quantisation
        methods. Basic quantisations are always supported via GGML, while
        K-quants and imatrix require specific llama.cpp support.

        Returns:
            Dictionary mapping quantisation type categories to support status.
        """
        # Known unsupported architectures for K-quants
        unsupported_kquants = [
            "bert",
            "dotsocr",  # Custom/unknown architectures
        ]

        is_supported = architecture not in unsupported_kquants

        return {
            "basic": True,  # Q4_0, Q5_0, Q6_0, Q8_0 always supported via GGML
            "k_quants": is_supported,  # K-quants require llama.cpp support
            "imatrix": is_supported,  # imatrix requires llama.cpp support
        }

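    # Example capability map for an architecture on the K-quant denylist:
    #
    #     ArchitectureDetector.get_quantisation_support("bert")
    #     # -> {"basic": True, "k_quants": False, "imatrix": False}
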
    @staticmethod
    def filter_quantisation_types(
        architecture: str,
        requested_types: list[str],
    ) -> tuple[list[str], list[str]]:
        """Filter quantisation types based on architecture support.

        Separates requested quantisation types into supported and unsupported
        based on the model's architecture capabilities. Basic types are always
        supported, while K-quants depend on architecture compatibility.

        Returns:
            Tuple of (supported_types, skipped_types).
        """
        support = ArchitectureDetector.get_quantisation_support(architecture)
        basic_types = {"Q4_0", "Q5_0", "Q6_0", "Q8_0"}

        supported = []
        skipped = []

        for quant_type in requested_types:
            if quant_type in basic_types:
                # Basic types are always supported
                supported.append(quant_type)
            elif support["k_quants"]:
                # K-quants are supported for this architecture
                supported.append(quant_type)
            else:
                # K-quants are not supported
                skipped.append(quant_type)

        return supported, skipped
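

# Minimal end-to-end sketch (the architecture and requested types are
# illustrative, not taken from any particular model):
if __name__ == "__main__":
    requested = ["Q4_0", "Q4_K_M", "Q5_K_M", "Q8_0"]
    supported, skipped = ArchitectureDetector.filter_quantisation_types("bert", requested)
    # "bert" is on the K-quant denylist, so K-quants are skipped:
    # supported == ["Q4_0", "Q8_0"], skipped == ["Q4_K_M", "Q5_K_M"]
    logger.info(f"Supported: {supported}, skipped: {skipped}")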