"""Python API wrapper for llama-cpp-python quantisation operations. Provides high-level Python interfaces for model quantisation using llama-cpp-python bindings. Implements partial tensor-specific quantisation support through embedding and output tensor type configuration. """ from __future__ import annotations import ctypes import gc import logging import os import signal import sys import traceback from typing import TYPE_CHECKING, Any, ClassVar, Never import psutil from helpers.logger import logger from helpers.services.gguf import GGUFConverter from helpers.utils.config_parser import ConfigParser from helpers.utils.tensor_mapping import TensorMapper if TYPE_CHECKING: from pathlib import Path from helpers.models.quantisation import QuantisationConfig # Import llama_cpp when needed try: import llama_cpp from llama_cpp import llama_model_quantize_params LLAMA_CPP_AVAILABLE = True except ImportError: LLAMA_CPP_AVAILABLE = False logger.warning("llama-cpp-python not available - falling back to binary mode") class LlamaCppPythonAPI: """Python API wrapper for llama.cpp quantisation operations. Provides direct Python access to quantisation functionality using llama-cpp-python bindings. Implements partial tensor-specific quantisation through token embedding and output tensor type configuration, which provides differentiation between Q4_K variants even without full per-layer tensor control. """ # Mapping of custom variant prefixes to their base types VARIANT_BASE_MAPPING: ClassVar[dict[str, str]] = { "Q3_K_": "Q3_K_M", "Q4_K_": "Q4_K_M", "Q5_K_": "Q5_K_M", "Q6_K_": "Q6_K", } @staticmethod def is_available() -> bool: """Check if llama-cpp-python is available for use. Returns: True if llama-cpp-python bindings are installed and functional. """ return LLAMA_CPP_AVAILABLE @staticmethod def get_quantisation_type(config_name: str) -> int: """Map configuration name to llama_cpp quantisation type constant. Supports a wide range of quantisation types from Q2 to Q8, including K-quants and legacy formats. Handles both simple formats (Q4_K_M, Q6_K) and custom suffixed variants (Q4_K_M_L, Q5_K_M_XL) by mapping them to their base types for llama-cpp-python compatibility. Returns: llama_cpp quantisation type constant for base quantisation. Raises: RuntimeError: If llama-cpp-python is not available. ValueError: If the quantisation type is not supported. 
""" if not LLAMA_CPP_AVAILABLE: msg = "llama-cpp-python not available" raise RuntimeError(msg) # Normalise the config name to extract base type # E.g., "Q4_K_L" or "Q4_K_XL" -> "Q4_K_M" (default for Q4_K) # E.g., "Q4_K_M_XXL" -> "Q4_K_M" config_upper = config_name.upper() # Direct mapping for exact matches type_mapping = { # Q2 variants (not recommended but supported) "Q2_K": llama_cpp.LLAMA_FTYPE_MOSTLY_Q2_K, "Q2_K_S": llama_cpp.LLAMA_FTYPE_MOSTLY_Q2_K_S, # Q3 K-quants "Q3_K_S": llama_cpp.LLAMA_FTYPE_MOSTLY_Q3_K_S, "Q3_K_M": llama_cpp.LLAMA_FTYPE_MOSTLY_Q3_K_M, # Q4 K-quants (most common) "Q4_K_S": llama_cpp.LLAMA_FTYPE_MOSTLY_Q4_K_S, "Q4_K_M": llama_cpp.LLAMA_FTYPE_MOSTLY_Q4_K_M, # Q5 K-quants "Q5_K_S": llama_cpp.LLAMA_FTYPE_MOSTLY_Q5_K_S, "Q5_K_M": llama_cpp.LLAMA_FTYPE_MOSTLY_Q5_K_M, # Q6_K (single variant) "Q6_K": llama_cpp.LLAMA_FTYPE_MOSTLY_Q6_K, # Q8_0 (highest common quantisation) "Q8_0": llama_cpp.LLAMA_FTYPE_MOSTLY_Q8_0, # Legacy quantisation formats "Q4_0": llama_cpp.LLAMA_FTYPE_MOSTLY_Q4_0, "Q4_1": llama_cpp.LLAMA_FTYPE_MOSTLY_Q4_1, "Q5_0": llama_cpp.LLAMA_FTYPE_MOSTLY_Q5_0, "Q5_1": llama_cpp.LLAMA_FTYPE_MOSTLY_Q5_1, # IQ (Integer Quantisation) variants - experimental "IQ2_XXS": llama_cpp.LLAMA_FTYPE_MOSTLY_IQ2_XXS, "IQ2_XS": llama_cpp.LLAMA_FTYPE_MOSTLY_IQ2_XS, "IQ2_S": llama_cpp.LLAMA_FTYPE_MOSTLY_IQ2_S, "IQ2_M": llama_cpp.LLAMA_FTYPE_MOSTLY_IQ2_M, "IQ3_XXS": llama_cpp.LLAMA_FTYPE_MOSTLY_IQ3_XXS, "IQ3_XS": llama_cpp.LLAMA_FTYPE_MOSTLY_IQ3_XS, "IQ3_S": llama_cpp.LLAMA_FTYPE_MOSTLY_IQ3_S, "IQ3_M": llama_cpp.LLAMA_FTYPE_MOSTLY_IQ3_M, "IQ4_NL": llama_cpp.LLAMA_FTYPE_MOSTLY_IQ4_NL, "IQ4_XS": llama_cpp.LLAMA_FTYPE_MOSTLY_IQ4_XS, # Higher precision formats "F16": llama_cpp.LLAMA_FTYPE_MOSTLY_F16, "BF16": llama_cpp.LLAMA_FTYPE_MOSTLY_BF16, } # Try direct lookup first if config_upper in type_mapping: return type_mapping[config_upper] # Handle custom variants using base mapping for prefix, base_type in LlamaCppPythonAPI.VARIANT_BASE_MAPPING.items(): if config_upper.startswith(prefix) and config_upper not in type_mapping: return type_mapping[base_type] # If not found, raise an informative error supported = sorted(type_mapping.keys()) msg = ( f"Unsupported quantisation type: {config_name}\n" f"Supported types: {', '.join(supported)}\n" f"Custom variants like Q4_K_L, Q4_K_XL are also supported." ) raise ValueError(msg) @staticmethod def get_tensor_type_value(type_name: str) -> int: """Convert tensor type name to llama_cpp constant. Maps string tensor type names to their corresponding llama_cpp integer constants for tensor-specific overrides. Provides the foundation for differentiated quantisation strategies across embedding and output layers. Returns: Integer value for the tensor type, or 0 if not found. """ if not LLAMA_CPP_AVAILABLE: return 0 # Build mapping with variant consolidation # All Q3_K variants map to base Q3_K type, same for Q4_K and Q5_K type_mapping = LlamaCppPythonAPI._build_tensor_type_mapping() return type_mapping.get(type_name.upper(), 0) @staticmethod def _build_tensor_type_mapping() -> dict[str, int]: """Build tensor type mapping with variant consolidation. Returns: Dictionary mapping type names to GGML constants. 
""" if not LLAMA_CPP_AVAILABLE: return {} # Base mappings return { # Q2 variants "Q2_K": llama_cpp.GGML_TYPE_Q2_K, # Q3 variants - all map to base Q3_K "Q3_K": llama_cpp.GGML_TYPE_Q3_K, "Q3_K_S": llama_cpp.GGML_TYPE_Q3_K, "Q3_K_M": llama_cpp.GGML_TYPE_Q3_K, "Q3_K_L": llama_cpp.GGML_TYPE_Q3_K, # Q4 variants "Q4_0": llama_cpp.GGML_TYPE_Q4_0, "Q4_1": llama_cpp.GGML_TYPE_Q4_1, "Q4_K": llama_cpp.GGML_TYPE_Q4_K, "Q4_K_S": llama_cpp.GGML_TYPE_Q4_K, "Q4_K_M": llama_cpp.GGML_TYPE_Q4_K, # Q5 variants "Q5_0": llama_cpp.GGML_TYPE_Q5_0, "Q5_1": llama_cpp.GGML_TYPE_Q5_1, "Q5_K": llama_cpp.GGML_TYPE_Q5_K, "Q5_K_S": llama_cpp.GGML_TYPE_Q5_K, "Q5_K_M": llama_cpp.GGML_TYPE_Q5_K, # Q6 variant "Q6_K": llama_cpp.GGML_TYPE_Q6_K, # Q8 variant "Q8_0": llama_cpp.GGML_TYPE_Q8_0, # Higher precision "F16": llama_cpp.GGML_TYPE_F16, "F32": llama_cpp.GGML_TYPE_F32, } def quantise_model_flexible( self, input_path: Path, output_path: Path, base_type: str, embedding_type: str | None = None, output_type: str | None = None, imatrix_path: Path | None = None, ) -> bool: """Quantise model with flexible tensor type configuration. Provides control over base quantisation type with optional overrides for embeddings and output layers, which are the only tensor-specific controls that work reliably with llama-cpp-python. Args: input_path: Path to input GGUF model. output_path: Path for output quantised model. base_type: Base quantisation type (e.g., "Q4_K_M", "Q6_K"). embedding_type: Override for token embeddings (None = use base). output_type: Override for output/lm_head layers (None = use base). imatrix_path: Optional importance matrix file. Returns: True if quantisation successful, False otherwise. Examples: # Q4_K_L: Q4_K_M base with Q8_0 embeddings api.quantise_model_flexible( input_path, output_path, "Q4_K_M", embedding_type="Q8_0" ) # Q3_K_L: Q3_K_M base with Q5_K output api.quantise_model_flexible( input_path, output_path, "Q3_K_M", output_type="Q5_K" ) # Q3_K_XL: Q3_K_M with both Q8_0 embeddings and Q5_K output api.quantise_model_flexible( input_path, output_path, "Q3_K_M", embedding_type="Q8_0", output_type="Q5_K" ) Raises: RuntimeError: If llama-cpp-python is not available. 
""" if not LLAMA_CPP_AVAILABLE: msg = "llama-cpp-python not available for quantisation" raise RuntimeError(msg) logger.info(f"🔄 Flexible quantisation: {base_type} base") logger.info(f"📝 Input: {input_path}") logger.info(f"📝 Output: {output_path}") # Setup phase - create and configure parameters params = self._create_params(base_type, imatrix_path) self._apply_tensor_overrides(params, embedding_type, output_type) # Execution phase - perform quantisation try: logger.debug("DEBUG: Starting flexible quantisation execution") result = self._do_quantisation(input_path, output_path, params) logger.debug(f"DEBUG: Flexible quantisation returned: {result}") except Exception as e: logger.error(f"❌ Flexible quantisation failed with exception: {e}") logger.error("Flexible quantisation traceback:") for line in traceback.format_exc().splitlines(): logger.error(f" {line}") return False else: if result == 0: # Verify output file was created and is valid if not output_path.exists(): logger.error( f"❌ Quantisation claimed success but output does not exist: {output_path}" ) return False try: output_size = output_path.stat().st_size logger.debug(f"DEBUG: Output file size: {output_size / (1024**3):.2f} GB") if output_size == 0: logger.error("❌ Output file is empty despite success code") return False except Exception as e: logger.warning(f"⚠️ Could not check output file size: {e}") logger.info(f"✅ Quantisation successful: {output_path.name}") return True logger.error(f"❌ Quantisation failed with code: {result}") return False def _create_params( self, base_type: str, imatrix_path: Path | None ) -> llama_model_quantize_params: """Create quantisation parameters. Returns: Configured quantisation parameters. """ params = llama_model_quantize_params() params.ftype = self.get_quantisation_type(base_type) params.nthread = 8 params.allow_requantize = True if imatrix_path and imatrix_path.exists(): # Convert path to bytes and create c_char_p, then cast to c_void_p imatrix_bytes = str(imatrix_path).encode("utf-8") char_p = ctypes.c_char_p(imatrix_bytes) params.imatrix = ctypes.cast(char_p, ctypes.c_void_p) logger.info(f"🧮 Using imatrix: {imatrix_path.name}") return params def _apply_tensor_overrides( self, params: llama_model_quantize_params, embedding_type: str | None, output_type: str | None, ) -> None: """Apply embedding and output tensor type overrides to params. These are the only tensor-specific controls that work reliably with llama-cpp-python. """ # Apply embedding override if specified if embedding_type: params.token_embedding_type = self.get_tensor_type_value(embedding_type) logger.info(f"⚙️ Token embedding type: {embedding_type}") # Apply output override if specified if output_type: params.output_tensor_type = self.get_tensor_type_value(output_type) params.quantize_output_tensor = True logger.info(f"⚙️ Output tensor type: {output_type}") def _do_quantisation( self, input_path: Path, output_path: Path, params: llama_model_quantize_params, ) -> int: """Perform the quantisation operation. Returns: Return code (0 for success). Raises: KeyboardInterrupt: If the user interrupts the quantisation process. SystemExit: If the system exits during quantisation. 
""" logger.debug("DEBUG: Calling llama_cpp.llama_model_quantize") try: # Flush any pending output before calling C library sys.stdout.flush() sys.stderr.flush() # Temporarily redirect stderr to prevent terminal control issues # Some GGUF models output control sequences that can break the terminal old_stderr_fd = None devnull_fd = None try: # Only redirect if not in debug mode to preserve error messages if not logger.isEnabledFor(logging.DEBUG): old_stderr_fd = os.dup(2) # Save current stderr devnull_fd = os.open(os.devnull, os.O_WRONLY) os.dup2(devnull_fd, 2) # Redirect stderr to /dev/null # Call the quantization with proper exception handling result = llama_cpp.llama_model_quantize( str(input_path).encode("utf-8"), str(output_path).encode("utf-8"), params ) finally: # Restore stderr if we redirected it if old_stderr_fd is not None: os.dup2(old_stderr_fd, 2) os.close(old_stderr_fd) if devnull_fd is not None: os.close(devnull_fd) # Flush output after the call sys.stdout.flush() sys.stderr.flush() except KeyboardInterrupt: logger.error("❌ Quantisation interrupted by user") raise except SystemExit as e: logger.error(f"❌ System exit during quantisation: {e}") raise except Exception as e: logger.error(f"❌ llama_model_quantize call failed: {e}") logger.error("llama_model_quantize call traceback:") for line in traceback.format_exc().splitlines(): logger.error(f" {line}") raise else: logger.debug(f"DEBUG: llama_model_quantize completed with code: {result}") return result def quantise_model( self, input_path: Path, output_path: Path, config: QuantisationConfig, imatrix_path: Path | None = None, ) -> bool: """Quantise model using Python API. Performs quantisation using llama-cpp-python's direct API access with support for embedding and output tensor type overrides. The L and XL variants use a base type with specific overrides. Returns: True if quantisation successful, False otherwise. Raises: RuntimeError: If llama-cpp-python is not available. """ if not LLAMA_CPP_AVAILABLE: msg = "llama-cpp-python not available for quantisation" raise RuntimeError(msg) # Force cleanup before starting gc.collect() # Log initial resource state mem_before = self._log_resource_state("before") try: # Validate input if not self._validate_input_file(input_path): return False # Setup parameters params = self._setup_quantisation_params(config, imatrix_path) if params is None: return False # Execute quantisation result = self._execute_quantisation(input_path, output_path, params) # Verify and finalize if result == 0: return self._finalize_successful_quantisation(output_path, mem_before) logger.error(f"❌ Quantisation failed with code: {result}") except Exception as e: logger.error(f"❌ Quantisation failed with exception: {e}") logger.error("Full quantisation traceback:") for line in traceback.format_exc().splitlines(): logger.error(f" {line}") # Garbage collect and return false gc.collect() return False def _log_resource_state(self, phase: str) -> float: """Log current resource usage state. Args: phase: Description of current phase (e.g., "before", "after"). Returns: Current memory usage in GB. """ process = psutil.Process() memory_gb = process.memory_info().rss / (1024**3) logger.debug(f"DEBUG: Memory {phase} quantisation: {memory_gb:.2f} GB") logger.debug(f"DEBUG: Open file descriptors: {len(process.open_files())}") if phase == "before": logger.debug(f"DEBUG: Process PID: {process.pid}") return memory_gb def _validate_input_file(self, input_path: Path) -> bool: """Validate input file exists and is readable. 

        Args:
            input_path: Path to input file.

        Returns:
            True if file is valid, False otherwise.
        """
        logger.debug(f"DEBUG: Starting quantisation of {input_path.name}")
        logger.info(f"🔄 Quantising {input_path.name}...")
        logger.debug(f"DEBUG: Input: {input_path}")

        if not input_path.exists():
            logger.error(f"❌ Input file does not exist: {input_path}")
            return False

        if not input_path.is_file():
            logger.error(f"❌ Input path is not a file: {input_path}")
            return False

        try:
            input_size = input_path.stat().st_size
            logger.debug(f"DEBUG: Input file size: {input_size / (1024**3):.2f} GB")
            if input_size == 0:
                logger.error("❌ Input file is empty")
                return False
        except Exception as e:
            logger.warning(f"⚠️ Could not check input file size: {e}")

        return True

    def _setup_quantisation_params(
        self,
        config: QuantisationConfig,
        imatrix_path: Path | None,
    ) -> llama_model_quantize_params | None:
        """Set up quantisation parameters.

        Args:
            config: Quantisation configuration.
            imatrix_path: Optional path to importance matrix.

        Returns:
            Configured parameters, or None if setup failed.
        """
        logger.debug("DEBUG: Setting up quantisation parameters")
        params = llama_model_quantize_params()

        # Set base quantisation type
        try:
            params.ftype = self.get_quantisation_type(config.base_type)
            logger.debug(
                f"DEBUG: Set ftype to {params.ftype} for {config.base_type} (config: {config.name})"
            )
        except Exception as e:
            logger.error(f"❌ Failed to get quantisation type for {config.name}: {e}")
            return None

        # Configure basic parameters
        params.nthread = 8
        params.allow_requantize = True
        logger.debug(
            f"DEBUG: Set nthread={params.nthread}, allow_requantize={params.allow_requantize}"
        )

        # Add imatrix if available
        if imatrix_path and imatrix_path.exists():
            try:
                # Convert path to bytes and create c_char_p, then cast to c_void_p
                imatrix_bytes = str(imatrix_path).encode("utf-8")
                char_p = ctypes.c_char_p(imatrix_bytes)
                params.imatrix = ctypes.cast(char_p, ctypes.c_void_p)
                logger.info(f"🧮 Using imatrix: {imatrix_path.name}")
                logger.debug(f"DEBUG: imatrix path set: {imatrix_path}")
            except Exception as e:
                logger.error(f"❌ Failed to set imatrix: {e}")
                # Continue without imatrix

        # Configure tensor-specific types
        logger.debug("DEBUG: Configuring tensor-specific types")
        try:
            self._configure_tensor_types(params, config)
            logger.debug("DEBUG: Tensor types configured successfully")
        except Exception as e:
            logger.error(f"❌ Failed to configure tensor types: {e}")
            logger.error("Tensor type configuration traceback:")
            for line in traceback.format_exc().splitlines():
                logger.error(f" {line}")
            # Continue with default types

        return params

    def _execute_quantisation(
        self,
        input_path: Path,
        output_path: Path,
        params: llama_model_quantize_params,
    ) -> int:
        """Execute the actual quantisation with signal handling.

        Args:
            input_path: Path to input model.
            output_path: Path for output model.
            params: Configured quantisation parameters.

        Returns:
            Return code from quantisation (0 for success).
""" logger.debug("DEBUG: Starting llama_cpp.llama_model_quantize call") logger.debug("DEBUG: About to call llama_model_quantize...") # Setup signal handlers old_handlers = self._setup_signal_handlers() try: result = llama_cpp.llama_model_quantize( str(input_path).encode("utf-8"), str(output_path).encode("utf-8"), params ) logger.debug(f"DEBUG: llama_model_quantize returned: {result}") except Exception as e: logger.error(f"❌ llama_model_quantize raised exception: {e}") logger.error("llama_model_quantize traceback:") for line in traceback.format_exc().splitlines(): logger.error(f" {line}") return -1 else: return result finally: self._restore_signal_handlers(old_handlers) def _setup_signal_handlers(self) -> tuple[Any, Any | None]: """Setup signal handlers for debugging termination. Returns: Tuple of (old_sigterm, old_sigsegv) handlers. """ def signal_debug_handler(signum: int, frame: object) -> Never: # noqa: ARG001 logger.error(f"DEBUG: Received signal {signum} during quantisation!") logger.error(f"DEBUG: Signal name: {signal.Signals(signum).name}") msg = f"Signal {signum} received" raise KeyboardInterrupt(msg) old_sigterm = signal.signal(signal.SIGTERM, signal_debug_handler) old_sigsegv = ( signal.signal(signal.SIGSEGV, signal_debug_handler) if hasattr(signal, "SIGSEGV") else None ) return old_sigterm, old_sigsegv def _restore_signal_handlers(self, handlers: tuple[Any, Any | None]) -> None: """Restore original signal handlers. Args: handlers: Tuple of (old_sigterm, old_sigsegv) handlers. """ old_sigterm, old_sigsegv = handlers signal.signal(signal.SIGTERM, old_sigterm) if old_sigsegv is not None: signal.signal(signal.SIGSEGV, old_sigsegv) def _finalize_successful_quantisation( self, output_path: Path, mem_before: float, ) -> bool: """Finalize successful quantisation and verify output. Args: output_path: Path to output file. mem_before: Memory usage before quantisation in GB. Returns: True if output is valid, False otherwise. """ logger.debug("DEBUG: Quantisation returned success code") # Verify output exists if not output_path.exists(): logger.error( f"❌ Quantisation claimed success but output does not exist: {output_path}" ) return False # Verify output size output_size = output_path.stat().st_size logger.debug(f"DEBUG: Output file size: {output_size / (1024**3):.2f} GB") if output_size == 0: logger.error("❌ Output file is empty despite success code") return False logger.info(f"✅ Quantisation successful: {output_path.name}") # Force cleanup and log final state gc.collect() mem_after = self._log_resource_state("after") logger.debug(f"DEBUG: Memory delta: {mem_after - mem_before:+.2f} GB") return True def _configure_tensor_types( self, params: llama_model_quantize_params, config: QuantisationConfig ) -> None: """Configure tensor-specific quantisation types. Sets embedding and output tensor type overrides based on config. These are the only tensor-specific controls that work reliably with llama-cpp-python. 
""" logger.debug(f"DEBUG: _configure_tensor_types called for {config.name}") # Apply embedding override if specified if config.embedding_type: params.token_embedding_type = self.get_tensor_type_value(config.embedding_type) logger.info(f"⚙️ Token embedding type: {config.embedding_type}") # Apply output override if specified if config.output_type: params.output_tensor_type = self.get_tensor_type_value(config.output_type) params.quantize_output_tensor = True logger.info(f"⚙️ Output tensor type: {config.output_type}") def convert_hf_to_gguf( self, input_dir: Path, output_path: Path, output_type: str = "f16" ) -> bool: """Convert HuggingFace model to GGUF format using native Python converter. Uses our GGUFConverter for SafeTensors models, providing full Python-based conversion without external dependencies. Returns: True if conversion successful, False otherwise. """ logger.info(f"🔄 Converting {input_dir.name} to GGUF format...") logger.info(f"📝 Input: {input_dir}") logger.info(f"📝 Output: {output_path}") logger.info(f"📝 Type: {output_type}") # Check for SafeTensors files safetensor_files = list(input_dir.glob("*.safetensors")) if not safetensor_files: logger.warning("⚠️ No SafeTensors files found in model directory") return False try: # Load model configuration config_parser = ConfigParser() model_config = config_parser.load_model_config(input_dir) # Get architecture mapping arch_name = model_config.architectures[0] if model_config.architectures else "llama" arch = config_parser.get_architecture_mapping(arch_name) if arch != arch_name: logger.info(f"📝 Architecture mapping: {arch_name} → {arch}") # Convert using GGUFConverter tensor_mapper = TensorMapper() success = GGUFConverter.convert_safetensors( input_dir, output_path, model_config, arch, tensor_mapper ) except Exception as e: logger.error(f"❌ Conversion failed with exception: {e}") return False else: if success: logger.info("✅ Native Python conversion successful") return success