llm-gguf-tools/helpers/llama_cpp/quantiser.py

"""Direct llama.cpp quantisation execution.
Provides direct execution of llama.cpp quantisation binary with proper
tensor-specific override support for L and XL variants.
"""
from __future__ import annotations
import os
import platform
import subprocess
from pathlib import Path
from typing import TYPE_CHECKING
from helpers.filesystem import FilesystemService
from helpers.llama_cpp.binary_manager import BinaryManager
from helpers.logger import logger
if TYPE_CHECKING:
from helpers.models.quantisation import QuantisationConfig
class QuantisationExecutor:
    """Executes llama.cpp quantisation with tensor overrides.

    Provides direct binary execution with proper command-line flags for
    tensor-specific overrides, supporting Bartowski-style L and XL variants.
    """

    def __init__(self) -> None:
        """Initialise quantisation executor."""
        self.fs = FilesystemService()
        self.binary_manager = BinaryManager()
        self.quantise_binary = self._get_quantise_binary()
        self.last_error: str | None = None  # Track last error type

    def _get_quantise_binary(self) -> Path | None:
        """Get llama-quantize binary, downloading if necessary.

        Returns:
            Path to binary if found, None otherwise.
        """
        # First check local directory for manual placement
        local_binary = Path("./llama-quantize")
        if local_binary.exists():
            logger.info(f"Using local llama-quantize binary: {local_binary}")
            return local_binary

        # Download from GitHub releases
        binary_path = self.binary_manager.get_quantise_binary()
        if binary_path and self.binary_manager.check_binary_works(binary_path):
            logger.info(f"Using llama-quantize binary: {binary_path}")
            return binary_path

        logger.error("Failed to obtain llama-quantize binary")
        logger.info(
            "You can manually download it from: https://github.com/ggml-org/llama.cpp/releases"
        )
        return None
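
    # Note: a binary placed manually at ./llama-quantize is trusted as-is; only the
    # binary downloaded by BinaryManager is verified with check_binary_works().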

    def execute_quantisation(
        self,
        input_path: Path,
        output_path: Path,
        config: QuantisationConfig,
        imatrix_path: Path | None = None,
    ) -> bool:
        """Execute quantisation using llama.cpp binary.

        Builds and executes llama-quantize command with proper tensor override
        flags for L and XL variants.

        Returns:
            True if quantisation successful, False otherwise.
        """
        if not self.quantise_binary:
            logger.error("llama-quantize binary not available")
            return False

        # Build command
        cmd = self._build_quantisation_command(input_path, output_path, config, imatrix_path)

        # Execute with real-time output
        return self._execute_command(cmd)

    def _build_quantisation_command(
        self,
        input_path: Path,
        output_path: Path,
        config: QuantisationConfig,
        imatrix_path: Path | None,
    ) -> list[str]:
        """Build llama-quantize command with tensor overrides.

        Returns:
            Command arguments as list.
        """
        cmd = [str(self.quantise_binary)]

        # Add imatrix if available
        if imatrix_path:
            cmd.extend(["--imatrix", str(imatrix_path)])

        # Add tensor overrides for L and XL variants
        if config.output_type:
            cmd.extend(["--output-tensor-type", config.output_type])
        if config.embedding_type:
            cmd.extend(["--token-embedding-type", config.embedding_type])

        # Add input, output, and quantisation type
        cmd.extend([str(input_path), str(output_path), config.base_type])

        return cmd
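
    # For illustration only (file names and types below are hypothetical), an XL-style
    # variant with an importance matrix would yield a command along the lines of:
    #
    #   llama-quantize --imatrix imatrix.dat \
    #       --output-tensor-type Q8_0 --token-embedding-type Q8_0 \
    #       model-f16.gguf model-Q5_K_XL.gguf Q5_K_M
    #
    # i.e. optional flags first, then the positional input path, output path, and
    # base quantisation type expected by llama-quantize.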

    def _setup_environment(self) -> dict[str, str]:
        """Set up environment variables for quantisation command.

        Returns:
            Environment dictionary with necessary library paths.
        """
        env = os.environ.copy()
        if platform.system() != "Windows":
            lib_path = str(self.binary_manager.BINARY_DIR)
            if "LD_LIBRARY_PATH" in env:
                env["LD_LIBRARY_PATH"] = f"{lib_path}:{env['LD_LIBRARY_PATH']}"
            else:
                env["LD_LIBRARY_PATH"] = lib_path
        return env
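
    # Prepending BinaryManager.BINARY_DIR to LD_LIBRARY_PATH lets the dynamic linker
    # find any shared libraries the managed download places alongside llama-quantize;
    # on Windows, DLLs next to the executable are found without this step.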

    def _process_output_stream(self, process: subprocess.Popen) -> tuple[list[str], bool]:
        """Process subprocess output stream and detect errors.

        Returns:
            Tuple of (output_lines, architecture_error_detected).
        """
        output_lines = []
        architecture_error = False

        if process.stdout:
            for line in iter(process.stdout.readline, ""):
                if line:
                    cleaned_line = line.rstrip()
                    output_lines.append(cleaned_line)
                    logger.info(f" {cleaned_line}")

                    # Check for architecture errors
                    if any(
                        error_text in cleaned_line.lower()
                        for error_text in [
                            "unknown model architecture",
                            "unsupported architecture",
                            "unknown architecture",
                            "architecture not supported",
                            "model architecture",
                            "llama_model_load: error loading model",
                        ]
                    ):
                        architecture_error = True

        return output_lines, architecture_error
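
    # The match above is substring-based and case-insensitive, so any output line
    # containing a phrase such as "unknown model architecture" flags the run as a
    # probable architecture failure; the exit code still decides overall success.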

    def _handle_architecture_error(self, output_lines: list[str]) -> bool:
        """Handle architecture-related errors by checking output.

        Returns:
            True if architecture error was detected and handled.
        """
        # Look for architecture info in recent output
        for line in output_lines[-10:]:  # Check last 10 lines
            if "architecture" in line.lower():
                logger.error("❌ Architecture not supported by current llama.cpp,")
                logger.error("   so it cannot be quantised, but the F16 GGUF file")
                logger.error("   can still be used for inference where supported")
                # Store this for the orchestrator to detect
                self.last_error = "unsupported_architecture"
                return True
        return False
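
    # last_error is left as a simple string flag ("unsupported_architecture") so the
    # calling orchestrator can distinguish this case from a generic quantisation failure.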

    def _execute_command(self, cmd: list[str]) -> bool:
        """Execute command with real-time output streaming.

        Returns:
            True if successful, False otherwise.
        """
        try:
            logger.info(f"🔧 Executing: {' '.join(cmd)}")

            env = self._setup_environment()

            # Execute with real-time output (stderr merged into stdout, line-buffered)
            process = subprocess.Popen(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                text=True,
                bufsize=1,
                env=env,
            )

            output_lines, architecture_error = self._process_output_stream(process)

            # Wait for the process to exit so the return code is reliable
            return_code = process.wait()

            if return_code == 0:
                logger.info("✅ Quantisation successful!")
                return True

            # Check if this was an architecture error
            if (architecture_error or return_code == 1) and self._handle_architecture_error(
                output_lines
            ):
                return False

            logger.error(f"❌ Quantisation failed with return code {return_code}")
        except Exception as e:
            logger.error(f"❌ Quantisation failed with exception: {e}")

        return False
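
# Example usage (illustrative sketch; the paths and the QuantisationConfig fields shown
# are hypothetical placeholders rather than part of this module):
#
#     from pathlib import Path
#
#     executor = QuantisationExecutor()
#     ok = executor.execute_quantisation(
#         input_path=Path("model-f16.gguf"),
#         output_path=Path("model-Q4_K_M.gguf"),
#         config=config,                     # a QuantisationConfig, e.g. base_type="Q4_K_M"
#         imatrix_path=Path("imatrix.dat"),  # optional importance matrix
#     )
#     if not ok and executor.last_error == "unsupported_architecture":
#         # llama.cpp cannot quantise this architecture; fall back to the F16 GGUF
#         ...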