llm-gguf-tools/helpers/services/llama_cpp.py
"""Direct llama.cpp binary execution service.
Provides direct execution of llama.cpp quantisation binary with proper
tensor-specific override support for L and XL variants.
"""
from __future__ import annotations
import os
import platform
import subprocess
from pathlib import Path
from typing import TYPE_CHECKING
from helpers.logger import logger
from helpers.services.binary_manager import BinaryManager
from helpers.services.filesystem import FilesystemService
if TYPE_CHECKING:
from helpers.models.quantisation import QuantisationConfig


class QuantisationExecutor:
    """Executes llama.cpp quantisation with tensor overrides.

    Provides direct binary execution with proper command-line flags for
    tensor-specific overrides, supporting Bartowski-style L and XL variants.
    """

    def __init__(self) -> None:
        """Initialise quantisation executor."""
        self.fs = FilesystemService()
        self.binary_manager = BinaryManager()
        self.quantise_binary = self._get_quantise_binary()
        self.last_error: str | None = None  # Track last error type
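        # Note: last_error is only ever set to "unsupported_architecture" (see
        # _execute_command below) so callers can distinguish unsupported models
        # from other quantisation failures.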

    def _get_quantise_binary(self) -> Path | None:
        """Get llama-quantize binary, downloading if necessary.

        Returns:
            Path to binary if found, None otherwise.
        """
        # First check local directory for manual placement
        local_binary = Path("./llama-quantize")
        if local_binary.exists():
            logger.info(f"Using local llama-quantize binary: {local_binary}")
            return local_binary

        # Download from GitHub releases
        binary_path = self.binary_manager.get_quantise_binary()
        if binary_path and self.binary_manager.check_binary_works(binary_path):
            logger.info(f"Using llama-quantize binary: {binary_path}")
            return binary_path

        logger.error("Failed to obtain llama-quantize binary")
        logger.info(
            "You can manually download it from: https://github.com/ggml-org/llama.cpp/releases"
        )
        return None

    def execute_quantisation(
        self,
        input_path: Path,
        output_path: Path,
        config: QuantisationConfig,
        imatrix_path: Path | None = None,
    ) -> bool:
        """Execute quantisation using the llama.cpp binary.

        Builds and executes the llama-quantize command with the proper tensor
        override flags for L and XL variants.

        Returns:
            True if quantisation successful, False otherwise.
        """
        if not self.quantise_binary:
            logger.error("llama-quantize binary not available")
            return False

        # Build command
        cmd = self._build_quantisation_command(input_path, output_path, config, imatrix_path)

        # Execute with real-time output
        return self._execute_command(cmd)

    def _build_quantisation_command(
        self,
        input_path: Path,
        output_path: Path,
        config: QuantisationConfig,
        imatrix_path: Path | None,
    ) -> list[str]:
        """Build llama-quantize command with tensor overrides.

        Returns:
            Command arguments as list.
        """
        cmd = [str(self.quantise_binary)]

        # Add imatrix if available
        if imatrix_path:
            cmd.extend(["--imatrix", str(imatrix_path)])
            if imatrix_path.exists():
                logger.info(f"🧮 Using imatrix: {imatrix_path.name}")

        # Add tensor-specific overrides for L and XL variants
        if config.embedding_type:
            # Use directly from config - already in correct format
            cmd.extend(["--token-embedding-type", config.embedding_type.lower()])
            logger.info(f"⚙️ Token embedding type: {config.embedding_type}")

        if config.output_type:
            # Use directly from config - already in correct format
            cmd.extend(["--output-tensor-type", config.output_type.lower()])
            logger.info(f"⚙️ Output tensor type: {config.output_type}")

        # Note: Per-layer tensor overrides could be added here if needed in future.
        # For now, embedding and output overrides handle the L/XL variants.

        # Get base quantisation type
        base_quant = self._get_base_quantisation_type(config.name)

        # Add input, output, and base quantisation type
        cmd.extend([str(input_path), str(output_path), base_quant])
        return cmd
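
    # Illustrative example of the kind of command _build_quantisation_command
    # produces (paths and tensor types here are hypothetical, for a
    # Q4_K_L-style variant):
    #   llama-quantize --imatrix ./model/imatrix.dat \
    #       --token-embedding-type q8_0 --output-tensor-type q8_0 \
    #       ./model/model-f16.gguf ./model/model-Q4_K_L.gguf Q4_K_M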

    def _get_base_quantisation_type(self, config_name: str) -> str:
        """Get base quantisation type for a config.

        Maps custom variants (Q3_K_L, Q3_K_XL) to their base types (Q3_K_M).

        Returns:
            Base quantisation type string.
        """
        # Mapping of custom variants to base types
        variant_mapping = {
            "Q3_K_L": "Q3_K_M",
            "Q3_K_XL": "Q3_K_M",
            "Q4_K_L": "Q4_K_M",
            "Q4_K_XL": "Q4_K_M",
            "Q5_K_L": "Q5_K_M",
            "Q5_K_XL": "Q5_K_M",
            "Q6_K_L": "Q6_K",
            "Q6_K_XL": "Q6_K",
        }
        return variant_mapping.get(config_name, config_name)
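
    # For example, a "Q4_K_L" config is quantised with the base "Q4_K_M" type;
    # names not present in the mapping (e.g. "Q4_K_M" itself) pass through
    # unchanged.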

    def _execute_command(self, cmd: list[str]) -> bool:
        """Execute command with real-time output streaming.

        Returns:
            True if successful, False otherwise.
        """
        logger.info(f"💻 Running: {' '.join(cmd)}")
        logger.info("⏳ Quantisation in progress... (this may take several minutes)")

        # Set LD_LIBRARY_PATH so the binary can find its shared libraries
        env = os.environ.copy()
        if platform.system() != "Windows":
            lib_path = str(self.binary_manager.BINARY_DIR)
            if "LD_LIBRARY_PATH" in env:
                env["LD_LIBRARY_PATH"] = f"{lib_path}:{env['LD_LIBRARY_PATH']}"
            else:
                env["LD_LIBRARY_PATH"] = lib_path

        # Track output for architecture detection
        output_lines = []
        architecture_error = False

        try:
            # Merge stderr into stdout and line-buffer so progress streams as it arrives
            process = subprocess.Popen(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                universal_newlines=True,
                bufsize=1,
                env=env,
            )

            # Stream output
            while True:
                if process.stdout is not None:
                    output = process.stdout.readline()
                else:
                    break
                if not output and process.poll() is not None:
                    break
                if output:
                    output_stripped = output.strip()
                    logger.info(f"📊 {output_stripped}")
                    output_lines.append(output_stripped)

                    # Check for architecture-related errors
                    if any(
                        phrase in output_stripped.lower()
                        for phrase in [
                            "unsupported architecture",
                            "unknown architecture",
                            "architecture not supported",
                            "model architecture",
                            "llama_model_load: error loading model",
                        ]
                    ):
                        architecture_error = True

            return_code = process.poll()
            if return_code == 0:
                logger.info("✅ Quantisation successful!")
                return True

            # Check if this was an architecture error
            if architecture_error or return_code == 1:
                # Look for architecture info in recent output
                for line in output_lines[-10:]:  # Check last 10 lines
                    if "architecture" in line.lower():
                        logger.error("❌ Architecture not supported by llama.cpp")
                        logger.error("   The model cannot be quantised with the current llama.cpp build,")
                        logger.error("   but the F16 GGUF file can still be used for inference where supported.")
                        # Store this for the orchestrator to detect
                        self.last_error = "unsupported_architecture"
                        return False

            logger.error(f"❌ Quantisation failed with return code {return_code}")
        except Exception as e:
            logger.error(f"❌ Quantisation failed with exception: {e}")
            return False
        else:
            return False


class IMatrixHandler:
    """Handles importance matrix file management.

    Manages detection and use of existing importance matrix files for
    quantisation guidance.
    """

    def __init__(self) -> None:
        """Initialise IMatrixHandler."""
        self.fs = FilesystemService()

    def find_imatrix(self, model_dir: Path) -> Path | None:
        """Find existing imatrix file in model directory.

        Returns:
            Path to imatrix file if found, None otherwise.
        """
        imatrix_path = model_dir / "imatrix.dat"

        if imatrix_path.exists():
            file_size = self.fs.get_file_size(imatrix_path)
            logger.info(f"Found existing imatrix: {imatrix_path.name} ({file_size})")
            return imatrix_path

        return None

    def prompt_for_user_imatrix(self, model_dir: Path) -> Path | None:
        """Prompt user for existing imatrix file.

        Returns:
            Path to user-provided imatrix, or None if not available.
        """
        imatrix_path = model_dir / "imatrix.dat"

        logger.info(f"Model directory: {model_dir}")
        logger.info(f"Looking for imatrix file at: {imatrix_path}")
        logger.info(
            "Tip: You can download pre-computed imatrix files from Bartowski's repositories!"
        )
        logger.info(
            "   Example: https://huggingface.co/bartowski/MODEL-NAME-GGUF/resolve/main/MODEL-NAME.imatrix"
        )

        response = (
            input("\n❓ Do you have an imatrix file to place in the model directory? (y/N): ")
            .strip()
            .lower()
        )
        if response != "y":
            return None

        logger.info(f"Please place your imatrix.dat file in: {model_dir}")
        input("⏳ Press Enter when you've placed the imatrix.dat file (or Ctrl+C to cancel)...")

        if imatrix_path.exists():
            file_size = self.fs.get_file_size(imatrix_path)
            logger.info(f"Found imatrix file! ({file_size})")
            return imatrix_path

        logger.warning("No imatrix.dat file found - continuing without imatrix")
        return None
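

# Illustrative usage sketch (not part of the original module). The surrounding
# orchestration and the file paths shown here are assumptions; only
# QuantisationExecutor, IMatrixHandler and their methods above are real.
#
#     from pathlib import Path
#
#     model_dir = Path("./models/example-model")   # hypothetical location
#     config = ...                                 # a QuantisationConfig, e.g. for Q4_K_L
#
#     executor = QuantisationExecutor()
#     imatrix = IMatrixHandler().find_imatrix(model_dir)
#     ok = executor.execute_quantisation(
#         input_path=model_dir / "model-f16.gguf",
#         output_path=model_dir / "model-Q4_K_L.gguf",
#         config=config,
#         imatrix_path=imatrix,
#     )
#     if not ok and executor.last_error == "unsupported_architecture":
#         ...  # e.g. fall back to shipping the F16 GGUF without quantisation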