"""Direct llama.cpp binary execution service.
|
|
|
|
Provides direct execution of llama.cpp quantisation binary with proper
|
|
tensor-specific override support for L and XL variants.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import os
|
|
import platform
|
|
import subprocess
|
|
from pathlib import Path
|
|
from typing import TYPE_CHECKING
|
|
|
|
from helpers.logger import logger
|
|
from helpers.services.binary_manager import BinaryManager
|
|
from helpers.services.filesystem import FilesystemService
|
|
|
|
if TYPE_CHECKING:
|
|
from helpers.models.quantisation import QuantisationConfig
|
|
|
|
|
|

class QuantisationExecutor:
    """Executes llama.cpp quantisation with tensor overrides.

    Provides direct binary execution with proper command-line flags for
    tensor-specific overrides, supporting Bartowski-style L and XL variants.
    """

    def __init__(self) -> None:
        """Initialise quantisation executor."""
        self.fs = FilesystemService()
        self.binary_manager = BinaryManager()
        self.quantise_binary = self._get_quantise_binary()
        self.last_error: str | None = None  # Track last error type

    def _get_quantise_binary(self) -> Path | None:
        """Get llama-quantize binary, downloading if necessary.

        Returns:
            Path to binary if found, None otherwise.
        """
        # First check local directory for manual placement
        local_binary = Path("./llama-quantize")
        if local_binary.exists():
            logger.info(f"Using local llama-quantize binary: {local_binary}")
            return local_binary

        # Download from GitHub releases
        binary_path = self.binary_manager.get_quantise_binary()
        if binary_path and self.binary_manager.check_binary_works(binary_path):
            logger.info(f"Using llama-quantize binary: {binary_path}")
            return binary_path

        logger.error("Failed to obtain llama-quantize binary")
        logger.info(
            "You can manually download it from: https://github.com/ggml-org/llama.cpp/releases"
        )
        return None

    def execute_quantisation(
        self,
        input_path: Path,
        output_path: Path,
        config: QuantisationConfig,
        imatrix_path: Path | None = None,
    ) -> bool:
        """Execute quantisation using the llama.cpp binary.

        Builds and executes the llama-quantize command with proper tensor
        override flags for L and XL variants.

        Returns:
            True if quantisation successful, False otherwise.
        """
        if not self.quantise_binary:
            logger.error("llama-quantize binary not available")
            return False

        # Build command
        cmd = self._build_quantisation_command(input_path, output_path, config, imatrix_path)

        # Execute with real-time output
        return self._execute_command(cmd)
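
    # Illustrative usage sketch (not part of the original module): the paths and
    # the config object below are placeholders for whatever the calling code
    # supplies.
    #
    #   executor = QuantisationExecutor()
    #   ok = executor.execute_quantisation(
    #       input_path=Path("model-f16.gguf"),
    #       output_path=Path("model-Q4_K_L.gguf"),
    #       config=q4_k_l_config,  # a QuantisationConfig instance
    #       imatrix_path=Path("imatrix.dat"),
    #   )
    #   if not ok and executor.last_error == "unsupported_architecture":
    #       # fall back to shipping the F16 GGUF only
    #       ...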

    def _build_quantisation_command(
        self,
        input_path: Path,
        output_path: Path,
        config: QuantisationConfig,
        imatrix_path: Path | None,
    ) -> list[str]:
        """Build llama-quantize command with tensor overrides.

        Returns:
            Command arguments as list.
        """
        cmd = [str(self.quantise_binary)]

        # Add imatrix if available
        if imatrix_path:
            cmd.extend(["--imatrix", str(imatrix_path)])
            if imatrix_path.exists():
                logger.info(f"🧮 Using imatrix: {imatrix_path.name}")

        # Add tensor-specific overrides for L and XL variants
        if config.embedding_type:
            # Use directly from config - already in correct format
            cmd.extend(["--token-embedding-type", config.embedding_type.lower()])
            logger.info(f"⚙️ Token embedding type: {config.embedding_type}")

        if config.output_type:
            # Use directly from config - already in correct format
            cmd.extend(["--output-tensor-type", config.output_type.lower()])
            logger.info(f"⚙️ Output tensor type: {config.output_type}")

        # Note: Per-layer tensor overrides could be added here if needed in future
        # For now, embedding and output overrides handle the L/XL variants

        # Get base quantisation type
        base_quant = self._get_base_quantisation_type(config.name)

        # Add input, output, and base quantisation type
        cmd.extend([str(input_path), str(output_path), base_quant])

        return cmd
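
    # For illustration only: a command built for a hypothetical Q4_K_L config
    # with q8_0 embedding/output overrides (the q8_0 values are an assumption
    # about the config, not something defined in this module) would look
    # roughly like:
    #
    #   ["./llama-quantize",
    #    "--imatrix", "imatrix.dat",
    #    "--token-embedding-type", "q8_0",
    #    "--output-tensor-type", "q8_0",
    #    "model-f16.gguf", "model-Q4_K_L.gguf", "Q4_K_M"]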

    def _get_base_quantisation_type(self, config_name: str) -> str:
        """Get base quantisation type for a config.

        Maps custom variants (Q3_K_L, Q3_K_XL) to their base types (Q3_K_M).

        Returns:
            Base quantisation type string.
        """
        # Mapping of custom variants to base types
        variant_mapping = {
            "Q3_K_L": "Q3_K_M",
            "Q3_K_XL": "Q3_K_M",
            "Q4_K_L": "Q4_K_M",
            "Q4_K_XL": "Q4_K_M",
            "Q5_K_L": "Q5_K_M",
            "Q5_K_XL": "Q5_K_M",
            "Q6_K_L": "Q6_K",
            "Q6_K_XL": "Q6_K",
        }

        return variant_mapping.get(config_name, config_name)
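
    # Examples: "Q4_K_XL" maps to "Q4_K_M", while names without a custom variant
    # entry (e.g. "Q6_K" or "Q8_0") pass through unchanged.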

    def _execute_command(self, cmd: list[str]) -> bool:
        """Execute command with real-time output streaming.

        Returns:
            True if successful, False otherwise.
        """
        logger.info(f"💻 Running: {' '.join(cmd)}")
        logger.info("⏳ Quantisation in progress... (this may take several minutes)")

        # Set LD_LIBRARY_PATH so the binary can find its bundled shared libraries
        env = os.environ.copy()
        if platform.system() != "Windows":
            lib_path = str(self.binary_manager.BINARY_DIR)
            if "LD_LIBRARY_PATH" in env:
                env["LD_LIBRARY_PATH"] = f"{lib_path}:{env['LD_LIBRARY_PATH']}"
            else:
                env["LD_LIBRARY_PATH"] = lib_path

        # Track output for architecture detection
        output_lines = []
        architecture_error = False

        try:
            process = subprocess.Popen(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                universal_newlines=True,
                bufsize=1,
                env=env,
            )

            # Stream output
            while True:
                if process.stdout is not None:
                    output = process.stdout.readline()
                else:
                    break
                if not output and process.poll() is not None:
                    break
                if output:
                    output_stripped = output.strip()
                    logger.info(f"📊 {output_stripped}")
                    output_lines.append(output_stripped)

                    # Check for architecture-related errors
                    if any(
                        phrase in output_stripped.lower()
                        for phrase in [
                            "unsupported architecture",
                            "unknown architecture",
                            "architecture not supported",
                            "model architecture",
                            "llama_model_load: error loading model",
                        ]
                    ):
                        architecture_error = True

            return_code = process.poll()
            if return_code == 0:
                logger.info("✅ Quantisation successful!")
                return True

            # Check if this was an architecture error
            if architecture_error or return_code == 1:
                # Look for architecture info in recent output
                for line in output_lines[-10:]:  # Check last 10 lines
                    if "architecture" in line.lower():
                        logger.error("❌ Architecture not supported by llama.cpp,")
                        logger.error("   so it cannot be quantised with the current llama.cpp build,")
                        logger.error("   but the F16 GGUF file can still be used for inference if supported")
                        # Store this for the orchestrator to detect
                        self.last_error = "unsupported_architecture"
                        return False

            logger.error(f"❌ Quantisation failed with return code {return_code}")
        except Exception as e:
            logger.error(f"❌ Quantisation failed with exception: {e}")
            return False
        else:
            return False


class IMatrixHandler:
    """Handles importance matrix file management.

    Manages detection and use of existing importance matrix files for
    quantisation guidance.
    """
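
    # Background note (not from the original file): importance matrix files are
    # typically either generated with llama.cpp's llama-imatrix tool against a
    # calibration dataset, or downloaded pre-computed (e.g. from Bartowski's
    # Hugging Face repositories, as suggested in prompt_for_user_imatrix below).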

    def __init__(self) -> None:
        """Initialise IMatrixHandler."""
        self.fs = FilesystemService()

    def find_imatrix(self, model_dir: Path) -> Path | None:
        """Find existing imatrix file in model directory.

        Returns:
            Path to imatrix file if found, None otherwise.
        """
        imatrix_path = model_dir / "imatrix.dat"

        if imatrix_path.exists():
            file_size = self.fs.get_file_size(imatrix_path)
            logger.info(f"Found existing imatrix: {imatrix_path.name} ({file_size})")
            return imatrix_path

        return None
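
    # find_imatrix() above is the non-interactive check; prompt_for_user_imatrix()
    # below is the interactive fallback that asks the user to drop an imatrix.dat
    # into the model directory.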

    def prompt_for_user_imatrix(self, model_dir: Path) -> Path | None:
        """Prompt user for existing imatrix file.

        Returns:
            Path to user-provided imatrix, or None if not available.
        """
        imatrix_path = model_dir / "imatrix.dat"

        logger.info(f"Model directory: {model_dir}")
        logger.info(f"Looking for imatrix file at: {imatrix_path}")
        logger.info(
            "Tip: You can download pre-computed imatrix files from Bartowski's repositories!"
        )
        logger.info(
            "   Example: https://huggingface.co/bartowski/MODEL-NAME-GGUF/resolve/main/MODEL-NAME.imatrix"
        )

        response = (
            input("\n❓ Do you have an imatrix file to place in the model directory? (y/N): ")
            .strip()
            .lower()
        )

        if response != "y":
            return None

        logger.info(f"Please place your imatrix.dat file in: {model_dir}")
        input("⏳ Press Enter when you've placed the imatrix.dat file (or Ctrl+C to cancel)...")

        if imatrix_path.exists():
            file_size = self.fs.get_file_size(imatrix_path)
            logger.info(f"Found imatrix file! ({file_size})")
            return imatrix_path

        logger.warning("No imatrix.dat file found - continuing without imatrix")
        return None