"""Direct llama.cpp quantisation execution.
|
|
|
|
Provides direct execution of llama.cpp quantisation binary with proper
|
|
tensor-specific override support for L and XL variants.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import os
|
|
import platform
|
|
import subprocess
|
|
from pathlib import Path
|
|
from typing import TYPE_CHECKING
|
|
|
|
from helpers.filesystem import FilesystemService
|
|
from helpers.llama_cpp.binary_manager import BinaryManager
|
|
from helpers.logger import logger
|
|
|
|
if TYPE_CHECKING:
|
|
from helpers.models.quantisation import QuantisationConfig
|
|
|
|
|
|
class QuantisationExecutor:
    """Executes llama.cpp quantisation with tensor overrides.

    Provides direct binary execution with proper command-line flags for
    tensor-specific overrides, supporting Bartowski-style L and XL variants.
    """

    def __init__(self) -> None:
        """Initialise quantisation executor."""
        self.fs = FilesystemService()
        self.binary_manager = BinaryManager()
        self.quantise_binary = self._get_quantise_binary()
        self.last_error: str | None = None  # Track last error type

    def _get_quantise_binary(self) -> Path | None:
        """Get llama-quantize binary, downloading if necessary.

        Returns:
            Path to binary if found, None otherwise.
        """
        # First check local directory for manual placement
        local_binary = Path("./llama-quantize")
        if local_binary.exists():
            logger.info(f"Using local llama-quantize binary: {local_binary}")
            return local_binary

        # Download from GitHub releases
        binary_path = self.binary_manager.get_quantise_binary()
        if binary_path and self.binary_manager.check_binary_works(binary_path):
            logger.info(f"Using llama-quantize binary: {binary_path}")
            return binary_path

        logger.error("Failed to obtain llama-quantize binary")
        logger.info(
            "You can manually download it from: https://github.com/ggml-org/llama.cpp/releases"
        )
        return None

    def execute_quantisation(
        self,
        input_path: Path,
        output_path: Path,
        config: QuantisationConfig,
        imatrix_path: Path | None = None,
    ) -> bool:
        """Execute quantisation using llama.cpp binary.

        Builds and executes llama-quantize command with proper tensor override
        flags for L and XL variants.

        Returns:
            True if quantisation successful, False otherwise.
        """
        if not self.quantise_binary:
            logger.error("llama-quantize binary not available")
            return False

        # Build command
        cmd = self._build_quantisation_command(input_path, output_path, config, imatrix_path)

        # Execute with real-time output
        return self._execute_command(cmd)

    def _build_quantisation_command(
        self,
        input_path: Path,
        output_path: Path,
        config: QuantisationConfig,
        imatrix_path: Path | None,
    ) -> list[str]:
        """Build llama-quantize command with tensor overrides.

        Returns:
            Command arguments as list.
        """
        cmd = [str(self.quantise_binary)]

        # Add imatrix if available
        if imatrix_path:
            cmd.extend(["--imatrix", str(imatrix_path)])

        # Add tensor overrides for L and XL variants
        if config.output_type:
            cmd.extend(["--output-tensor-type", config.output_type])
        if config.embedding_type:
            cmd.extend(["--token-embedding-type", config.embedding_type])

        # Add input, output, and quantisation type
        cmd.extend([str(input_path), str(output_path), config.base_type])

        return cmd
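
    # Illustrative example of a command this method can produce for an L-style
    # variant (the paths and tensor types below are hypothetical, not values
    # defined in this module):
    #
    #   ./llama-quantize --imatrix imatrix.dat \
    #       --output-tensor-type Q8_0 --token-embedding-type Q8_0 \
    #       model-f16.gguf model-Q4_K_L.gguf Q4_K_M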

    def _setup_environment(self) -> dict[str, str]:
        """Set up environment variables for quantisation command.

        Returns:
            Environment dictionary with necessary library paths.
        """
        env = os.environ.copy()
        if platform.system() != "Windows":
            lib_path = str(self.binary_manager.BINARY_DIR)
            if "LD_LIBRARY_PATH" in env:
                env["LD_LIBRARY_PATH"] = f"{lib_path}:{env['LD_LIBRARY_PATH']}"
            else:
                env["LD_LIBRARY_PATH"] = lib_path
        return env
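
    # On non-Windows systems the binary directory is prepended to LD_LIBRARY_PATH,
    # presumably so a downloaded llama-quantize can locate the shared libraries
    # shipped alongside it, e.g. (illustrative value only):
    #
    #   LD_LIBRARY_PATH=/path/to/binary_dir:$LD_LIBRARY_PATH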

    def _process_output_stream(self, process: subprocess.Popen) -> tuple[list[str], bool]:
        """Process subprocess output stream and detect errors.

        Returns:
            Tuple of (output_lines, architecture_error_detected).
        """
        output_lines = []
        architecture_error = False

        if process.stdout:
            for line in iter(process.stdout.readline, ""):
                if line:
                    cleaned_line = line.rstrip()
                    output_lines.append(cleaned_line)
                    logger.info(f" {cleaned_line}")

                    # Check for architecture errors
                    if any(
                        error_text in cleaned_line.lower()
                        for error_text in [
                            "unknown model architecture",
                            "unsupported architecture",
                            "unknown architecture",
                            "architecture not supported",
                            "model architecture",
                            "llama_model_load: error loading model",
                        ]
                    ):
                        architecture_error = True

        return output_lines, architecture_error

    def _handle_architecture_error(self, output_lines: list[str]) -> bool:
        """Handle architecture-related errors by checking output.

        Returns:
            True if architecture error was detected and handled.
        """
        # Look for architecture info in recent output
        for line in output_lines[-10:]:  # Check last 10 lines
            if "architecture" in line.lower():
                logger.error("❌ Architecture not supported by llama.cpp")
                logger.error("   Cannot be quantised with the current llama.cpp build, but the")
                logger.error("   F16 GGUF file can still be used for inference where supported")
                # Store this for the orchestrator to detect
                self.last_error = "unsupported_architecture"
                return True
        return False

    def _execute_command(self, cmd: list[str]) -> bool:
        """Execute command with real-time output streaming.

        Returns:
            True if successful, False otherwise.
        """
        try:
            logger.info(f"🔧 Executing: {' '.join(cmd)}")

            env = self._setup_environment()

            # Execute with real-time output
            process = subprocess.Popen(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                text=True,
                bufsize=1,
                env=env,
            )

            output_lines, architecture_error = self._process_output_stream(process)

            # Wait for the process to exit so the return code is always available
            return_code = process.wait()
            if return_code == 0:
                logger.info("✅ Quantisation successful!")
                return True

            # Check if this was an architecture error
            if (architecture_error or return_code == 1) and self._handle_architecture_error(
                output_lines
            ):
                return False

            logger.error(f"❌ Quantisation failed with return code {return_code}")

        except Exception as e:
            logger.error(f"❌ Quantisation failed with exception: {e}")

        return False
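

# Typical usage (illustrative sketch, not part of the original module; the paths
# and the config object below are hypothetical examples):
#
#     executor = QuantisationExecutor()
#     ok = executor.execute_quantisation(
#         input_path=Path("model-f16.gguf"),
#         output_path=Path("model-Q4_K_L.gguf"),
#         config=q4_k_l_config,  # a QuantisationConfig describing the variant
#         imatrix_path=Path("imatrix.dat"),
#     )
#     if not ok and executor.last_error == "unsupported_architecture":
#         ...  # caller may fall back to publishing the F16 GGUF only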