"""Quantisation operations service.
Provides modular quantisation engine, model management, and upload capabilities
for GGUF model processing. Consolidates quantisation logic from various tools
into reusable components following SOLID principles.
"""

from __future__ import annotations

import shutil
import subprocess
from typing import TYPE_CHECKING

from helpers.logger import logger
from helpers.models.quantisation import (
    ModelSource,
    QuantisationContext,
    QuantisationResult,
    QuantisationType,
)
from helpers.services.filesystem import FilesystemService

if TYPE_CHECKING:
    from pathlib import Path

    from helpers.models.quantisation import LlamaCppEnvironment
    from helpers.services.llama_cpp import EnvironmentManager


class QuantisationEngine:
    """Handles the actual quantisation process with configurable methods.

    Provides flexible quantisation execution supporting multiple tensor
    precision configurations, importance matrices, and fallback strategies.
    Encapsulates llama-quantize binary interactions with real-time output.
    """

    def __init__(self) -> None:
        """Initialise quantisation engine."""
        self.fs = FilesystemService()

    def quantise(self, context: QuantisationContext) -> QuantisationResult:
        """Perform quantisation using the specified configuration.

        Executes quantisation with primary and fallback methods, handling
        tensor-specific precision overrides and importance matrix guidance.

        Returns:
            QuantisationResult with success status and file information.
        """
        logger.info(
            f"⚙️ Creating {context.config.name} quantisation ({context.config.description})..."
        )
        output_path = context.get_output_path()

        logger.info(f"🎯 Attempting {context.config.name} quantisation...")
        logger.info(f"📝 Source: {context.f16_model_path}")
        logger.info(f"📝 Target: {output_path}")

        # Try primary method
        if self._try_quantisation_method(
            context, output_path, context.config.tensor_types, "method 1"
        ):
            return self._create_success_result(context.config.name, output_path, "method 1")

        # Try fallback methods
        for i, fallback_method in enumerate(context.config.fallback_methods, 2):
            method_name = f"method {i}"
            if self._try_quantisation_method(context, output_path, fallback_method, method_name):
                return self._create_success_result(context.config.name, output_path, method_name)

        logger.error(f"All {context.config.name} quantisation methods failed")
        return QuantisationResult(
            quantisation_type=QuantisationType(context.config.name),
            success=False,
            error_message="All quantisation methods failed",
        )

    def _try_quantisation_method(
        self,
        context: QuantisationContext,
        output_path: Path,
        tensor_config: dict[str, str],
        method_name: str,
    ) -> bool:
        """Try a specific quantisation method with real-time output.

        Builds and executes the llama-quantize command with appropriate
        parameters, streaming output for progress monitoring.

        Returns:
            True if quantisation successful, False otherwise.
        """
        logger.info(f"🔍 Trying {method_name}...")
        cmd = self._build_quantisation_command(context, output_path, tensor_config)
        return self._execute_quantisation_command(cmd, method_name)

    def _build_quantisation_command(
        self, context: QuantisationContext, output_path: Path, tensor_config: dict[str, str]
    ) -> list[str]:
        """Build quantisation command with all required parameters.

        Returns:
            List of command arguments.
        """
        cmd = [str(context.llama_env.quantise_binary)]

        # Add imatrix if available
        if context.imatrix_path and context.imatrix_path.exists():
            cmd.extend(["--imatrix", str(context.imatrix_path)])
            logger.info(f"🧮 Using imatrix: {context.imatrix_path.name}")

        # Add tensor type arguments
        self._add_tensor_type_arguments(cmd, tensor_config)

        cmd.extend([str(context.f16_model_path), str(output_path), context.base_quant])
        return cmd
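
    # For illustration, a command assembled above might look like the following
    # (binary path, tensor overrides, and base type are hypothetical):
    #
    #     llama-quantize --imatrix imatrix.dat \
    #         --output-tensor-type Q8_0 --tensor-type attn_v=Q6_K \
    #         model-f16.gguf model-Q4_K_M.gguf Q4_K_M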

    def _add_tensor_type_arguments(self, cmd: list[str], tensor_config: dict[str, str]) -> None:
        """Add tensor type arguments to command."""
        if not tensor_config:
            return

        for tensor_name, quant_type in tensor_config.items():
            if tensor_name.startswith(("token-embedding-type", "output-tensor-type")):
                cmd.extend([f"--{tensor_name}", quant_type])
            else:
                cmd.extend(["--tensor-type", f"{tensor_name}={quant_type}"])

    def _execute_quantisation_command(self, cmd: list[str], method_name: str) -> bool:
        """Execute quantisation command with real-time output.

        Returns:
            True if quantisation successful, False otherwise.
        """
        logger.info(f"💻 Running: {' '.join(cmd)}")
        logger.info("⏳ Quantisation in progress... (this may take several minutes)")

        try:
            process = subprocess.Popen(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                universal_newlines=True,
                bufsize=1,
            )
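            # Text mode (universal_newlines) with bufsize=1 gives line-buffered
            # output, so progress lines can be logged as they arrive.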
            self._stream_quantisation_output(process)

            # Wait for the process to finish so the exit code is definitive
            # (poll() can return None if the process has not yet terminated)
            return_code = process.wait()
            if return_code == 0:
                logger.info(f"{method_name} quantisation successful!")
                return True
        except Exception as e:
            logger.info(f"{method_name} failed with exception: {e}")
            return False
        else:
            logger.info(f"{method_name} failed with return code {return_code}")
            return False

    def _stream_quantisation_output(self, process: subprocess.Popen) -> None:
        """Stream quantisation output in real-time."""
        if process.stdout is None:
            return
        while True:
            output = process.stdout.readline()
            if not output and process.poll() is not None:
                break
            if output:
                logger.info(f"📊 {output.strip()}")

    def _create_success_result(
        self, quant_type: str, output_path: Path, method_used: str
    ) -> QuantisationResult:
        """Create successful quantisation result with file metadata.

        Returns:
            QuantisationResult with file path and size information.
        """
        file_size = self.fs.get_file_size(output_path)
        return QuantisationResult(
            quantisation_type=QuantisationType(quant_type),
            success=True,
            file_path=output_path,
            file_size=file_size,
            method_used=method_used,
        )


class ModelManager:
    """Handles model downloading and preparation for quantisation.

    Manages both GGUF repository downloads and HuggingFace model conversions,
    providing a unified interface for model acquisition and preparation.
    """

    def __init__(self, models_dir: Path, environment_manager: EnvironmentManager) -> None:
        """Initialise model manager with storage and environment configuration.

        Sets up model storage directory and links to environment manager for
        conversion script access and llama.cpp tool discovery.
        """
        self.models_dir = models_dir
        self.environment_manager = environment_manager
        self.fs = FilesystemService()

    def prepare_model(self, model_source: ModelSource, llama_env: LlamaCppEnvironment) -> Path:
        """Prepare model for quantisation and return F16 model path.

        Handles both GGUF repository downloads and regular HuggingFace model
        conversion workflows with automatic format detection.

        Returns:
            Path to F16 GGUF model ready for quantisation.
        """
        model_dir = self.models_dir / model_source.model_name

        if model_source.is_gguf_repo:
            return self._handle_gguf_repo(model_source, model_dir)
        return self._handle_regular_repo(model_source, model_dir, llama_env)

    def _handle_gguf_repo(self, model_source: ModelSource, model_dir: Path) -> Path:
        """Handle GGUF repository download with pattern matching.

        Downloads GGUF files matching specified patterns, prioritising
        multi-part files and F16 variants.

        Returns:
            Path to downloaded or existing GGUF file.
        """
        logger.info(f"⬇️ Downloading GGUF file from repository: {model_source.source_model}")
        logger.info(f"🔍 Looking for file pattern: *{model_source.gguf_file_pattern}*")

        f16_model = model_dir / f"{model_source.model_name}-f16.gguf"
        if f16_model.exists():
            logger.info(f"✅ Found existing F16 file: {f16_model.name}")
            return f16_model

        # Check for existing GGUF files
        model_dir.mkdir(parents=True, exist_ok=True)
        existing_gguf = self.fs.find_gguf_files(model_dir)
        if existing_gguf:
            logger.info(f"✅ Found existing GGUF file: {existing_gguf[0].name}")
            return existing_gguf[0]

        # Download with patterns
        downloaded_file = self._download_gguf_with_patterns(
            model_source.source_model, model_source.gguf_file_pattern, model_dir
        )
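        # Multi-part GGUFs follow an "-NNNNN-of-NNNNN" shard naming scheme;
        # llama.cpp only needs the first shard and locates the rest itself.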
        if downloaded_file:
            # Handle multi-part files
            if "00001-of-" in downloaded_file.name:
                return downloaded_file
            if "-00002-of-" in downloaded_file.name or "-00003-of-" in downloaded_file.name:
                base_name = downloaded_file.name.replace("-00002-of-", "-00001-of-").replace(
                    "-00003-of-", "-00001-of-"
                )
                first_part = downloaded_file.parent / base_name
                if first_part.exists():
                    logger.info(f"🔄 Using first part: {first_part.name}")
                    return first_part

            # Rename single file to standard name
            downloaded_file.rename(f16_model)
            return f16_model

        # Fallback to regular conversion
        logger.info("💡 Falling back to downloading full repository and converting...")
        return self._handle_regular_repo(
            ModelSource(**{**model_source.dict(), "is_gguf_repo": False}),
            model_dir,
            None,
        )

    def _download_gguf_with_patterns(
        self, source_model: str, pattern: str | None, model_dir: Path
    ) -> Path | None:
        """Download GGUF file using various pattern strategies.

        Tries multiple pattern variations to find and download appropriate
        GGUF files, handling timeouts and temporary directories.

        Returns:
            Path to downloaded file, or None if all patterns fail.
        """
        if pattern:
            patterns = [
                f"*{pattern}*",
                f"*{pattern.lower()}*",
                f"*{pattern.upper()}*",
                "*f16*",
                "*F16*",
                "*fp16*",
            ]
        else:
            patterns = ["*f16*", "*F16*", "*fp16*"]

        temp_dir = model_dir / "gguf_temp"

        for search_pattern in patterns:
            logger.info(f"🔍 Trying pattern: {search_pattern}")
            temp_dir.mkdir(exist_ok=True)

            try:
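                # The coreutils `timeout` wrapper caps each download attempt at
                # 300 seconds, so a stalled pattern cannot block the whole loop.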
                subprocess.run(
                    [
                        "timeout",
                        "300",
                        "huggingface-cli",
                        "download",
                        source_model,
                        "--include",
                        search_pattern,
                        "--local-dir",
                        str(temp_dir),
                    ],
                    check=True,
                    capture_output=True,
                )

                # Find downloaded GGUF files
                gguf_files = self.fs.find_gguf_files(temp_dir, pattern)
                if gguf_files:
                    found_file = gguf_files[0]
                    logger.info(f"✅ Found GGUF file: {found_file.name}")

                    # Move to parent directory
                    final_path = model_dir / found_file.name
                    shutil.move(str(found_file), str(final_path))
                    shutil.rmtree(temp_dir)
                    return final_path
            except subprocess.CalledProcessError:
                logger.info(f"⚠️ Pattern {search_pattern} failed or timed out")
                continue
            finally:
                if temp_dir.exists():
                    shutil.rmtree(temp_dir, ignore_errors=True)

        return None

    def _handle_regular_repo(
        self,
        model_source: ModelSource,
        model_dir: Path,
        llama_env: LlamaCppEnvironment | None,
    ) -> Path:
        """Handle regular HuggingFace repository conversion.

        Downloads full model repository and converts to F16 GGUF format
        using llama.cpp conversion scripts.

        Returns:
            Path to converted F16 GGUF model.
        """
        logger.info(f"⬇️ Downloading source model: {model_source.source_model}")

        if not model_dir.exists():
            subprocess.run(
                [
                    "huggingface-cli",
                    "download",
                    model_source.source_model,
                    "--local-dir",
                    str(model_dir),
                ],
                check=True,
            )
        else:
            logger.info("✅ Model already downloaded")
logger.info("🔄 Converting to GGUF F16 format...")
f16_model = model_dir / f"{model_source.model_name}-f16.gguf"
if not f16_model.exists():
if not llama_env:
llama_env = self.environment_manager.setup()
# Ensure conversion script is available
if llama_env.use_repo or not self.environment_manager.llama_cpp_dir.exists():
logger.info("Getting conversion script from llama.cpp repository...")
llama_env = self.environment_manager.setup_repository()
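
            # convert_script may be a multi-word command (e.g. an interpreter
            # plus a script path), hence the split into argv components.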
            subprocess.run(
                [
                    *llama_env.convert_script.split(),
                    str(model_dir),
                    "--outtype",
                    "f16",
                    "--outfile",
                    str(f16_model),
                ],
                check=True,
            )
        else:
            logger.info("✅ F16 model already exists")

        return f16_model


class HuggingFaceUploader:
    """Handles uploading models and documentation to HuggingFace.

    Provides methods for repository creation, file uploads, and README
    updates with proper error handling and retry logic.
    """

    @staticmethod
    def get_username() -> str:
        """Get authenticated HuggingFace username.

        Returns:
            HuggingFace username from CLI authentication.

        Raises:
            RuntimeError: If not authenticated.
        """
        try:
            result = subprocess.run(
                ["huggingface-cli", "whoami"],
                capture_output=True,
                text=True,
                check=True,
            )
            return result.stdout.strip()
        except (subprocess.CalledProcessError, FileNotFoundError) as err:
            msg = "Please log in to HuggingFace first: huggingface-cli login"
            raise RuntimeError(msg) from err

    def upload_readme(self, output_repo: str, readme_path: Path) -> None:
        """Upload or update README file to repository.

        Creates repository if needed, handles existing repository updates.
        """
        logger.info("Uploading README...")
        try:
            subprocess.run(
                [
                    "huggingface-cli",
                    "upload",
                    output_repo,
                    str(readme_path),
                    "README.md",
                    "--create",
                ],
                check=True,
                capture_output=True,
            )
            logger.info("README uploaded")
        except subprocess.CalledProcessError:
            # Repository exists, update without --create
            subprocess.run(
                [
                    "huggingface-cli",
                    "upload",
                    output_repo,
                    str(readme_path),
                    "README.md",
                ],
                check=True,
            )
            logger.info("README updated")

    def upload_model_file(self, output_repo: str, model_path: Path) -> None:
        """Upload model file to repository.

        Uploads GGUF model file to specified repository path.
        """
        logger.info(f"Uploading {model_path.name}...")
        subprocess.run(
            [
                "huggingface-cli",
                "upload",
                output_repo,
                str(model_path),
                model_path.name,
            ],
            check=True,
        )
        logger.info(f"{model_path.name} uploaded")