"""Quantisation operations service.
|
|
|
|
Provides modular quantisation engine, model management, and upload capabilities
|
|
for GGUF model processing. Consolidates quantisation logic from various tools
|
|
into reusable components following SOLID principles.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import shutil
|
|
import subprocess
|
|
from typing import TYPE_CHECKING
|
|
|
|
from helpers.logger import logger
|
|
from helpers.models.quantisation import (
|
|
ModelSource,
|
|
QuantisationContext,
|
|
QuantisationResult,
|
|
QuantisationType,
|
|
)
|
|
from helpers.services.filesystem import FilesystemService
|
|
|
|
if TYPE_CHECKING:
|
|
from pathlib import Path
|
|
|
|
from helpers.models.quantisation import LlamaCppEnvironment
|
|
from helpers.services.llama_cpp import EnvironmentManager
|
|
|
|
|
|
class QuantisationEngine:
|
|
"""Handles the actual quantisation process with configurable methods.
|
|
|
|
Provides flexible quantisation execution supporting multiple tensor
|
|
precision configurations, importance matrices, and fallback strategies.
|
|
Encapsulates llama-quantize binary interactions with real-time output.
|
|
"""
|
|
|
|
def __init__(self) -> None:
|
|
"""Initialise quantisation engine."""
|
|
self.fs = FilesystemService()
|
|
|
|
    def quantise(self, context: QuantisationContext) -> QuantisationResult:
        """Perform quantisation using the specified configuration.

        Executes quantisation with primary and fallback methods, handling
        tensor-specific precision overrides and importance matrix guidance.

        Returns:
            QuantisationResult with success status and file information.
        """
        logger.info(
            f"⚙️ Creating {context.config.name} quantisation ({context.config.description})..."
        )

        output_path = context.get_output_path()

        logger.info(f"🎯 Attempting {context.config.name} quantisation...")
        logger.info(f"📝 Source: {context.f16_model_path}")
        logger.info(f"📝 Target: {output_path}")

        # Try primary method
        if self._try_quantisation_method(
            context, output_path, context.config.tensor_types, "method 1"
        ):
            return self._create_success_result(context.config.name, output_path, "method 1")

        # Try fallback methods
        for i, fallback_method in enumerate(context.config.fallback_methods, 2):
            method_name = f"method {i}"
            if self._try_quantisation_method(context, output_path, fallback_method, method_name):
                return self._create_success_result(context.config.name, output_path, method_name)

        logger.error("All %s quantisation methods failed", context.config.name)
        return QuantisationResult(
            quantisation_type=QuantisationType(context.config.name),
            success=False,
            error_message="All quantisation methods failed",
        )
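
    # Usage sketch (hypothetical; assumes a populated QuantisationContext):
    #     engine = QuantisationEngine()
    #     result = engine.quantise(context)
    #     if result.success:
    #         logger.info(f"Wrote {result.file_path}")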

    def _try_quantisation_method(
        self,
        context: QuantisationContext,
        output_path: Path,
        tensor_config: dict[str, str],
        method_name: str,
    ) -> bool:
        """Try a specific quantisation method with real-time output.

        Builds and executes llama-quantize command with appropriate parameters,
        streaming output for progress monitoring.

        Returns:
            True if quantisation successful, False otherwise.
        """
        logger.info(f"🔍 Trying {method_name}...")

        cmd = self._build_quantisation_command(context, output_path, tensor_config)
        return self._execute_quantisation_command(cmd, method_name)

    def _build_quantisation_command(
        self, context: QuantisationContext, output_path: Path, tensor_config: dict[str, str]
    ) -> list[str]:
        """Build quantisation command with all required parameters.

        Returns:
            List of command arguments.
        """
        cmd = [str(context.llama_env.quantise_binary)]

        # Add imatrix if available
        if context.imatrix_path and context.imatrix_path.exists():
            cmd.extend(["--imatrix", str(context.imatrix_path)])
            logger.info(f"🧮 Using imatrix: {context.imatrix_path.name}")

        # Add tensor type arguments
        self._add_tensor_type_arguments(cmd, tensor_config)

        cmd.extend([str(context.f16_model_path), str(output_path), context.base_quant])
        return cmd
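
    # The assembled command resembles (illustrative paths and types only):
    #     llama-quantize --imatrix imatrix.dat --token-embedding-type q8_0 \
    #         model-f16.gguf model-Q4_K_M.gguf Q4_K_M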

    def _add_tensor_type_arguments(self, cmd: list[str], tensor_config: dict[str, str]) -> None:
        """Add tensor type arguments to command."""
        if not tensor_config:
            return

        for tensor_name, quant_type in tensor_config.items():
            if tensor_name.startswith(("token-embedding-type", "output-tensor-type")):
                cmd.extend([f"--{tensor_name}", quant_type])
            else:
                cmd.extend(["--tensor-type", f"{tensor_name}={quant_type}"])

    def _execute_quantisation_command(self, cmd: list[str], method_name: str) -> bool:
        """Execute quantisation command with real-time output.

        Returns:
            True if quantisation successful, False otherwise.
        """
        logger.info(f"💻 Running: {' '.join(cmd)}")
        logger.info("⏳ Quantisation in progress... (this may take several minutes)")

        try:
            process = subprocess.Popen(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                universal_newlines=True,
                bufsize=1,
            )

            self._stream_quantisation_output(process)

            # wait() rather than poll(): guarantees a final return code even if
            # the output stream closed before the process fully exited
            return_code = process.wait()
            if return_code == 0:
                logger.info(f"✅ {method_name} quantisation successful!")
                return True
        except Exception as e:
            logger.info(f"❌ {method_name} failed with exception: {e}")
            return False
        else:
            logger.info(f"❌ {method_name} failed with return code {return_code}")
            return False

    def _stream_quantisation_output(self, process: subprocess.Popen) -> None:
        """Stream quantisation output in real-time."""
        while True:
            if process.stdout is None:
                break
            output = process.stdout.readline()
            if not output and process.poll() is not None:
                break
            if output:
                logger.info(f"📊 {output.strip()}")

    def _create_success_result(
        self, quant_type: str, output_path: Path, method_used: str
    ) -> QuantisationResult:
        """Create successful quantisation result with file metadata.

        Returns:
            QuantisationResult with file path and size information.
        """
        file_size = self.fs.get_file_size(output_path)
        return QuantisationResult(
            quantisation_type=QuantisationType(quant_type),
            success=True,
            file_path=output_path,
            file_size=file_size,
            method_used=method_used,
        )
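
    # Illustrative result (field values are hypothetical):
    #     QuantisationResult(quantisation_type=QuantisationType("Q4_K_M"), success=True,
    #                        file_path=Path("model-Q4_K_M.gguf"), file_size="4.2G",
    #                        method_used="method 1")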


class ModelManager:
    """Handles model downloading and preparation for quantisation.

    Manages both GGUF repository downloads and HuggingFace model conversions,
    providing unified interface for model acquisition and preparation.
    """

    def __init__(self, models_dir: Path, environment_manager: EnvironmentManager) -> None:
        """Initialise model manager with storage and environment configuration.

        Sets up model storage directory and links to environment manager for
        conversion script access and llama.cpp tool discovery.
        """
        self.models_dir = models_dir
        self.environment_manager = environment_manager
        self.fs = FilesystemService()

    def prepare_model(self, model_source: ModelSource, llama_env: LlamaCppEnvironment) -> Path:
        """Prepare model for quantisation and return F16 model path.

        Handles both GGUF repository downloads and regular HuggingFace model
        conversion workflows with automatic format detection.

        Returns:
            Path to F16 GGUF model ready for quantisation.
        """
        model_dir = self.models_dir / model_source.model_name

        if model_source.is_gguf_repo:
            return self._handle_gguf_repo(model_source, model_dir)
        return self._handle_regular_repo(model_source, model_dir, llama_env)
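
    # Usage sketch (hypothetical paths):
    #     manager = ModelManager(Path("./models"), environment_manager)
    #     f16_path = manager.prepare_model(model_source, llama_env)
    #     # -> ./models/<model_name>/<model_name>-f16.gguf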

    def _handle_gguf_repo(self, model_source: ModelSource, model_dir: Path) -> Path:
        """Handle GGUF repository download with pattern matching.

        Downloads GGUF files matching specified patterns, prioritising
        multi-part files and F16 variants.

        Returns:
            Path to downloaded or existing GGUF file.
        """
        logger.info(f"⬇️ Downloading GGUF file from repository: {model_source.source_model}")
        logger.info(f"🔍 Looking for file pattern: *{model_source.gguf_file_pattern}*")

        f16_model = model_dir / f"{model_source.model_name}-f16.gguf"

        if f16_model.exists():
            logger.info(f"✅ Found existing F16 file: {f16_model.name}")
            return f16_model

        # Check for existing GGUF files
        model_dir.mkdir(parents=True, exist_ok=True)
        existing_gguf = self.fs.find_gguf_files(model_dir)

        if existing_gguf:
            logger.info(f"✅ Found existing GGUF file: {existing_gguf[0].name}")
            return existing_gguf[0]

        # Download with patterns
        downloaded_file = self._download_gguf_with_patterns(
            model_source.source_model, model_source.gguf_file_pattern, model_dir
        )

        if downloaded_file:
            # Handle multi-part files
            if "00001-of-" in downloaded_file.name:
                return downloaded_file
            if "-00002-of-" in downloaded_file.name or "-00003-of-" in downloaded_file.name:
                base_name = downloaded_file.name.replace("-00002-of-", "-00001-of-").replace(
                    "-00003-of-", "-00001-of-"
                )
                first_part = downloaded_file.parent / base_name
                if first_part.exists():
                    logger.info(f"🔄 Using first part: {first_part.name}")
                    return first_part

            # Rename single file to standard name
            downloaded_file.rename(f16_model)
            return f16_model

        # Fallback to regular conversion
        logger.info("💡 Falling back to downloading full repository and converting...")
        return self._handle_regular_repo(
            ModelSource(**{**model_source.dict(), "is_gguf_repo": False}),
            model_dir,
            None,
        )
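
    # Multi-part sketch: a downloaded shard such as
    #     model-f16-00002-of-00003.gguf
    # is mapped back to its first part,
    #     model-f16-00001-of-00003.gguf
    # because llama.cpp loads split GGUF models via the first shard.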

    def _download_gguf_with_patterns(
        self, source_model: str, pattern: str | None, model_dir: Path
    ) -> Path | None:
        """Download GGUF file using various pattern strategies.

        Tries multiple pattern variations to find and download appropriate
        GGUF files, handling timeouts and temporary directories.

        Returns:
            Path to downloaded file, or None if all patterns fail.
        """
        if pattern:
            patterns = [
                f"*{pattern}*",
                f"*{pattern.lower()}*",
                f"*{pattern.upper()}*",
                "*f16*",
                "*F16*",
                "*fp16*",
            ]
        else:
            patterns = ["*f16*", "*F16*", "*fp16*"]

        temp_dir = model_dir / "gguf_temp"

        for search_pattern in patterns:
            logger.info(f"🔍 Trying pattern: {search_pattern}")
            temp_dir.mkdir(exist_ok=True)

            try:
                subprocess.run(
                    [
                        "timeout",
                        "300",
                        "huggingface-cli",
                        "download",
                        source_model,
                        "--include",
                        search_pattern,
                        "--local-dir",
                        str(temp_dir),
                    ],
                    check=True,
                    capture_output=True,
                )

                # Find downloaded GGUF files
                gguf_files = self.fs.find_gguf_files(temp_dir, pattern)
                if gguf_files:
                    found_file = gguf_files[0]
                    logger.info(f"✅ Found GGUF file: {found_file.name}")

                    # Move to parent directory
                    final_path = model_dir / found_file.name
                    shutil.move(str(found_file), str(final_path))
                    shutil.rmtree(temp_dir)
                    return final_path

            except subprocess.CalledProcessError:
                logger.info(f"⚠️ Pattern {search_pattern} failed or timed out")
                continue
            finally:
                if temp_dir.exists():
                    shutil.rmtree(temp_dir, ignore_errors=True)

        return None
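
    # Pattern expansion sketch (hypothetical pattern "Q8_0"): tries
    #     *Q8_0*, *q8_0*, *Q8_0*, then *f16*, *F16*, *fp16*,
    # preferring the requested quantisation with F16 variants as fallback.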

    def _handle_regular_repo(
        self,
        model_source: ModelSource,
        model_dir: Path,
        llama_env: LlamaCppEnvironment | None,
    ) -> Path:
        """Handle regular HuggingFace repository conversion.

        Downloads full model repository and converts to F16 GGUF format
        using llama.cpp conversion scripts.

        Returns:
            Path to converted F16 GGUF model.
        """
        logger.info(f"⬇️ Downloading source model: {model_source.source_model}")

        if not model_dir.exists():
            subprocess.run(
                [
                    "huggingface-cli",
                    "download",
                    model_source.source_model,
                    "--local-dir",
                    str(model_dir),
                ],
                check=True,
            )
        else:
            logger.info("✅ Model already downloaded")

        logger.info("🔄 Converting to GGUF F16 format...")
        f16_model = model_dir / f"{model_source.model_name}-f16.gguf"

        if not f16_model.exists():
            if not llama_env:
                llama_env = self.environment_manager.setup()

            # Ensure conversion script is available
            if llama_env.use_repo or not self.environment_manager.llama_cpp_dir.exists():
                logger.info("Getting conversion script from llama.cpp repository...")
                llama_env = self.environment_manager.setup_repository()

            subprocess.run(
                [
                    *llama_env.convert_script.split(),
                    str(model_dir),
                    "--outtype",
                    "f16",
                    "--outfile",
                    str(f16_model),
                ],
                check=True,
            )
        else:
            logger.info("✅ F16 model already exists")

        return f16_model
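
    # Conversion sketch (hypothetical convert_script value): with
    #     llama_env.convert_script = "python convert_hf_to_gguf.py"
    # the call expands to:
    #     python convert_hf_to_gguf.py <model_dir> --outtype f16 --outfile <model>-f16.gguf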


class HuggingFaceUploader:
    """Handles uploading models and documentation to HuggingFace.

    Provides methods for repository creation, file uploads, and README
    updates with proper error handling and retry logic.
    """

    @staticmethod
    def get_username() -> str:
        """Get authenticated HuggingFace username.

        Returns:
            HuggingFace username from CLI authentication.

        Raises:
            RuntimeError: If not authenticated.
        """
        try:
            result = subprocess.run(
                ["huggingface-cli", "whoami"],
                capture_output=True,
                text=True,
                check=True,
            )
            return result.stdout.strip()
        except (subprocess.CalledProcessError, FileNotFoundError) as err:
            msg = "Please log in to HuggingFace first: huggingface-cli login"
            raise RuntimeError(msg) from err
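
    # Usage sketch (repo naming is hypothetical):
    #     username = HuggingFaceUploader.get_username()
    #     output_repo = f"{username}/{model_name}-GGUF"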

    def upload_readme(self, output_repo: str, readme_path: Path) -> None:
        """Upload or update README file to repository.

        Creates repository if needed, handles existing repository updates.
        """
        logger.info("Uploading README...")
        try:
            subprocess.run(
                [
                    "huggingface-cli",
                    "upload",
                    output_repo,
                    str(readme_path),
                    "README.md",
                    "--create",
                ],
                check=True,
                capture_output=True,
            )
            logger.info("README uploaded")
        except subprocess.CalledProcessError:
            # Repository exists, update without --create
            subprocess.run(
                [
                    "huggingface-cli",
                    "upload",
                    output_repo,
                    str(readme_path),
                    "README.md",
                ],
                check=True,
            )
            logger.info("README updated")

    def upload_model_file(self, output_repo: str, model_path: Path) -> None:
        """Upload model file to repository.

        Uploads GGUF model file to specified repository path.
        """
        logger.info(f"Uploading {model_path.name}...")
        subprocess.run(
            [
                "huggingface-cli",
                "upload",
                output_repo,
                str(model_path),
                model_path.name,
            ],
            check=True,
        )
        logger.info(f"{model_path.name} uploaded")
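
# End-to-end sketch (hypothetical wiring of the three services):
#     f16_path = ModelManager(models_dir, env_manager).prepare_model(source, llama_env)
#     result = QuantisationEngine().quantise(context)
#     uploader = HuggingFaceUploader()
#     if result.success and result.file_path is not None:
#         repo = f"{uploader.get_username()}/{source.model_name}-GGUF"
#         uploader.upload_readme(repo, readme_path)
#         uploader.upload_model_file(repo, result.file_path)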