llm-gguf-tools/helpers/services/quantisation.py

675 lines
25 KiB
Python

"""Quantisation operations service.
Provides modular quantisation engine, model management, and upload capabilities
for GGUF model processing. Consolidates quantisation logic from various tools
into reusable components following SOLID principles.
"""
from __future__ import annotations
import shutil
import subprocess
import tempfile
import traceback
from pathlib import Path
from helpers.logger import logger
from helpers.models.quantisation import (
ModelSource,
QuantisationContext,
QuantisationResult,
QuantisationType,
)
from helpers.services.filesystem import FilesystemService
from helpers.services.gguf import GGUFConverter
from helpers.services.llama_python import LlamaCppPythonAPI
from helpers.utils.config_parser import ConfigParser
from helpers.utils.tensor_mapping import TensorMapper
class QuantisationEngine:
    """Run a single quantisation job through the llama-cpp-python wrapper.

    The engine checks that the F16 input exists, logs diagnostic details,
    delegates the conversion to ``LlamaCppPythonAPI.quantise_model`` and
    reports the outcome as a ``QuantisationResult`` rather than raising.
    """

    def __init__(self) -> None:
        """Create the filesystem helper and the Python API wrapper."""
        self.fs = FilesystemService()
        self.python_api = LlamaCppPythonAPI()

    def quantise(self, context: QuantisationContext) -> QuantisationResult:
        """Quantise the model described by ``context``.

        Args:
            context: Supplies the quantisation config, the F16 input path, the
                optional importance-matrix path and the output path.

        Returns:
            QuantisationResult flagged successful (with file path and size) when
            the Python API reports success; otherwise a failed result carrying
            an error message. Exceptions raised by the API call are logged with
            their traceback and converted into a failed result.
        """
        config = context.config
        logger.debug(f"DEBUG: Starting quantisation for {config.name}")
        logger.info(f"⚙️ Creating {config.name} quantisation ({config.description})...")
        target = context.get_output_path()
        logger.debug(f"DEBUG: Output path: {target}")

        # Guard clause: nothing to do without the F16 input.
        source = context.f16_model_path
        if not source.exists():
            missing_msg = f"Input model file does not exist: {source}"
            logger.error(missing_msg)
            return self._create_failure_result(config.name, missing_msg)

        # Diagnostics only: the input size is logged, free space is not
        # actually compared against it.
        try:
            size_gib = source.stat().st_size / (1024**3)
            logger.debug(f"DEBUG: Input file size: {size_gib:.2f} GB")
            logger.debug(f"DEBUG: Output directory: {target.parent}")
        except Exception as stat_error:
            logger.warning(f"⚠️ Could not check disk space: {stat_error}")

        logger.info(f"🎯 Attempting {config.name} quantisation...")
        logger.debug(f"DEBUG: Source: {source}")
        logger.debug(f"DEBUG: Target: {target}")
        logger.debug(f"DEBUG: imatrix: {context.imatrix_path}")

        try:
            logger.info("🐍 Using Python API for quantisation...")
            logger.debug("DEBUG: Calling python_api.quantise_model...")
            succeeded = self.python_api.quantise_model(
                source, target, config, context.imatrix_path
            )
            logger.debug(f"DEBUG: Python API returned: {succeeded}")

            if not succeeded:
                logger.error(f"{config.name} quantisation failed")
                return self._create_failure_result(
                    config.name, "Quantisation failed via Python API"
                )

            logger.debug("DEBUG: Quantisation successful, creating success result")
            return self._create_success_result(config.name, target, "Python API")
        except Exception as error:
            logger.error(f"❌ Exception during {config.name} quantisation: {error}")
            logger.error("Exception traceback:")
            for trace_line in traceback.format_exc().splitlines():
                logger.error(f" {trace_line}")
            return self._create_failure_result(
                config.name, f"Exception during quantisation: {error!s}"
            )

    def _create_failure_result(self, quant_type: str, error_message: str) -> QuantisationResult:
        """Build a failed result for ``quant_type`` carrying ``error_message``.

        Returns:
            QuantisationResult with ``success=False``.
        """
        return QuantisationResult(
            quantisation_type=QuantisationType(quant_type),
            success=False,
            error_message=error_message,
        )

    def _create_success_result(
        self, quant_type: str, output_path: Path, method_used: str
    ) -> QuantisationResult:
        """Build a successful result, including the size of the written file.

        Returns:
            QuantisationResult with ``success=True``, the output path, its size
            as reported by the filesystem service, and the method label.
        """
        return QuantisationResult(
            quantisation_type=QuantisationType(quant_type),
            success=True,
            file_path=output_path,
            file_size=self.fs.get_file_size(output_path),
            method_used=method_used,
        )
class ModelManager:
    """Download models and prepare an F16 GGUF file for quantisation.

    Supports two sources: repositories that already publish GGUF files
    (downloaded by filename pattern) and regular HuggingFace repositories
    holding SafeTensors weights (downloaded in full and converted).
    """

    # Seconds allowed for one pattern-filtered ``huggingface-cli download`` attempt.
    _PATTERN_DOWNLOAD_TIMEOUT = 300

    def __init__(self, models_dir: Path) -> None:
        """Store the root directory under which each model gets its own folder."""
        self.models_dir = models_dir
        self.fs = FilesystemService()

    def prepare_model(self, model_source: ModelSource) -> Path:
        """Prepare a model for quantisation and return the F16 model path.

        Dispatches on ``model_source.is_gguf_repo`` to either the GGUF download
        path or the download-and-convert path.

        Returns:
            Path to the GGUF model ready for quantisation.
        """
        model_dir = self.models_dir / model_source.model_name
        if model_source.is_gguf_repo:
            return self._handle_gguf_repo(model_source, model_dir)
        return self._handle_regular_repo(model_source, model_dir)

    @staticmethod
    def _build_search_patterns(pattern: str | None) -> list[str]:
        """Return the ordered, de-duplicated glob patterns to try for a download.

        The caller's pattern (as given, lower-cased and upper-cased) comes first,
        followed by the generic F16 patterns. Duplicates are dropped so the same
        glob is never downloaded twice.
        """
        candidates: list[str] = []
        if pattern:
            candidates += [f"*{pattern}*", f"*{pattern.lower()}*", f"*{pattern.upper()}*"]
        candidates += ["*f16*", "*F16*", "*fp16*"]
        # dict.fromkeys keeps first-seen order while removing repeats.
        return list(dict.fromkeys(candidates))

    @staticmethod
    def _split_shard_name(file_name: str) -> tuple[str, int, str] | None:
        """Split a multi-part file name around its part index.

        ``"m-00002-of-00005.gguf"`` becomes ``("m-", 2, "-of-00005.gguf")``.

        Returns:
            ``(prefix, part_index, suffix)``, or None when the name carries no
            ``-NNNNN-of-NNNNN`` marker.
        """
        import re

        match = re.search(r"-(\d{5})-of-\d{5}", file_name)
        if match is None:
            return None
        return file_name[: match.start(1)], int(match.group(1)), file_name[match.end(1) :]

    @staticmethod
    def _shard_set(found_file: Path) -> list[Path]:
        """Return ``found_file`` together with its sibling shards, sorted by name.

        A file whose name has no multi-part marker is returned on its own.
        """
        parts = ModelManager._split_shard_name(found_file.name)
        if parts is None:
            return [found_file]
        prefix, _, suffix = parts
        siblings = []
        for candidate in sorted(found_file.parent.iterdir()):
            candidate_parts = ModelManager._split_shard_name(candidate.name)
            if (
                candidate_parts is not None
                and candidate_parts[0] == prefix
                and candidate_parts[2] == suffix
                and candidate.is_file()
            ):
                siblings.append(candidate)
        return siblings

    @staticmethod
    def _has_downloaded_files(model_dir: Path) -> bool:
        """Report whether ``model_dir`` holds at least one non-hidden entry."""
        if not model_dir.is_dir():
            return False
        return any(not entry.name.startswith(".") for entry in model_dir.iterdir())

    def _handle_gguf_repo(self, model_source: ModelSource, model_dir: Path) -> Path:
        """Obtain a GGUF file from a GGUF repository.

        Reuses an existing ``<model_name>-f16.gguf`` or any GGUF file already in
        ``model_dir``; otherwise downloads by pattern. A downloaded multi-part
        file resolves to its first part; a single file is renamed to the
        standard F16 name. If nothing can be downloaded, falls back to
        downloading the full repository and converting it.

        Returns:
            Path to the downloaded, existing or converted GGUF file.
        """
        logger.info(f"⬇️ Downloading GGUF file from repository: {model_source.source_model}")
        logger.info(f"🔍 Looking for file pattern: *{model_source.gguf_file_pattern}*")
        f16_model = model_dir / f"{model_source.model_name}-f16.gguf"
        if f16_model.exists():
            logger.info(f"✅ Found existing F16 file: {f16_model.name}")
            return f16_model

        # Check for existing GGUF files
        model_dir.mkdir(parents=True, exist_ok=True)
        existing_gguf = self.fs.find_gguf_files(model_dir)
        if existing_gguf:
            logger.info(f"✅ Found existing GGUF file: {existing_gguf[0].name}")
            return existing_gguf[0]

        # Download with patterns
        downloaded_file = self._download_gguf_with_patterns(
            model_source.source_model, model_source.gguf_file_pattern, model_dir
        )
        if downloaded_file:
            # Handle multi-part files: the first part is the entry point.
            if "00001-of-" in downloaded_file.name:
                return downloaded_file
            shard = self._split_shard_name(downloaded_file.name)
            if shard is not None and shard[1] != 1:
                prefix, _, suffix = shard
                first_part = downloaded_file.parent / f"{prefix}00001{suffix}"
                if first_part.exists():
                    logger.info(f"🔄 Using first part: {first_part.name}")
                    return first_part
                logger.warning(
                    f"⚠️ First part {first_part.name} not found; "
                    f"treating {downloaded_file.name} as a single file"
                )
            # Rename single file to standard name
            downloaded_file.rename(f16_model)
            return f16_model

        # Fallback to regular conversion
        logger.info("💡 Falling back to downloading full repository and converting...")
        # NOTE(review): `.dict()` is the pydantic-v1 spelling; presumably
        # ModelSource is a pydantic model — confirm before migrating to v2.
        return self._handle_regular_repo(
            ModelSource(**{**model_source.dict(), "is_gguf_repo": False}),
            model_dir,
        )

    def _download_gguf_with_patterns(
        self, source_model: str, pattern: str | None, model_dir: Path
    ) -> Path | None:
        """Download a GGUF file by trying a series of filename patterns.

        Each pattern is downloaded into a temporary ``gguf_temp`` directory
        inside ``model_dir`` with a bounded run time. The first GGUF file found,
        together with any sibling shards of the same multi-part set, is moved
        into ``model_dir``. The temporary directory is removed after every
        attempt.

        Returns:
            Path to the downloaded file in ``model_dir``, or None if every
            pattern failed, timed out or matched nothing.
        """
        temp_dir = model_dir / "gguf_temp"
        for search_pattern in self._build_search_patterns(pattern):
            logger.info(f"🔍 Trying pattern: {search_pattern}")
            temp_dir.mkdir(parents=True, exist_ok=True)
            try:
                logger.debug(
                    f"DEBUG: Running huggingface-cli download for pattern {search_pattern}"
                )
                # The time limit is enforced by subprocess itself, so no
                # external `timeout` executable is required.
                result = subprocess.run(
                    [
                        "huggingface-cli",
                        "download",
                        source_model,
                        "--include",
                        search_pattern,
                        "--local-dir",
                        str(temp_dir),
                    ],
                    check=True,
                    capture_output=True,
                    text=True,
                    timeout=self._PATTERN_DOWNLOAD_TIMEOUT,
                )
                logger.debug(
                    f"DEBUG: Download command completed with return code {result.returncode}"
                )
                # Find downloaded GGUF files.
                # NOTE(review): the caller's original `pattern` (possibly None)
                # is passed here, not `search_pattern`; verify that
                # find_gguf_files does not filter out fallback *f16* matches.
                gguf_files = self.fs.find_gguf_files(temp_dir, pattern)
                if gguf_files:
                    found_file = gguf_files[0]
                    logger.info(f"✅ Found GGUF file: {found_file.name}")
                    # Move the file and its sibling shards to the parent
                    # directory; the `finally` clause removes the temp dir.
                    for shard_file in self._shard_set(found_file):
                        shutil.move(str(shard_file), str(model_dir / shard_file.name))
                    return model_dir / found_file.name
            except subprocess.CalledProcessError as e:
                logger.debug(
                    f"DEBUG: Pattern {search_pattern} failed with return code {e.returncode}"
                )
                if e.stderr:
                    logger.debug(f"DEBUG: stderr: {e.stderr}")
                if e.stdout:
                    logger.debug(f"DEBUG: stdout: {e.stdout}")
                logger.info(f"⚠️ Pattern {search_pattern} failed or timed out")
                continue
            except subprocess.TimeoutExpired:
                logger.debug(
                    f"DEBUG: Pattern {search_pattern} exceeded "
                    f"{self._PATTERN_DOWNLOAD_TIMEOUT}s"
                )
                logger.info(f"⚠️ Pattern {search_pattern} failed or timed out")
                continue
            except Exception as e:
                logger.error(f"❌ Unexpected error during download: {e}")
                logger.error("Exception traceback:")
                for line in traceback.format_exc().splitlines():
                    logger.error(f" {line}")
                continue
            finally:
                if temp_dir.exists():
                    shutil.rmtree(temp_dir, ignore_errors=True)
        return None

    def _handle_regular_repo(
        self,
        model_source: ModelSource,
        model_dir: Path,
    ) -> Path:
        """Download a regular HuggingFace repository and convert it to F16 GGUF.

        The download is skipped only when ``model_dir`` already contains files.
        An empty directory (for example one created by the GGUF path before it
        fell back here) is treated as not downloaded.

        Returns:
            Path to the converted F16 GGUF model.
        """
        logger.info(f"⬇️ Downloading source model: {model_source.source_model}")
        # Download model if needed
        if self._has_downloaded_files(model_dir):
            logger.info("✅ Model already downloaded")
        else:
            self._download_repository(model_source.source_model, model_dir)
        # Convert to GGUF
        return self._convert_to_gguf(model_source, model_dir)

    def _download_repository(self, source_model: str, model_dir: Path) -> None:
        """Download a full HuggingFace repository with ``huggingface-cli``.

        Args:
            source_model: HuggingFace model identifier.
            model_dir: Local directory for download.

        Raises:
            RuntimeError: If the download command exits with a non-zero status.
        """
        try:
            logger.debug(f"DEBUG: Downloading full repository: {source_model}")
            result = subprocess.run(
                [
                    "huggingface-cli",
                    "download",
                    source_model,
                    "--local-dir",
                    str(model_dir),
                ],
                check=True,
                capture_output=True,
                text=True,
            )
            logger.debug(
                f"DEBUG: Repository download completed with return code {result.returncode}"
            )
        except subprocess.CalledProcessError as e:
            logger.error(f"❌ Failed to download repository {source_model}")
            logger.error(f"Return code: {e.returncode}")
            if e.stderr:
                logger.error(f"stderr: {e.stderr}")
            if e.stdout:
                logger.error(f"stdout: {e.stdout}")
            msg = f"Repository download failed: {e}"
            raise RuntimeError(msg) from e
        except Exception as e:
            # Anything else (e.g. the CLI is not installed) is logged and re-raised.
            logger.error(f"❌ Unexpected error during repository download: {e}")
            logger.error("Exception traceback:")
            for line in traceback.format_exc().splitlines():
                logger.error(f" {line}")
            raise

    def _convert_to_gguf(self, model_source: ModelSource, model_dir: Path) -> Path:
        """Convert the SafeTensors model in ``model_dir`` to GGUF F16 format.

        Args:
            model_source: Model source information.
            model_dir: Directory containing model files.

        Returns:
            Path to F16 GGUF model (an existing file is reused).

        Raises:
            RuntimeError: If no SafeTensors files are present or conversion fails.
        """
        logger.info("🔄 Converting to GGUF F16 format...")
        f16_model = model_dir / f"{model_source.model_name}-f16.gguf"
        if f16_model.exists():
            logger.info("✅ F16 model already exists")
            return f16_model

        # Check for SafeTensors files
        safetensor_files = list(model_dir.glob("*.safetensors"))
        if not safetensor_files:
            logger.error("❌ Model format not supported")
            logger.info("💡 This tool supports GGUF and SafeTensors formats")
            msg = "Model must be in GGUF or SafeTensors format"
            raise RuntimeError(msg)
        logger.info("🐍 Using native Python GGUFConverter...")
        logger.info(f"✅ Found {len(safetensor_files)} SafeTensors files")

        # Load model configuration
        config_parser = ConfigParser()
        model_config = config_parser.load_model_config(model_dir)

        # Get architecture mapping; "llama" is used when the config lists none.
        arch_name = model_config.architectures[0] if model_config.architectures else "llama"
        arch = config_parser.get_architecture_mapping(arch_name)
        if arch != arch_name:
            logger.info(f"📝 Architecture mapping: {arch_name} → {arch}")

        # Convert using GGUFConverter
        tensor_mapper = TensorMapper()
        success = GGUFConverter.convert_safetensors(
            model_dir, f16_model, model_config, arch, tensor_mapper
        )
        if not success:
            logger.error("❌ Native Python conversion failed")
            msg = "Failed to convert SafeTensors model to GGUF"
            raise RuntimeError(msg)
        logger.info("✅ Native Python conversion successful")
        return f16_model
class HuggingFaceUploader:
    """Upload models and documentation to HuggingFace.

    Wraps ``huggingface-cli`` for username lookup, repository creation and
    README / model file uploads, logging command output on failure and raising
    ``RuntimeError`` when an upload command fails.
    """

    @staticmethod
    def _parse_whoami_output(stdout: str) -> str:
        """Extract the username from ``huggingface-cli whoami`` output.

        Only the first non-empty line is the username; later lines (such as an
        ``orgs:`` listing) are ignored.

        Returns:
            The username.

        Raises:
            RuntimeError: If the output is empty or reports "Not logged in".
        """
        lines = [line.strip() for line in stdout.splitlines() if line.strip()]
        username = lines[0] if lines else ""
        if not username or username.lower() == "not logged in":
            msg = "Please log in to HuggingFace first: huggingface-cli login"
            raise RuntimeError(msg)
        return username

    @staticmethod
    def _log_process_failure(error: subprocess.CalledProcessError) -> None:
        """Log the return code and any captured output of a failed command."""
        logger.error(f"Return code: {error.returncode}")
        if error.stderr:
            logger.error(f"stderr: {error.stderr}")
        if error.stdout:
            logger.error(f"stdout: {error.stdout}")

    @staticmethod
    def _log_traceback() -> None:
        """Log the traceback of the exception currently being handled."""
        logger.error("Exception traceback:")
        for line in traceback.format_exc().splitlines():
            logger.error(f" {line}")

    @staticmethod
    def get_username() -> str:
        """Get authenticated HuggingFace username.

        Returns:
            HuggingFace username from CLI authentication.

        Raises:
            RuntimeError: If the CLI is missing, exits with an error, or does
                not report a logged-in user.
        """
        try:
            result = subprocess.run(
                ["huggingface-cli", "whoami"],
                capture_output=True,
                text=True,
                check=True,
            )
        except (subprocess.CalledProcessError, FileNotFoundError) as err:
            msg = "Please log in to HuggingFace first: huggingface-cli login"
            raise RuntimeError(msg) from err
        return HuggingFaceUploader._parse_whoami_output(result.stdout)

    def upload_readme(self, output_repo: str, readme_path: Path) -> None:
        """Upload or update the README file of a repository.

        Attempts to create the repository first, then uploads ``readme_path``
        as ``README.md``.

        Raises:
            RuntimeError: If the README upload fails.
        """
        logger.info("Uploading README...")
        # First ensure the repository exists
        self._ensure_repo_exists(output_repo)
        # Upload without --create flag to avoid PR creation
        try:
            logger.debug(f"DEBUG: Uploading README to {output_repo}")
            result = subprocess.run(
                [
                    "huggingface-cli",
                    "upload",
                    output_repo,
                    str(readme_path),
                    "README.md",
                    "--commit-message",
                    "Update README.md",
                ],
                check=True,
                capture_output=True,
                text=True,
            )
            logger.debug(f"DEBUG: README upload completed with return code {result.returncode}")
        except subprocess.CalledProcessError as e:
            logger.error(f"❌ Failed to upload README to {output_repo}")
            self._log_process_failure(e)
            msg = f"README upload failed: {e}"
            raise RuntimeError(msg) from e
        except Exception as e:
            logger.error(f"❌ Unexpected error during README upload: {e}")
            self._log_traceback()
            raise
        logger.info("README uploaded")

    def _ensure_repo_exists(self, repo_id: str) -> None:
        """Try to create the repository, tolerating failure.

        A non-zero exit is assumed to mean the repository already exists, so it
        is not raised; the failure is logged at debug level so that other
        causes (for example authentication problems) remain diagnosable.
        """
        try:
            # Try to create the repo - will fail if it already exists
            subprocess.run(
                [
                    "huggingface-cli",
                    "repo",
                    "create",
                    repo_id,
                    "--type",
                    "model",
                    "-y",
                ],
                check=True,
                capture_output=True,
                text=True,
            )
            logger.info(f"Created repository: {repo_id}")
        except subprocess.CalledProcessError as e:
            # Best-effort by design: carry on and let the upload surface any
            # real problem.
            logger.debug(
                f"DEBUG: Repository creation for {repo_id} exited with return code "
                f"{e.returncode}; assuming it already exists"
            )
            if e.stderr:
                logger.debug(f"DEBUG: stderr: {e.stderr}")

    def upload_model_file(self, output_repo: str, model_path: Path) -> None:
        """Upload a model file to the root of a repository.

        The file is pushed to the ``main`` revision via ``huggingface-cli``
        under its own file name. If the command output contains a
        huggingface.co URL, the first such line is logged.

        Raises:
            RuntimeError: If the model file upload fails.
        """
        logger.info(f"Uploading {model_path.name}...")
        # Always use huggingface-cli for model files to ensure xet backend is used
        try:
            logger.debug(f"DEBUG: Uploading model file {model_path.name} to {output_repo}")
            result = subprocess.run(
                [
                    "huggingface-cli",
                    "upload",
                    output_repo,
                    str(model_path),
                    model_path.name,
                    "--revision",
                    "main",  # Explicitly push to main branch
                    "--commit-message",
                    f"Add {model_path.name}",
                ],
                check=True,
                capture_output=True,
                text=True,
            )
            logger.debug(f"DEBUG: Model upload completed with return code {result.returncode}")
        except subprocess.CalledProcessError as e:
            logger.error(f"❌ Failed to upload model file {model_path.name} to {output_repo}")
            self._log_process_failure(e)
            msg = f"Model file upload failed: {e}"
            raise RuntimeError(msg) from e
        except Exception as e:
            logger.error(f"❌ Unexpected error during model file upload: {e}")
            self._log_traceback()
            raise

        # Extract and log the URL if present in output
        if result.stdout:
            for line in result.stdout.splitlines():
                if "https://huggingface.co/" in line:
                    logger.info(f"Upload URL: {line.strip()}")
                    break
        logger.info(f"{model_path.name} uploaded")

    def _try_git_upload_file(
        self,
        repo_id: str,
        local_path: Path,
        repo_path: str,
        *,
        create_repo: bool = False,
    ) -> bool:
        """Try to upload a file with plain git to avoid PR creation.

        Clones the repository into a temporary directory, copies ``local_path``
        to ``repo_path`` inside the clone, and commits and pushes if git reports
        a change.

        Args:
            repo_id: HuggingFace repository identifier.
            local_path: File to upload.
            repo_path: Destination path relative to the repository root.
            create_repo: When True, a failed clone is treated as "repository
                does not exist yet" and is not logged as a warning.

        Returns:
            bool: True if the file was pushed or was already up-to-date, False
            if the caller should fall back to the CLI.
        """
        try:
            with tempfile.TemporaryDirectory() as temp_dir:
                temp_path = Path(temp_dir)
                repo_url = f"https://huggingface.co/{repo_id}"
                # Clone repository
                logger.info(f"Cloning {repo_url}...")
                result = subprocess.run(
                    ["git", "clone", repo_url, str(temp_path / "repo")],
                    check=False,
                    capture_output=True,
                    text=True,
                )
                if result.returncode != 0:
                    # With create_repo set, huggingface-cli is left to handle
                    # creation, so the clone failure is expected and not warned.
                    if not create_repo:
                        logger.warning(f"Clone failed: {result.stderr}")
                    return False

                repo_dir = temp_path / "repo"
                target_file = repo_dir / repo_path
                # Ensure target directory exists
                target_file.parent.mkdir(parents=True, exist_ok=True)
                # Copy file
                shutil.copy2(local_path, target_file)

                # Check if there are any changes
                status_result = subprocess.run(
                    ["git", "status", "--porcelain"],
                    cwd=repo_dir,
                    capture_output=True,
                    text=True,
                    check=True,
                )
                if not status_result.stdout.strip():
                    logger.info(f"No changes detected for {repo_path}, file already up-to-date")
                    return True  # File is already up-to-date, no need to push

                # Git add, commit, push
                git_steps = (
                    ["git", "add", repo_path],
                    ["git", "commit", "-m", f"Update {repo_path}"],
                    ["git", "push"],
                )
                for command in git_steps:
                    subprocess.run(
                        command,
                        cwd=repo_dir,
                        check=True,
                        capture_output=True,
                        text=True,
                    )
                return True
        except subprocess.CalledProcessError as e:
            logger.warning(f"Git upload failed: {e}")
            return False
        except Exception as e:
            logger.warning(f"Git upload error: {e}")
            return False