Initial commit

Tom Foster 2025-08-07 18:29:12 +01:00
commit ef7df1a8c3
28 changed files with 6829 additions and 0 deletions

helpers/services/__init__.py

@@ -0,0 +1,20 @@
"""Service layer for llm-gguf-tools.
Provides high-level service interfaces for interacting with external systems
including HuggingFace, llama.cpp, and filesystem operations. Uses UK English
spelling conventions throughout.
"""
from __future__ import annotations
from helpers.services.filesystem import FilesystemService
from helpers.services.huggingface import HuggingFaceService, ReadmeGenerator
from helpers.services.llama_cpp import EnvironmentManager, IMatrixGenerator
__all__ = [
"EnvironmentManager",
"FilesystemService",
"HuggingFaceService",
"IMatrixGenerator",
"ReadmeGenerator",
]
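
Because the package root re-exports these classes, call sites can import them straight from helpers.services. A small sketch:

# Sketch only: demonstrates the package-level import surface defined above.
from pathlib import Path

from helpers.services import FilesystemService

print(FilesystemService.get_file_size(Path(__file__)))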

helpers/services/filesystem.py

@@ -0,0 +1,174 @@
"""Filesystem operations service.
Provides unified filesystem operations including file discovery, size
calculation, and path management. Consolidates common filesystem patterns
used across quantisation and conversion workflows.
"""
from __future__ import annotations
import json
import subprocess
from pathlib import Path
from typing import Any
from helpers.logger import logger
BYTES_PER_UNIT = 1024.0
class FilesystemService:
"""Handles filesystem operations with consistent error handling.
Provides methods for file discovery, size formatting, and JSON loading
with proper error handling and logging. Ensures consistent behaviour
across different tools and workflows.
"""
@staticmethod
def get_file_size(file_path: Path) -> str:
"""Get human-readable file size using system utilities.
Attempts to use `du -h` for human-readable output, falling back to
Python calculation if the system command fails. Provides consistent
size formatting across the toolset.
Returns:
Human-readable file size string (e.g., "1.5G", "750M").
"""
try:
result = subprocess.run(
["du", "-h", str(file_path)], capture_output=True, text=True, check=True
)
return result.stdout.split()[0]
except (subprocess.CalledProcessError, FileNotFoundError):
# Fallback to Python calculation
try:
size_bytes: float = float(file_path.stat().st_size)
for unit in ["B", "K", "M", "G", "T"]:
if size_bytes < BYTES_PER_UNIT:
return f"{size_bytes:.1f}{unit}"
size_bytes /= BYTES_PER_UNIT
except Exception:
return "Unknown"
else:
return f"{size_bytes:.1f}P"
@staticmethod
def load_json_config(config_path: Path) -> dict[str, Any]:
"""Load and parse JSON configuration file.
Provides consistent JSON loading with proper error handling and
encoding specification. Used for loading model configurations,
tokeniser settings, and other JSON-based metadata.
Returns:
Parsed JSON content as dictionary.
Raises:
FileNotFoundError: If config file doesn't exist.
"""
if not config_path.exists():
msg = f"Configuration file not found: {config_path}"
raise FileNotFoundError(msg)
with Path(config_path).open(encoding="utf-8") as f:
return json.load(f)
@staticmethod
def find_safetensor_files(model_path: Path) -> list[Path]:
"""Find all SafeTensor files in model directory using priority search.
Searches for tensor files in order of preference: single model.safetensors,
sharded model-*-of-*.safetensors files, then any *.safetensors files. This
approach handles both single-file and multi-shard model distributions whilst
ensuring predictable file ordering for conversion consistency.
Returns:
List of SafeTensor file paths in priority order.
Raises:
FileNotFoundError: If no SafeTensor files are found.
"""
# Check for single file
single_file = model_path / "model.safetensors"
if single_file.exists():
return [single_file]
# Check for sharded files
pattern = "model-*-of-*.safetensors"
sharded_files = sorted(model_path.glob(pattern))
if sharded_files:
return sharded_files
# Check for any safetensor files
any_files = sorted(model_path.glob("*.safetensors"))
if any_files:
return any_files
msg = f"No SafeTensor files found in {model_path}"
raise FileNotFoundError(msg)
@staticmethod
def find_gguf_files(model_path: Path, pattern: str | None = None) -> list[Path]:
"""Find GGUF files in directory, optionally filtered by pattern.
Searches for GGUF files with optional pattern matching. Prioritises
multi-part files (00001-of-*) over single files for proper handling
of large models split across multiple files.
Returns:
List of GGUF file paths, sorted with multi-part files first.
"""
if pattern:
gguf_files = list(model_path.glob(f"*{pattern}*.gguf"))
else:
gguf_files = list(model_path.glob("*.gguf"))
# Sort to prioritise 00001-of-* files
gguf_files.sort(
key=lambda x: (
"00001-of-" not in x.name, # False sorts before True
x.name,
)
)
return gguf_files
@staticmethod
def ensure_directory(path: Path) -> Path:
"""Ensure directory exists, creating if necessary.
Creates directory and all parent directories if they don't exist.
Returns the path for method chaining convenience.
Returns:
The directory path.
"""
path.mkdir(parents=True, exist_ok=True)
return path
@staticmethod
def cleanup_directory(path: Path, pattern: str = "*") -> int:
"""Remove files matching pattern from directory.
Safely removes files matching the specified glob pattern. Returns
count of files removed for logging purposes.
Returns:
Number of files removed.
"""
if not path.exists():
return 0
files_removed = 0
for file_path in path.glob(pattern):
if file_path.is_file():
try:
file_path.unlink()
files_removed += 1
except Exception as e:
logger.warning(f"Failed to remove {file_path}: {e}")
return files_removed
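
A minimal usage sketch of FilesystemService; the model directory below is a placeholder path:

# Sketch only: "./work/models/example-model" is a placeholder, not a real model.
from pathlib import Path

from helpers.services.filesystem import FilesystemService

fs = FilesystemService()
model_dir = fs.ensure_directory(Path("./work/models/example-model"))

try:
    shards = fs.find_safetensor_files(model_dir)
    for shard in shards:
        print(shard.name, fs.get_file_size(shard))
except FileNotFoundError:
    print("No SafeTensor files downloaded yet")

removed = fs.cleanup_directory(model_dir, pattern="*.tmp")
print(f"Removed {removed} temporary file(s)")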

helpers/services/gguf.py

@@ -0,0 +1,210 @@
"""GGUF file operations service.
Provides unified interface for creating, writing, and manipulating GGUF files.
Consolidates GGUF-specific operations from conversion and quantisation workflows.
Uses UK English spelling conventions throughout.
"""
from __future__ import annotations
from typing import TYPE_CHECKING, Any
import gguf
import torch
from safetensors import safe_open
from helpers.logger import logger
from helpers.services.filesystem import FilesystemService
from helpers.utils.config_parser import ConfigParser
if TYPE_CHECKING:
from pathlib import Path
import numpy as np
from helpers.models.conversion import ModelConfig
class GGUFWriter:
"""Manages GGUF file creation and metadata writing.
Provides high-level interface for GGUF file operations including metadata
configuration, tensor addition, and tokeniser integration. Encapsulates
low-level GGUF library interactions for consistent error handling.
"""
def __init__(self, output_path: Path, architecture: str) -> None:
"""Initialise GGUF writer with output path and architecture.
Creates the underlying GGUF writer instance and prepares for metadata
and tensor addition. Sets up the file structure for the specified
model architecture.
"""
self.output_path = output_path
self.architecture = architecture
self.writer = gguf.GGUFWriter(str(output_path), architecture)
logger.info(f"Created GGUF writer for {architecture} architecture")
def add_metadata(self, model_config: ModelConfig, model_name: str) -> None:
"""Add comprehensive metadata from model configuration.
Writes general model information, architectural parameters, and
quantisation settings to the GGUF file header. Handles both standard
and vision model configurations with appropriate parameter mapping.
"""
# General metadata
self.writer.add_name(model_name)
self.writer.add_description(f"Converted from {model_config.architectures[0]}")
self.writer.add_file_type(gguf.LlamaFileType.ALL_F32)
# Model parameters from config
params = model_config.to_gguf_params()
self.writer.add_context_length(params.context_length)
self.writer.add_embedding_length(params.embedding_length)
self.writer.add_block_count(params.block_count)
self.writer.add_feed_forward_length(params.feed_forward_length)
self.writer.add_head_count(params.attention_head_count)
self.writer.add_head_count_kv(params.attention_head_count_kv)
self.writer.add_layer_norm_rms_eps(params.attention_layer_norm_rms_epsilon)
self.writer.add_rope_freq_base(params.rope_freq_base)
self.writer.add_rope_dimension_count(params.rope_dimension_count)
logger.info(f"Added metadata: {params.block_count} layers, {params.context_length} context")
def add_vision_metadata(self, vision_config: Any) -> None:
"""Add vision model parameters to GGUF metadata.
Configures vision-specific parameters for multimodal models including
embedding dimensions, attention heads, and spatial processing settings.
"""
if not vision_config:
return
logger.info("Adding vision model parameters...")
self.writer.add_vision_embedding_length(vision_config.hidden_size)
self.writer.add_vision_block_count(vision_config.num_hidden_layers)
self.writer.add_vision_head_count(vision_config.num_attention_heads)
self.writer.add_vision_feed_forward_length(vision_config.intermediate_size)
self.writer.add_vision_patch_size(vision_config.patch_size)
self.writer.add_vision_spatial_merge_size(vision_config.spatial_merge_size)
if hasattr(vision_config, "rms_norm_eps") and vision_config.rms_norm_eps:
self.writer.add_vision_attention_layernorm_eps(vision_config.rms_norm_eps)
def add_tokeniser(self, tokeniser_config: dict[str, Any]) -> None:
"""Add tokeniser metadata to GGUF file.
Writes special token IDs and tokeniser model type to enable proper
text processing during inference. Uses sensible defaults for missing
configuration values.
"""
self.writer.add_bos_token_id(tokeniser_config.get("bos_token_id", 1))
self.writer.add_eos_token_id(tokeniser_config.get("eos_token_id", 2))
self.writer.add_unk_token_id(tokeniser_config.get("unk_token_id", 0))
self.writer.add_pad_token_id(tokeniser_config.get("pad_token_id", 0))
self.writer.add_tokenizer_model(tokeniser_config.get("model_type", "llama"))
logger.info("Added tokeniser configuration")
def add_tensor(self, name: str, data: np.ndarray) -> None:
"""Add a tensor to the GGUF file.
Writes tensor data with the specified name to the file. Handles
data type conversions and validates tensor shapes.
"""
self.writer.add_tensor(name, data)
def finalise(self) -> None:
"""Write all data to file and close writer.
Completes the GGUF file creation by writing headers, key-value data,
and tensor data in the correct order. Ensures proper file closure.
"""
logger.info(f"Writing GGUF file to {self.output_path}")
self.writer.write_header_to_file()
self.writer.write_kv_data_to_file()
self.writer.write_tensors_to_file()
self.writer.close()
logger.info("GGUF file written successfully")
class GGUFConverter:
"""High-level GGUF conversion orchestrator.
Coordinates the complete conversion workflow from source models to GGUF
format, managing metadata extraction, tensor mapping, and file writing.
"""
@staticmethod
def convert_safetensors(
model_path: Path,
output_path: Path,
model_config: ModelConfig,
architecture: str,
tensor_mapper: Any,
) -> bool:
"""Convert SafeTensors model to GGUF format.
Orchestrates the conversion process including metadata setup, tensor
loading with BFloat16 support, name mapping, and tokeniser integration.
Returns:
True if conversion successful, False otherwise.
"""
logger.info(f"Converting {model_path.name} to GGUF...")
# Create writer
writer_wrapper = GGUFWriter(output_path, architecture)
# Add metadata
writer_wrapper.add_metadata(model_config, model_path.name)
# Add vision metadata if present
if model_config.vision_config:
writer_wrapper.add_vision_metadata(model_config.vision_config)
# Load and add tensors
fs = FilesystemService()
tensor_files = fs.find_safetensor_files(model_path)
logger.info(f"Found {len(tensor_files)} tensor file(s)")
tensor_count = 0
for tensor_file in tensor_files:
logger.info(f"Loading {tensor_file.name}...")
with safe_open(tensor_file, framework="pt") as f:
for tensor_name in f.keys():
tensor_data = f.get_tensor(tensor_name)
# Convert BFloat16 to Float32
if hasattr(tensor_data, "numpy"):
if torch and tensor_data.dtype == torch.bfloat16:
tensor_data = tensor_data.float()
tensor_data = tensor_data.numpy()
# Map tensor name
gguf_name = tensor_mapper.map_tensor_name(tensor_name)
if gguf_name:
writer_wrapper.add_tensor(gguf_name, tensor_data)
tensor_count += 1
if tensor_count % 100 == 0:
logger.info(f" Processed {tensor_count} tensors...")
logger.info(f"Total tensors processed: {tensor_count}")
# Add tokeniser
try:
tok_config = ConfigParser.load_tokeniser_config(model_path)
writer_wrapper.add_tokeniser(tok_config)
logger.info("Tokeniser added")
except Exception as e:
logger.warning(f"Could not add tokeniser: {e}")
# Finalise file
writer_wrapper.finalise()
file_size = fs.get_file_size(output_path)
logger.info(f"Conversion complete! Output: {output_path} ({file_size})")
return True
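
A hedged sketch of GGUFWriter used on its own, outside the full conversion flow; the output path, tokeniser values, and dummy tensor are illustrative only:

# Sketch: write a minimal GGUF file using only the wrapper methods defined above.
# The path, tokeniser values, and zero-filled tensor are placeholders.
from pathlib import Path

import numpy as np

from helpers.services.gguf import GGUFWriter

writer = GGUFWriter(Path("./example-f32.gguf"), architecture="llama")
writer.add_tokeniser({"bos_token_id": 1, "eos_token_id": 2, "model_type": "llama"})
writer.add_tensor("token_embd.weight", np.zeros((8, 4), dtype=np.float32))
writer.finalise()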

helpers/services/huggingface.py

@@ -0,0 +1,454 @@
"""HuggingFace operations service.
Handles all interactions with HuggingFace including model downloads,
uploads, README generation, and repository management. Uses UK English
spelling conventions throughout.
"""
from __future__ import annotations
import re
import subprocess
import tempfile
from pathlib import Path
from typing import TYPE_CHECKING
from helpers.logger import logger
from helpers.models.quantisation import QuantisationType
if TYPE_CHECKING:
from helpers.models.quantisation import ModelSource, QuantisationResult
class HuggingFaceService:
"""Manages HuggingFace repository operations.
Provides methods for downloading models, uploading files, and managing
repositories. Handles authentication, error recovery, and progress tracking
for robust interaction with HuggingFace services.
"""
@staticmethod
def get_username() -> str:
"""Get authenticated HuggingFace username.
Retrieves the current user's HuggingFace username using the CLI.
Requires prior authentication via `huggingface-cli login`.
Returns:
HuggingFace username.
Raises:
RuntimeError: If not authenticated or CLI not available.
"""
try:
result = subprocess.run(
["huggingface-cli", "whoami"],
capture_output=True,
text=True,
check=True,
)
return result.stdout.strip()
except (subprocess.CalledProcessError, FileNotFoundError) as err:
msg = "Please log in to HuggingFace first: huggingface-cli login"
raise RuntimeError(msg) from err
@staticmethod
def download_model(
model_name: str, output_dir: Path, include_pattern: str | None = None
) -> None:
"""Download model from HuggingFace.
Downloads a complete model or specific files matching a pattern.
Creates the output directory if it doesn't exist. Supports filtered
downloads for efficient bandwidth usage when only certain files are needed.
"""
logger.info(f"Downloading {model_name} to {output_dir}")
cmd = [
"huggingface-cli",
"download",
model_name,
"--local-dir",
str(output_dir),
]
if include_pattern:
cmd.extend(["--include", include_pattern])
subprocess.run(cmd, check=True)
logger.info("Download complete")
@staticmethod
def upload_file(
repo_id: str,
local_path: Path,
repo_path: str | None = None,
create_repo: bool = False,
) -> None:
"""Upload a file to HuggingFace repository.
Uploads a single file to the specified repository path. Can create
the repository if it doesn't exist. Handles repository creation conflicts
gracefully by retrying without the create flag when needed.
Raises:
CalledProcessError: If upload fails.
"""
repo_path = repo_path or local_path.name
logger.info(f"Uploading {local_path.name} to {repo_id}/{repo_path}")
cmd = [
"huggingface-cli",
"upload",
repo_id,
str(local_path),
repo_path,
]
if create_repo:
cmd.append("--create")
try:
subprocess.run(cmd, check=True, capture_output=True)
logger.info(f"Uploaded {repo_path}")
except subprocess.CalledProcessError:
if create_repo:
# Repository might already exist, retry without --create
cmd = cmd[:-1] # Remove --create flag
subprocess.run(cmd, check=True)
logger.info(f"Updated {repo_path}")
else:
raise
class ReadmeGenerator:
"""Generates README files for quantised models.
Creates comprehensive README documentation including model cards,
quantisation details, and status tracking. Supports both initial
planning documentation and final result summaries.
"""
def generate(
self,
model_source: ModelSource,
results: dict[QuantisationType, QuantisationResult],
models_dir: Path,
output_repo: str | None = None,
) -> Path:
"""Generate README file for quantised model repository.
Creates a comprehensive README with frontmatter, quantisation table,
and original model information. Handles status tracking for planned,
processing, and completed quantisations.
Returns:
Path to generated README file.
"""
logger.info("Creating model card...")
model_dir = models_dir / model_source.model_name
readme_path = model_dir / "README.md"
# Get original README content
original_content = self._get_original_readme(model_source, model_dir)
# Generate new README
readme_content = self._generate_readme_content(
model_source, results, original_content, output_repo
)
readme_path.write_text(readme_content)
return readme_path
def _get_original_readme(self, model_source: ModelSource, model_dir: Path) -> dict[str, str]:
"""Extract original README and metadata.
Downloads or reads the original model's README for inclusion in the
quantised model documentation. Parses YAML frontmatter if present.
Returns:
Dictionary with readme content, licence, tags, and frontmatter.
"""
content = {"readme": "", "licence": "apache-2.0", "tags": "", "frontmatter": ""}
# Try local file first
readme_path = model_dir / "README.md"
if readme_path.exists():
content["readme"] = readme_path.read_text(encoding="utf-8")
logger.info(f"Found original README ({len(content['readme'])} characters)")
else:
# Download separately
content = self._download_readme(model_source)
# Parse frontmatter if present
if content["readme"].startswith("---\n"):
content = self._parse_frontmatter(content["readme"])
return content
def _download_readme(self, model_source: ModelSource) -> dict[str, str]:
"""Download README from HuggingFace repository.
Attempts to download just the README.md file from the source repository
for efficient documentation extraction.
Returns:
Dictionary with readme content and default metadata.
"""
content = {"readme": "", "licence": "apache-2.0", "tags": "", "frontmatter": ""}
with tempfile.TemporaryDirectory() as temp_dir:
try:
logger.info(f"Downloading README from {model_source.source_model}...")
subprocess.run(
[
"huggingface-cli",
"download",
model_source.source_model,
"--include",
"README.md",
"--local-dir",
temp_dir,
],
check=True,
capture_output=True,
)
readme_path = Path(temp_dir) / "README.md"
if readme_path.exists():
content["readme"] = readme_path.read_text(encoding="utf-8")
logger.info(f"Downloaded README ({len(content['readme'])} characters)")
except subprocess.CalledProcessError as e:
logger.warning(f"Failed to download README: {e}")
return content
def _parse_frontmatter(self, readme_text: str) -> dict[str, str]:
"""Parse YAML frontmatter from README.
Extracts metadata from YAML frontmatter including licence, tags,
and other model card fields.
Returns:
Dictionary with separated content and metadata.
"""
lines = readme_text.split("\n")
if lines[0] != "---":
return {
"readme": readme_text,
"licence": "apache-2.0",
"tags": "",
"frontmatter": "",
}
frontmatter_end = -1
for i, line in enumerate(lines[1:], 1):
if line == "---":
frontmatter_end = i
break
if frontmatter_end == -1:
return {
"readme": readme_text,
"licence": "apache-2.0",
"tags": "",
"frontmatter": "",
}
frontmatter = "\n".join(lines[1:frontmatter_end])
content = "\n".join(lines[frontmatter_end + 1 :])
# Extract licence
licence_match = re.search(r"^license:\s*(.+)$", frontmatter, re.MULTILINE)
licence_val = licence_match.group(1).strip().strip('"') if licence_match else "apache-2.0"
# Extract tags
tags = []
in_tags = False
for line in frontmatter.split("\n"):
if line.startswith("tags:"):
in_tags = True
continue
if in_tags:
if line.startswith("- "):
tags.append(line[2:].strip())
elif line and not line.startswith(" "):
break
return {
"readme": content,
"licence": licence_val,
"tags": ",".join(tags),
"frontmatter": frontmatter,
}
def _generate_readme_content(
self,
model_source: ModelSource,
results: dict[QuantisationType, QuantisationResult],
original_content: dict[str, str],
output_repo: str | None = None,
) -> str:
"""Generate complete README content with quantisation details.
Creates the full README including YAML frontmatter, quantisation status
table, and original model information.
Returns:
Complete README markdown content.
"""
# Build tags
our_tags = [
"quantised",
"gguf",
"q4_k_m",
"q4_k_l",
"q4_k_xl",
"q4_k_xxl",
"bartowski-method",
]
original_tags = original_content["tags"].split(",") if original_content["tags"] else []
all_tags = sorted(set(our_tags + original_tags))
# Build frontmatter
frontmatter = f"""---
license: {original_content["licence"]}
library_name: gguf
base_model: {model_source.source_model}
tags:
"""
for tag in all_tags:
if tag.strip():
frontmatter += f"- {tag.strip()}\n"
frontmatter += "---\n\n"
# Build main content
hf_url = f"https://huggingface.co/{model_source.source_model}"
content = f"""# {model_source.original_author}-{model_source.model_name}-GGUF
GGUF quantisations of [{model_source.source_model}]({hf_url}) using Bartowski's method.
| Quantisation | Embeddings/Output | Attention | Feed-Forward | Status |
|--------------|-------------------|-----------|--------------|--------|
"""
# Add results table
for quant_type in [
QuantisationType.Q4_K_M,
QuantisationType.Q4_K_L,
QuantisationType.Q4_K_XL,
QuantisationType.Q4_K_XXL,
]:
result = results.get(quant_type)
if not result:
result = type("Result", (), {"status": "planned", "success": False})()
layers = self._get_layers_config(quant_type)
status = self._format_status(result, model_source, quant_type, output_repo)
content += (
f"| {quant_type.value} | {layers['embeddings']} | "
f"{layers['attention']} | {layers['ffn']} | {status} |\n"
)
content += "\n---\n\n"
# Add original content
if original_content["readme"]:
content += "# Original Model Information\n\n" + original_content["readme"]
else:
content += f"## Original Model\n\nQuantisation of [{model_source.source_model}](https://huggingface.co/{model_source.source_model}).\n"
return frontmatter + content
def _get_layers_config(self, quant_type: QuantisationType) -> dict[str, str]:
"""Get layer configuration for quantisation type.
Returns layer precision specifications for the quantisation table.
Returns:
Dictionary with embeddings, attention, and ffn precision labels.
"""
configs = {
QuantisationType.Q4_K_M: {
"embeddings": "Q4_K_M",
"attention": "Q4_K_M",
"ffn": "Q4_K_M",
},
QuantisationType.Q4_K_L: {"embeddings": "Q6_K", "attention": "Q6_K", "ffn": "Q4_K_M"},
QuantisationType.Q4_K_XL: {"embeddings": "Q8_0", "attention": "Q6_K", "ffn": "Q4_K_M"},
QuantisationType.Q4_K_XXL: {"embeddings": "Q8_0", "attention": "Q8_0", "ffn": "Q4_K_M"},
}
return configs.get(
quant_type, {"embeddings": "Unknown", "attention": "Unknown", "ffn": "Unknown"}
)
def _format_status(
self,
result: QuantisationResult,
model_source: ModelSource,
quant_type: QuantisationType,
output_repo: str | None,
) -> str:
"""Format status indicator for README table.
Creates appropriate status indicator based on quantisation state
including progress indicators, file sizes, and download links.
Returns:
Formatted status string for table cell.
"""
status_map = {
"planned": "⏳ Planned",
"processing": "🔄 Processing...",
"uploading": "⬆️ Uploading...",
"failed": "❌ Failed",
}
if hasattr(result, "status") and result.status in status_map:
base_status = status_map[result.status]
if result.status == "uploading" and hasattr(result, "file_size") and result.file_size:
return f"{base_status} ({result.file_size})"
if result.status == "completed" or (hasattr(result, "success") and result.success):
return self._format_success_status(result, model_source, quant_type, output_repo)
return base_status
# Legacy support
if hasattr(result, "success") and result.success:
return self._format_success_status(result, model_source, quant_type, output_repo)
return "❌ Failed"
def _format_success_status(
self,
result: QuantisationResult,
model_source: ModelSource,
quant_type: QuantisationType,
output_repo: str | None,
) -> str:
"""Format successful quantisation status with download link.
Creates a download link if repository information is available,
otherwise shows file size.
Returns:
Formatted success status string.
"""
if not output_repo:
return (
f"{result.file_size}"
if hasattr(result, "file_size") and result.file_size
else "✅ Available"
)
filename = (
f"{model_source.original_author}-{model_source.model_name}-{quant_type.value}.gguf"
)
url = f"https://huggingface.co/{output_repo}?show_file_info={filename}"
if hasattr(result, "file_size") and result.file_size:
return f"[✅ {result.file_size}]({url})"
return f"[✅ Available]({url})"

helpers/services/llama_cpp.py

@@ -0,0 +1,417 @@
"""llama.cpp environment and operations service.
Manages llama.cpp binary discovery, environment setup, and imatrix generation.
Provides consistent interface for interacting with llama.cpp tools across
different installation methods.
"""
from __future__ import annotations
import subprocess
from pathlib import Path
from helpers.logger import logger
from helpers.models.quantisation import LlamaCppEnvironment
from helpers.services.filesystem import FilesystemService
class EnvironmentManager:
"""Manages llama.cpp environment setup and binary discovery.
Handles detection of local binaries, repository setup, and conversion
script location. Provides fallback strategies for different installation
scenarios including local builds and repository-based setups.
"""
def __init__(self, work_dir: Path) -> None:
"""Initialise EnvironmentManager."""
self.work_dir = work_dir
self.llama_cpp_dir = work_dir / "llama.cpp"
self.fs = FilesystemService()
def setup(self) -> LlamaCppEnvironment:
"""Set up llama.cpp environment with automatic detection.
Checks for local llama.cpp binaries first, then falls back to
repository-based setup if needed. Handles conversion script location,
dependency installation, and path resolution.
Returns:
Configured LlamaCppEnvironment instance.
"""
# Check for local binaries first
local_env = self._check_local_binaries()
if local_env:
return local_env
# Setup repository if needed
return self.setup_repository()
def _check_local_binaries(self) -> LlamaCppEnvironment | None:
"""Check for existing llama.cpp binaries in current directory.
Searches for quantise and CLI binaries in the current directory
and standard installation paths. Also locates conversion scripts.
Returns:
LlamaCppEnvironment if binaries found, None otherwise.
"""
quantise_bin = Path("./llama-quantize")
cli_bin = Path("./llama-cli")
if not (quantise_bin.exists() and cli_bin.exists()):
return None
logger.info("Found llama.cpp binaries in current directory")
# Check for conversion script
convert_script = self._find_convert_script()
if convert_script:
logger.info(f"Found conversion script: {convert_script}")
return LlamaCppEnvironment(
quantise_binary=quantise_bin.resolve(),
cli_binary=cli_bin.resolve(),
convert_script=convert_script,
use_repo=False,
)
logger.warning("No conversion script found in current directory")
logger.info("Will use llama.cpp repository method for conversion")
return LlamaCppEnvironment(
quantise_binary=quantise_bin.resolve(),
cli_binary=cli_bin.resolve(),
convert_script=f"python3 {self.llama_cpp_dir}/convert_hf_to_gguf.py",
use_repo=True,
)
def _find_convert_script(self) -> str | None:
"""Find conversion script in current directory.
Searches for various naming conventions of the HF to GGUF
conversion script.
Returns:
Command to run conversion script, or None if not found.
"""
scripts = [
"./llama-convert-hf-to-gguf",
"python3 ./convert_hf_to_gguf.py",
"python3 ./convert-hf-to-gguf.py",
]
for script in scripts:
if script.startswith("python3"):
script_path = script.split(" ", 1)[1]
if Path(script_path).exists():
return script
elif Path(script).exists():
return script
return None
def setup_repository(self) -> LlamaCppEnvironment:
"""Setup llama.cpp repository for conversion scripts.
Clones the llama.cpp repository if not present and installs
Python dependencies for model conversion.
Returns:
LlamaCppEnvironment configured with repository paths.
"""
if not self.llama_cpp_dir.exists():
logger.info("Cloning llama.cpp for conversion script...")
subprocess.run(
[
"git",
"clone",
"https://github.com/ggerganov/llama.cpp.git",
str(self.llama_cpp_dir),
],
check=True,
)
# Install Python requirements
logger.info("Installing Python requirements...")
subprocess.run(
[
"pip3",
"install",
"-r",
"requirements.txt",
"--break-system-packages",
"--root-user-action=ignore",
],
cwd=self.llama_cpp_dir,
check=True,
)
# Install additional conversion dependencies
logger.info("Installing additional conversion dependencies...")
subprocess.run(
[
"pip3",
"install",
"transformers",
"sentencepiece",
"protobuf",
"--break-system-packages",
"--root-user-action=ignore",
],
check=True,
)
else:
logger.info("llama.cpp repository already exists")
# Use local binaries but repo conversion script
return LlamaCppEnvironment(
quantise_binary=Path("./llama-quantize").resolve(),
cli_binary=Path("./llama-cli").resolve(),
convert_script=f"python3 {self.llama_cpp_dir}/convert_hf_to_gguf.py",
use_repo=False,
)
class IMatrixGenerator:
"""Handles importance matrix generation for quantisation guidance.
Generates or locates importance matrices that guide quantisation
decisions, helping preserve model quality by identifying critical
tensors requiring higher precision.
"""
def __init__(self) -> None:
"""Initialise IMatrixGenerator."""
self.fs = FilesystemService()
def generate_imatrix(
self, f16_model_path: Path, llama_env: LlamaCppEnvironment, model_dir: Path
) -> Path | None:
"""Generate importance matrix for quantisation guidance.
Searches for existing imatrix files first, provides interactive
prompts for user-supplied matrices, then generates new matrices
using calibration data if necessary.
Returns:
Path to imatrix file, or None if generation fails.
"""
imatrix_path = model_dir / "imatrix.dat"
# Check for existing imatrix
if imatrix_path.exists():
logger.info(f"Found existing imatrix: {imatrix_path.name}")
return imatrix_path
# Try user-provided imatrix
user_imatrix = self._prompt_for_user_imatrix(model_dir, imatrix_path)
if user_imatrix:
return user_imatrix
# Generate new imatrix
calibration_file = self._get_calibration_file()
if not calibration_file:
return None
return self._generate_new_imatrix(f16_model_path, llama_env, imatrix_path, calibration_file)
def _prompt_for_user_imatrix(self, model_dir: Path, imatrix_path: Path) -> Path | None:
"""Prompt user for existing imatrix file.
Returns:
Path to user-provided imatrix, or None if not available.
"""
logger.info(f"Model directory: {model_dir}")
logger.info(f"Looking for imatrix file at: {imatrix_path}")
logger.info(
"Tip: You can download pre-computed imatrix files from Bartowski's repositories!"
)
logger.info(
" Example: https://huggingface.co/bartowski/MODEL-NAME-GGUF/resolve/main/MODEL-NAME.imatrix"
)
response = (
input("\n❓ Do you have an imatrix file to place in the model directory? (y/N): ")
.strip()
.lower()
)
if response != "y":
return None
logger.info(f"Please place your imatrix.dat file in: {model_dir}")
input("⏳ Press Enter when you've placed the imatrix.dat file (or Ctrl+C to cancel)...")
if imatrix_path.exists():
file_size = self.fs.get_file_size(imatrix_path)
logger.info(f"Found imatrix file! ({file_size})")
return imatrix_path
logger.warning("No imatrix.dat file found - continuing with automatic generation")
return None
def _get_calibration_file(self) -> Path | None:
"""Get calibration data file for imatrix generation.
Returns:
Path to calibration file, or None if not found.
"""
calibration_file = Path(__file__).parent.parent.parent / "resources" / "imatrix_data.txt"
if not calibration_file.exists():
logger.warning("resources/imatrix_data.txt not found - skipping imatrix generation")
logger.info(
"Download from: https://gist.githubusercontent.com/bartowski1182/"
"eb213dccb3571f863da82e99418f81e8/raw/calibration_datav3.txt"
)
return None
return calibration_file
def _generate_new_imatrix(
self,
f16_model_path: Path,
llama_env: LlamaCppEnvironment,
imatrix_path: Path,
calibration_file: Path,
) -> Path | None:
"""Generate new importance matrix using calibration data.
Returns:
Path to generated imatrix, or None if generation fails.
"""
logger.info("Generating importance matrix (this may take 1-4 hours for large models)...")
logger.info(f"Model: {f16_model_path.name}")
logger.info(f"Calibration: {calibration_file}")
logger.info(f"Output: {imatrix_path}")
# Find imatrix binary
imatrix_binary = self._find_imatrix_binary(llama_env)
if not imatrix_binary:
logger.warning("llama-imatrix binary not found - skipping imatrix generation")
logger.info("Make sure llama-imatrix is in the same directory as llama-quantize")
return None
# Build and execute command
cmd = self._build_imatrix_command(
imatrix_binary, f16_model_path, calibration_file, imatrix_path
)
return self._execute_imatrix_generation(cmd, imatrix_path)
def _build_imatrix_command(
self, binary: Path, model_path: Path, calibration_file: Path, output_path: Path
) -> list[str]:
"""Build imatrix generation command.
Returns:
Command arguments as list.
"""
return [
str(binary),
"-m",
str(model_path),
"-f",
str(calibration_file),
"-o",
str(output_path),
"--process-output",
"--output-frequency",
"10",
"--save-frequency",
"50",
"-t",
"8",
"-c",
"2048",
"-b",
"512",
]
def _execute_imatrix_generation(self, cmd: list[str], imatrix_path: Path) -> Path | None:
"""Execute imatrix generation command with real-time output.
Returns:
Path to generated imatrix file, or None if generation fails.
"""
logger.info(f"Running: {' '.join(cmd)}")
logger.info("Starting imatrix generation... (progress will be shown)")
try:
process = subprocess.Popen(
cmd,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
universal_newlines=True,
bufsize=1,
)
self._stream_imatrix_output(process)
return_code = process.poll()
if return_code == 0:
return self._validate_imatrix_output(imatrix_path)
except KeyboardInterrupt:
logger.info("imatrix generation cancelled by user")
process.terminate()
return None
except Exception as e:
logger.error(f"imatrix generation failed with exception: {e}")
return None
else:
logger.error(f"imatrix generation failed with return code {return_code}")
return None
def _stream_imatrix_output(self, process: subprocess.Popen) -> None:
"""Stream imatrix generation output in real-time."""
while True:
if process.stdout is not None:
output = process.stdout.readline()
else:
break
if not output and process.poll() is not None:
break
if output:
line = output.strip()
if self._should_log_imatrix_line(line):
logger.info(line)
def _should_log_imatrix_line(self, line: str) -> bool:
"""Determine if imatrix output line should be logged.
Returns:
True if line should be logged, False otherwise.
"""
keywords = ["Computing imatrix", "perplexity:", "save_imatrix", "entries =", "ETA"]
return any(keyword in line for keyword in keywords) or line.startswith("[")
def _validate_imatrix_output(self, imatrix_path: Path) -> Path | None:
"""Validate generated imatrix file.
Returns:
Path to imatrix if valid, None otherwise.
"""
if imatrix_path.exists():
file_size = self.fs.get_file_size(imatrix_path)
logger.info(f"imatrix generation successful! ({file_size})")
return imatrix_path
logger.error("imatrix generation completed but file not found")
return None
def _find_imatrix_binary(self, llama_env: LlamaCppEnvironment) -> Path | None:
"""Find llama-imatrix binary in common locations.
Searches for the imatrix binary in the current directory and
standard installation paths.
Returns:
Path to imatrix binary, or None if not found.
"""
candidates = [
Path("./llama-imatrix"),
llama_env.quantise_binary.parent / "llama-imatrix",
Path("/usr/local/bin/llama-imatrix"),
Path("/usr/bin/llama-imatrix"),
]
for candidate in candidates:
if candidate.exists() and candidate.is_file():
return candidate
return None
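
A sketch of the environment and imatrix flow; the F16 model path is a placeholder, and generation is skipped unless calibration data and a llama-imatrix binary are available:

# Sketch: resolve llama.cpp tooling, then attempt imatrix generation for one model.
# The model name and F16 path are placeholders.
from pathlib import Path

from helpers.services.llama_cpp import EnvironmentManager, IMatrixGenerator

work_dir = Path.cwd() / "quantisation_work"
llama_env = EnvironmentManager(work_dir).setup()

model_dir = work_dir / "models" / "example-model"
f16_model = model_dir / "example-model-f16.gguf"

imatrix = IMatrixGenerator().generate_imatrix(f16_model, llama_env, model_dir)
print(imatrix if imatrix else "imatrix generation skipped or failed")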

@@ -0,0 +1,397 @@
"""Quantisation orchestration service.
High-level orchestration of the complete quantisation workflow from model
acquisition through processing to upload. Manages parallel processing,
status tracking, and cleanup operations for efficient resource utilisation.
"""
from __future__ import annotations
from concurrent.futures import Future, ThreadPoolExecutor
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any
from helpers.config.quantisation_configs import QUANTISATION_CONFIGS, SUPPORTED_QUANTISATION_TYPES
from helpers.logger import logger
from helpers.models.quantisation import (
ModelSource,
QuantisationContext,
QuantisationResult,
QuantisationType,
)
from helpers.services.huggingface import ReadmeGenerator
from helpers.services.llama_cpp import EnvironmentManager, IMatrixGenerator
from helpers.services.quantisation import HuggingFaceUploader, ModelManager, QuantisationEngine
from helpers.utils.tensor_mapping import URLParser
@dataclass(slots=True)
class QuantisationOrchestrator:
"""Orchestrates the complete quantisation workflow.
Uses dataclass with slots for efficient memory usage and dependency injection
for modular service interaction following SOLID principles.
"""
work_dir: Path = field(default_factory=lambda: Path.cwd() / "quantisation_work")
use_imatrix: bool = True
imatrix_base: str = "Q4_K_M"
no_upload: bool = False
# Service dependencies with factory defaults
url_parser: URLParser = field(default_factory=URLParser)
quantisation_engine: QuantisationEngine = field(default_factory=QuantisationEngine)
imatrix_generator: IMatrixGenerator = field(default_factory=IMatrixGenerator)
readme_generator: ReadmeGenerator = field(default_factory=ReadmeGenerator)
uploader: HuggingFaceUploader = field(default_factory=HuggingFaceUploader)
# Computed properties
models_dir: Path = field(init=False)
environment_manager: EnvironmentManager = field(init=False)
model_manager: ModelManager = field(init=False)
def __post_init__(self) -> None:
"""Initialise computed properties after dataclass construction."""
self.models_dir = self.work_dir / "models"
self.environment_manager = EnvironmentManager(self.work_dir)
self.model_manager = ModelManager(self.models_dir, self.environment_manager)
def quantise(self, url: str) -> dict[QuantisationType, QuantisationResult]:
"""Main quantisation workflow orchestrating model processing from URL to upload.
Returns:
dict[QuantisationType, QuantisationResult]: Quantisation results for each type.
"""
logger.info("Starting Bartowski quantisation process...")
# Setup and preparation
model_source, llama_env, f16_model_path, imatrix_path, output_repo = (
self._setup_environment(url)
)
# Create initial repository
self._create_initial_repository(model_source, output_repo)
# Execute all quantisations
results = self._execute_quantisations(
model_source, llama_env, f16_model_path, imatrix_path, output_repo
)
# Cleanup
self._cleanup_files(f16_model_path, model_source)
self._print_completion_summary(model_source, results, output_repo)
return results
def _setup_environment(self, url: str) -> tuple[ModelSource, Any, Path, Path | None, str]:
"""Setup environment and prepare model for quantisation.
Returns:
Tuple of (model_source, llama_env, f16_model_path, imatrix_path, output_repo).
"""
model_source = self.url_parser.parse(url)
self._print_model_info(model_source)
self.models_dir.mkdir(parents=True, exist_ok=True)
llama_env = self.environment_manager.setup()
f16_model_path = self.model_manager.prepare_model(model_source, llama_env)
imatrix_path = None
if self.use_imatrix:
logger.info("Generating importance matrix (imatrix)...")
imatrix_path = self.imatrix_generator.generate_imatrix(
f16_model_path, llama_env, self.models_dir / model_source.model_name
)
output_repo = (
f"{self.uploader.get_username()}/"
f"{model_source.original_author}-{model_source.model_name}-GGUF"
)
return model_source, llama_env, f16_model_path, imatrix_path, output_repo
def _create_initial_repository(self, model_source: ModelSource, output_repo: str) -> None:
"""Create initial repository with planned quantisations."""
logger.info("Creating initial README with planned quantisations...")
planned_results = {
qt: QuantisationResult(quantisation_type=qt, success=False, status="planned")
for qt in SUPPORTED_QUANTISATION_TYPES
}
readme_path = self.readme_generator.generate(
model_source, planned_results, self.models_dir, output_repo
)
if not self.no_upload:
logger.info("Creating repository with planned quantisations...")
self.uploader.upload_readme(output_repo, readme_path)
else:
logger.info("Skipping repository creation (--no-upload specified)")
def _execute_quantisations(
self,
model_source: ModelSource,
llama_env: Any,
f16_model_path: Path,
imatrix_path: Path | None,
output_repo: str,
) -> dict[QuantisationType, QuantisationResult]:
"""Execute all quantisation types with parallel uploads.
Returns:
dict[QuantisationType, QuantisationResult]: Quantisation results for each type.
"""
results: dict[QuantisationType, QuantisationResult] = {}
upload_futures: list[Future[None]] = []
with ThreadPoolExecutor(max_workers=1, thread_name_prefix="uploader") as upload_executor:
for quant_type in SUPPORTED_QUANTISATION_TYPES:
result = self._process_single_quantisation(
quant_type,
model_source,
llama_env,
f16_model_path,
imatrix_path,
output_repo,
results,
upload_executor,
upload_futures,
)
results[quant_type] = result
self._wait_for_uploads(upload_futures)
return results
def _process_single_quantisation(
self,
quant_type: QuantisationType,
model_source: ModelSource,
llama_env: Any,
f16_model_path: Path,
imatrix_path: Path | None,
output_repo: str,
results: dict[QuantisationType, QuantisationResult],
upload_executor: ThreadPoolExecutor,
upload_futures: list,
) -> QuantisationResult:
"""Process a single quantisation type.
Returns:
QuantisationResult: Result of the quantisation attempt.
"""
try:
logger.info(f"Starting {quant_type.value} quantisation...")
config = QUANTISATION_CONFIGS[quant_type]
# Update status to processing
result = QuantisationResult(quantisation_type=quant_type, success=False)
result.status = "processing"
results[quant_type] = result
self._update_readme_status(model_source, results, output_repo)
# Perform quantisation
context = QuantisationContext(
f16_model_path=f16_model_path,
model_source=model_source,
config=config,
llama_env=llama_env,
models_dir=self.models_dir,
imatrix_path=imatrix_path,
base_quant=self.imatrix_base,
)
result = self.quantisation_engine.quantise(context)
self._handle_quantisation_result(
result,
quant_type,
model_source,
results,
output_repo,
upload_executor,
upload_futures,
)
except Exception as e:
return self._handle_quantisation_error(
e, quant_type, model_source, results, output_repo
)
else:
return result
def _handle_quantisation_result(
self,
result: QuantisationResult,
quant_type: QuantisationType,
model_source: ModelSource,
results: dict[QuantisationType, QuantisationResult],
output_repo: str,
upload_executor: ThreadPoolExecutor,
upload_futures: list,
) -> None:
"""Handle successful or failed quantisation result."""
if result.success and result.file_path:
quant_str = getattr(result.quantisation_type, "value", result.quantisation_type)
logger.info(f"Starting parallel upload of {quant_str}...")
upload_future = upload_executor.submit(
self._upload_and_cleanup,
output_repo,
result.file_path,
quant_type,
model_source,
results,
)
upload_futures.append(upload_future)
result.file_path = None # Mark as being uploaded
result.status = "uploading"
else:
result.status = "failed"
self._update_readme_status(model_source, results, output_repo)
def _handle_quantisation_error(
self,
error: Exception,
quant_type: QuantisationType,
model_source: ModelSource,
results: dict[QuantisationType, QuantisationResult],
output_repo: str,
) -> QuantisationResult:
"""Handle quantisation processing error.
Returns:
QuantisationResult: Failed quantisation result with error information.
"""
logger.error(f"Error processing {quant_type.value}: {error}")
result = QuantisationResult(quantisation_type=quant_type, success=False)
result.status = "failed"
result.error_message = str(error)
try:
self._update_readme_status(model_source, results, output_repo)
except Exception as readme_error:
logger.error(f"Failed to update README after error: {readme_error}")
return result
def _update_readme_status(
self,
model_source: ModelSource,
results: dict[QuantisationType, QuantisationResult],
output_repo: str,
) -> None:
"""Update README with current quantisation status."""
if not self.no_upload:
updated_readme_path = self.readme_generator.generate(
model_source, results, self.models_dir, output_repo
)
self.uploader.upload_readme(output_repo, updated_readme_path)
def _wait_for_uploads(self, upload_futures: list) -> None:
"""Wait for all parallel uploads to complete."""
logger.info("Waiting for any remaining uploads to complete...")
for future in upload_futures:
try:
future.result(timeout=300) # 5 minute timeout per upload
except Exception as e:
logger.warning(f"Upload error: {e}")
def _cleanup_files(self, f16_model_path: Path, model_source: ModelSource) -> None:
"""Clean up temporary files after processing."""
if f16_model_path.exists():
logger.info(f"Removing F16 model {f16_model_path.name} to save disk space...")
f16_model_path.unlink()
if not model_source.is_gguf_repo:
self._cleanup_original_model(model_source)
def _cleanup_original_model(self, model_source: ModelSource) -> None:
"""Clean up original safetensors/PyTorch files after successful conversion."""
model_dir = self.models_dir / model_source.model_name
pytorch_files = list(model_dir.glob("pytorch_model*.bin"))
if pytorch_files:
logger.info(f"Removing {len(pytorch_files)} PyTorch model files to save disk space...")
for file in pytorch_files:
file.unlink()
logger.info("Keeping config files, tokeniser, and metadata for reference")
def _upload_and_cleanup(
self,
output_repo: str,
file_path: Path,
quant_type: QuantisationType,
model_source: ModelSource,
results: dict[QuantisationType, QuantisationResult],
) -> None:
"""Upload file and clean up (runs in background thread)."""
try:
logger.info(f"[PARALLEL] Uploading {quant_type}...")
self.uploader.upload_model_file(output_repo, file_path)
logger.info(f"[PARALLEL] Removing {file_path.name} to save disk space...")
file_path.unlink()
results[quant_type].status = "completed"
updated_readme_path = self.readme_generator.generate(
model_source, results, self.models_dir, output_repo
)
self.uploader.upload_readme(output_repo, updated_readme_path)
logger.info(f"[PARALLEL] {quant_type} upload and cleanup complete")
except Exception as e:
logger.error(f"[PARALLEL] Failed to upload {quant_type}: {e}")
results[quant_type].status = "failed"
results[quant_type].error_message = str(e)
updated_readme_path = self.readme_generator.generate(
model_source, results, self.models_dir, output_repo
)
self.uploader.upload_readme(output_repo, updated_readme_path)
raise
def _print_model_info(self, model_source: ModelSource) -> None:
"""Print model information."""
logger.info(f"Source URL: {model_source.url}")
logger.info(f"Source model: {model_source.source_model}")
logger.info(f"Original author: {model_source.original_author}")
logger.info(f"Model name: {model_source.model_name}")
logger.info(f"Your HF username: {self.uploader.get_username()}")
logger.info(f"Working directory: {self.work_dir}")
def _print_completion_summary(
self,
model_source: ModelSource,
results: dict[QuantisationType, QuantisationResult],
output_repo: str,
) -> None:
"""Print completion summary."""
successful_results = [r for r in results.values() if r.success]
if successful_results:
logger.info("Complete! Your quantised models are available at:")
logger.info(f" https://huggingface.co/{output_repo}")
logger.info("Model info:")
logger.info(f" - Source URL: {model_source.url}")
logger.info(f" - Original: {model_source.source_model}")
logger.info(
" - Method: "
f"{'Direct GGUF download' if model_source.is_gguf_repo else 'HF model conversion'}"
)
logger.info(f" - Quantised: {output_repo}")
for result in successful_results:
if result.file_size:
filename = (
f"{model_source.original_author}-{model_source.model_name}-"
f"{result.quantisation_type}.gguf"
)
logger.info(f" - {result.quantisation_type}: {filename} ({result.file_size})")
else:
logger.error(
"All quantisations failed - repository created with documentation "
"but no model files"
)
logger.error(f" Repository: https://huggingface.co/{output_repo}")

helpers/services/quantisation.py

@@ -0,0 +1,486 @@
"""Quantisation operations service.
Provides modular quantisation engine, model management, and upload capabilities
for GGUF model processing. Consolidates quantisation logic from various tools
into reusable components following SOLID principles.
"""
from __future__ import annotations
import shutil
import subprocess
from typing import TYPE_CHECKING
from helpers.logger import logger
from helpers.models.quantisation import (
ModelSource,
QuantisationContext,
QuantisationResult,
QuantisationType,
)
from helpers.services.filesystem import FilesystemService
if TYPE_CHECKING:
from pathlib import Path
from helpers.models.quantisation import LlamaCppEnvironment
from helpers.services.llama_cpp import EnvironmentManager
class QuantisationEngine:
"""Handles the actual quantisation process with configurable methods.
Provides flexible quantisation execution supporting multiple tensor
precision configurations, importance matrices, and fallback strategies.
Encapsulates llama-quantize binary interactions with real-time output.
"""
def __init__(self) -> None:
"""Initialise quantisation engine."""
self.fs = FilesystemService()
def quantise(self, context: QuantisationContext) -> QuantisationResult:
"""Perform quantisation using the specified configuration.
Executes quantisation with primary and fallback methods, handling
tensor-specific precision overrides and importance matrix guidance.
Returns:
QuantisationResult with success status and file information.
"""
logger.info(
f"⚙️ Creating {context.config.name} quantisation ({context.config.description})..."
)
output_path = context.get_output_path()
logger.info(f"🎯 Attempting {context.config.name} quantisation...")
logger.info(f"📝 Source: {context.f16_model_path}")
logger.info(f"📝 Target: {output_path}")
# Try primary method
if self._try_quantisation_method(
context, output_path, context.config.tensor_types, "method 1"
):
return self._create_success_result(context.config.name, output_path, "method 1")
# Try fallback methods
for i, fallback_method in enumerate(context.config.fallback_methods, 2):
method_name = f"method {i}"
if self._try_quantisation_method(context, output_path, fallback_method, method_name):
return self._create_success_result(context.config.name, output_path, method_name)
logger.error("All %s quantisation methods failed", context.config.name)
return QuantisationResult(
quantisation_type=QuantisationType(context.config.name),
success=False,
error_message="All quantisation methods failed",
)
def _try_quantisation_method(
self,
context: QuantisationContext,
output_path: Path,
tensor_config: dict[str, str],
method_name: str,
) -> bool:
"""Try a specific quantisation method with real-time output.
Builds and executes llama-quantize command with appropriate parameters,
streaming output for progress monitoring.
Returns:
True if quantisation successful, False otherwise.
"""
logger.info(f"🔍 Trying {method_name}...")
cmd = self._build_quantisation_command(context, output_path, tensor_config)
return self._execute_quantisation_command(cmd, method_name)
def _build_quantisation_command(
self, context: QuantisationContext, output_path: Path, tensor_config: dict[str, str]
) -> list[str]:
"""Build quantisation command with all required parameters.
Returns:
List of command arguments.
"""
cmd = [str(context.llama_env.quantise_binary)]
# Add imatrix if available
if context.imatrix_path and context.imatrix_path.exists():
cmd.extend(["--imatrix", str(context.imatrix_path)])
logger.info(f"🧮 Using imatrix: {context.imatrix_path.name}")
# Add tensor type arguments
self._add_tensor_type_arguments(cmd, tensor_config)
cmd.extend([str(context.f16_model_path), str(output_path), context.base_quant])
return cmd
def _add_tensor_type_arguments(self, cmd: list[str], tensor_config: dict[str, str]) -> None:
"""Add tensor type arguments to command."""
if not tensor_config:
return
for tensor_name, quant_type in tensor_config.items():
if tensor_name.startswith(("token-embedding-type", "output-tensor-type")):
cmd.extend([f"--{tensor_name}", quant_type])
else:
cmd.extend(["--tensor-type", f"{tensor_name}={quant_type}"])
def _execute_quantisation_command(self, cmd: list[str], method_name: str) -> bool:
"""Execute quantisation command with real-time output.
Returns:
True if quantisation successful, False otherwise.
"""
logger.info(f"💻 Running: {' '.join(cmd)}")
logger.info("⏳ Quantisation in progress... (this may take several minutes)")
try:
process = subprocess.Popen(
cmd,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
universal_newlines=True,
bufsize=1,
)
self._stream_quantisation_output(process)
return_code = process.poll()
if return_code == 0:
logger.info(f"{method_name} quantisation successful!")
return True
except Exception as e:
logger.info(f"{method_name} failed with exception: {e}")
return False
else:
logger.info(f"{method_name} failed with return code {return_code}")
return False
def _stream_quantisation_output(self, process: subprocess.Popen) -> None:
"""Stream quantisation output in real-time."""
while True:
if process.stdout is not None:
output = process.stdout.readline()
else:
break
if not output and process.poll() is not None:
break
if output:
logger.info(f"📊 {output.strip()}")
def _create_success_result(
self, quant_type: str, output_path: Path, method_used: str
) -> QuantisationResult:
"""Create successful quantisation result with file metadata.
Returns:
QuantisationResult with file path and size information.
"""
file_size = self.fs.get_file_size(output_path)
return QuantisationResult(
quantisation_type=QuantisationType(quant_type),
success=True,
file_path=output_path,
file_size=file_size,
method_used=method_used,
)
class ModelManager:
"""Handles model downloading and preparation for quantisation.
Manages both GGUF repository downloads and HuggingFace model conversions,
providing unified interface for model acquisition and preparation.
"""
def __init__(self, models_dir: Path, environment_manager: EnvironmentManager) -> None:
"""Initialise model manager with storage and environment configuration.
Sets up model storage directory and links to environment manager for
conversion script access and llama.cpp tool discovery.
"""
self.models_dir = models_dir
self.environment_manager = environment_manager
self.fs = FilesystemService()
def prepare_model(self, model_source: ModelSource, llama_env: LlamaCppEnvironment) -> Path:
"""Prepare model for quantisation and return F16 model path.
Handles both GGUF repository downloads and regular HuggingFace model
conversion workflows with automatic format detection.
Returns:
Path to F16 GGUF model ready for quantisation.
"""
model_dir = self.models_dir / model_source.model_name
if model_source.is_gguf_repo:
return self._handle_gguf_repo(model_source, model_dir)
return self._handle_regular_repo(model_source, model_dir, llama_env)
def _handle_gguf_repo(self, model_source: ModelSource, model_dir: Path) -> Path:
"""Handle GGUF repository download with pattern matching.
Downloads GGUF files matching specified patterns, prioritising
multi-part files and F16 variants.
Returns:
Path to downloaded or existing GGUF file.
"""
logger.info(f"⬇️ Downloading GGUF file from repository: {model_source.source_model}")
logger.info(f"🔍 Looking for file pattern: *{model_source.gguf_file_pattern}*")
f16_model = model_dir / f"{model_source.model_name}-f16.gguf"
if f16_model.exists():
logger.info(f"✅ Found existing F16 file: {f16_model.name}")
return f16_model
# Check for existing GGUF files
model_dir.mkdir(parents=True, exist_ok=True)
existing_gguf = self.fs.find_gguf_files(model_dir)
if existing_gguf:
logger.info(f"✅ Found existing GGUF file: {existing_gguf[0].name}")
return existing_gguf[0]
# Download with patterns
downloaded_file = self._download_gguf_with_patterns(
model_source.source_model, model_source.gguf_file_pattern, model_dir
)
if downloaded_file:
# Handle multi-part files
if "00001-of-" in downloaded_file.name:
return downloaded_file
if "-00002-of-" in downloaded_file.name or "-00003-of-" in downloaded_file.name:
base_name = downloaded_file.name.replace("-00002-of-", "-00001-of-").replace(
"-00003-of-", "-00001-of-"
)
first_part = downloaded_file.parent / base_name
if first_part.exists():
logger.info(f"🔄 Using first part: {first_part.name}")
return first_part
# Rename single file to standard name
downloaded_file.rename(f16_model)
return f16_model
# Fallback to regular conversion
logger.info("💡 Falling back to downloading full repository and converting...")
return self._handle_regular_repo(
ModelSource(**{**model_source.dict(), "is_gguf_repo": False}),
model_dir,
None,
)
def _download_gguf_with_patterns(
self, source_model: str, pattern: str | None, model_dir: Path
) -> Path | None:
"""Download GGUF file using various pattern strategies.
Tries multiple pattern variations to find and download appropriate
GGUF files, handling timeouts and temporary directories.
Returns:
Path to downloaded file, or None if all patterns fail.
"""
if pattern:
patterns = [
f"*{pattern}*",
f"*{pattern.lower()}*",
f"*{pattern.upper()}*",
"*f16*",
"*F16*",
"*fp16*",
]
else:
patterns = ["*f16*", "*F16*", "*fp16*"]
temp_dir = model_dir / "gguf_temp"
for search_pattern in patterns:
logger.info(f"🔍 Trying pattern: {search_pattern}")
temp_dir.mkdir(exist_ok=True)
try:
subprocess.run(
[
"timeout",
"300",
"huggingface-cli",
"download",
source_model,
"--include",
search_pattern,
"--local-dir",
str(temp_dir),
],
check=True,
capture_output=True,
)
# Find downloaded GGUF files
gguf_files = self.fs.find_gguf_files(temp_dir, pattern)
if gguf_files:
found_file = gguf_files[0]
logger.info(f"✅ Found GGUF file: {found_file.name}")
# Move to parent directory
final_path = model_dir / found_file.name
shutil.move(str(found_file), str(final_path))
shutil.rmtree(temp_dir)
return final_path
except subprocess.CalledProcessError:
logger.info(f"⚠️ Pattern {search_pattern} failed or timed out")
continue
finally:
if temp_dir.exists():
shutil.rmtree(temp_dir, ignore_errors=True)
return None
def _handle_regular_repo(
self,
model_source: ModelSource,
model_dir: Path,
llama_env: LlamaCppEnvironment | None,
) -> Path:
"""Handle regular HuggingFace repository conversion.
Downloads full model repository and converts to F16 GGUF format
using llama.cpp conversion scripts.
Returns:
Path to converted F16 GGUF model.
"""
logger.info(f"⬇️ Downloading source model: {model_source.source_model}")
if not model_dir.exists():
subprocess.run(
[
"huggingface-cli",
"download",
model_source.source_model,
"--local-dir",
str(model_dir),
],
check=True,
)
else:
logger.info("✅ Model already downloaded")
logger.info("🔄 Converting to GGUF F16 format...")
f16_model = model_dir / f"{model_source.model_name}-f16.gguf"
if not f16_model.exists():
if not llama_env:
llama_env = self.environment_manager.setup()
# Ensure conversion script is available
if llama_env.use_repo or not self.environment_manager.llama_cpp_dir.exists():
logger.info("Getting conversion script from llama.cpp repository...")
llama_env = self.environment_manager.setup_repository()
subprocess.run(
[
*llama_env.convert_script.split(),
str(model_dir),
"--outtype",
"f16",
"--outfile",
str(f16_model),
],
check=True,
)
else:
logger.info("✅ F16 model already exists")
return f16_model
class HuggingFaceUploader:
"""Handles uploading models and documentation to HuggingFace.
Provides methods for repository creation, file uploads, and README
updates with proper error handling and retry logic.
"""
@staticmethod
def get_username() -> str:
"""Get authenticated HuggingFace username.
Returns:
HuggingFace username from CLI authentication.
Raises:
RuntimeError: If not authenticated.
"""
try:
result = subprocess.run(
["huggingface-cli", "whoami"],
capture_output=True,
text=True,
check=True,
)
return result.stdout.strip()
except (subprocess.CalledProcessError, FileNotFoundError) as err:
msg = "Please log in to HuggingFace first: huggingface-cli login"
raise RuntimeError(msg) from err
def upload_readme(self, output_repo: str, readme_path: Path) -> None:
"""Upload or update README file to repository.
Creates repository if needed, handles existing repository updates.
"""
logger.info("Uploading README...")
try:
subprocess.run(
[
"huggingface-cli",
"upload",
output_repo,
str(readme_path),
"README.md",
"--create",
],
check=True,
capture_output=True,
)
logger.info("README uploaded")
except subprocess.CalledProcessError:
# Repository exists, update without --create
subprocess.run(
[
"huggingface-cli",
"upload",
output_repo,
str(readme_path),
"README.md",
],
check=True,
)
logger.info("README updated")
def upload_model_file(self, output_repo: str, model_path: Path) -> None:
"""Upload model file to repository.
Uploads GGUF model file to specified repository path.
"""
logger.info(f"Uploading {model_path.name}...")
subprocess.run(
[
"huggingface-cli",
"upload",
output_repo,
str(model_path),
model_path.name,
],
check=True,
)
logger.info(f"{model_path.name} uploaded")