Initial commit
This commit is contained in:
commit
ef7df1a8c3
28 changed files with 6829 additions and 0 deletions
20
helpers/services/__init__.py
Normal file
20
helpers/services/__init__.py
Normal file
|
@ -0,0 +1,20 @@
|
|||
"""Service layer for llm-gguf-tools.
|
||||
|
||||
Provides high-level service interfaces for interacting with external systems
|
||||
including HuggingFace, llama.cpp, and filesystem operations. Uses UK English
|
||||
spelling conventions throughout.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from helpers.services.filesystem import FilesystemService
|
||||
from helpers.services.huggingface import HuggingFaceService, ReadmeGenerator
|
||||
from helpers.services.llama_cpp import EnvironmentManager, IMatrixGenerator
|
||||
|
||||
__all__ = [
|
||||
"EnvironmentManager",
|
||||
"FilesystemService",
|
||||
"HuggingFaceService",
|
||||
"IMatrixGenerator",
|
||||
"ReadmeGenerator",
|
||||
]
|
174
helpers/services/filesystem.py
Normal file
174
helpers/services/filesystem.py
Normal file
|
@ -0,0 +1,174 @@
|
|||
"""Filesystem operations service.
|
||||
|
||||
Provides unified filesystem operations including file discovery, size
|
||||
calculation, and path management. Consolidates common filesystem patterns
|
||||
used across quantisation and conversion workflows.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from helpers.logger import logger
|
||||
|
||||
BYTES_PER_UNIT = 1024.0
|
||||
|
||||
|
||||
class FilesystemService:
|
||||
"""Handles filesystem operations with consistent error handling.
|
||||
|
||||
Provides methods for file discovery, size formatting, and JSON loading
|
||||
with proper error handling and logging. Ensures consistent behaviour
|
||||
across different tools and workflows.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def get_file_size(file_path: Path) -> str:
|
||||
"""Get human-readable file size using system utilities.
|
||||
|
||||
Attempts to use `du -h` for human-readable output, falling back to
|
||||
Python calculation if the system command fails. Provides consistent
|
||||
size formatting across the toolset.
|
||||
|
||||
Returns:
|
||||
Human-readable file size string (e.g., "1.5G", "750M").
|
||||
"""
|
||||
try:
|
||||
result = subprocess.run(
|
||||
["du", "-h", str(file_path)], capture_output=True, text=True, check=True
|
||||
)
|
||||
return result.stdout.split()[0]
|
||||
except (subprocess.CalledProcessError, FileNotFoundError):
|
||||
# Fallback to Python calculation
|
||||
|
||||
try:
|
||||
size_bytes: float = float(file_path.stat().st_size)
|
||||
for unit in ["B", "K", "M", "G", "T"]:
|
||||
if size_bytes < BYTES_PER_UNIT:
|
||||
return f"{size_bytes:.1f}{unit}"
|
||||
size_bytes /= BYTES_PER_UNIT
|
||||
except Exception:
|
||||
return "Unknown"
|
||||
else:
|
||||
return f"{size_bytes:.1f}P"
|
||||
|
||||
@staticmethod
|
||||
def load_json_config(config_path: Path) -> dict[str, Any]:
|
||||
"""Load and parse JSON configuration file.
|
||||
|
||||
Provides consistent JSON loading with proper error handling and
|
||||
encoding specification. Used for loading model configurations,
|
||||
tokeniser settings, and other JSON-based metadata.
|
||||
|
||||
Returns:
|
||||
Parsed JSON content as dictionary.
|
||||
|
||||
Raises:
|
||||
FileNotFoundError: If config file doesn't exist.
|
||||
"""
|
||||
if not config_path.exists():
|
||||
msg = f"Configuration file not found: {config_path}"
|
||||
raise FileNotFoundError(msg)
|
||||
|
||||
with Path(config_path).open(encoding="utf-8") as f:
|
||||
return json.load(f)
|
||||
|
||||
@staticmethod
|
||||
def find_safetensor_files(model_path: Path) -> list[Path]:
|
||||
"""Find all SafeTensor files in model directory using priority search.
|
||||
|
||||
Searches for tensor files in order of preference: single model.safetensors,
|
||||
sharded model-*-of-*.safetensors files, then any *.safetensors files. This
|
||||
approach handles both single-file and multi-shard model distributions whilst
|
||||
ensuring predictable file ordering for conversion consistency.
|
||||
|
||||
Returns:
|
||||
List of SafeTensor file paths in priority order.
|
||||
|
||||
Raises:
|
||||
FileNotFoundError: If no SafeTensor files are found.
|
||||
"""
|
||||
# Check for single file
|
||||
single_file = model_path / "model.safetensors"
|
||||
if single_file.exists():
|
||||
return [single_file]
|
||||
|
||||
# Check for sharded files
|
||||
pattern = "model-*-of-*.safetensors"
|
||||
sharded_files = sorted(model_path.glob(pattern))
|
||||
if sharded_files:
|
||||
return sharded_files
|
||||
|
||||
# Check for any safetensor files
|
||||
any_files = sorted(model_path.glob("*.safetensors"))
|
||||
if any_files:
|
||||
return any_files
|
||||
|
||||
msg = f"No SafeTensor files found in {model_path}"
|
||||
raise FileNotFoundError(msg)
|
||||
|
||||
@staticmethod
|
||||
def find_gguf_files(model_path: Path, pattern: str | None = None) -> list[Path]:
|
||||
"""Find GGUF files in directory, optionally filtered by pattern.
|
||||
|
||||
Searches for GGUF files with optional pattern matching. Prioritises
|
||||
multi-part files (00001-of-*) over single files for proper handling
|
||||
of large models split across multiple files.
|
||||
|
||||
Returns:
|
||||
List of GGUF file paths, sorted with multi-part files first.
|
||||
"""
|
||||
if pattern:
|
||||
gguf_files = list(model_path.glob(f"*{pattern}*.gguf"))
|
||||
else:
|
||||
gguf_files = list(model_path.glob("*.gguf"))
|
||||
|
||||
# Sort to prioritise 00001-of-* files
|
||||
gguf_files.sort(
|
||||
key=lambda x: (
|
||||
"00001-of-" not in x.name, # False sorts before True
|
||||
x.name,
|
||||
)
|
||||
)
|
||||
|
||||
return gguf_files
|
||||
|
||||
@staticmethod
|
||||
def ensure_directory(path: Path) -> Path:
|
||||
"""Ensure directory exists, creating if necessary.
|
||||
|
||||
Creates directory and all parent directories if they don't exist.
|
||||
Returns the path for method chaining convenience.
|
||||
|
||||
Returns:
|
||||
The directory path.
|
||||
"""
|
||||
path.mkdir(parents=True, exist_ok=True)
|
||||
return path
|
||||
|
||||
@staticmethod
|
||||
def cleanup_directory(path: Path, pattern: str = "*") -> int:
|
||||
"""Remove files matching pattern from directory.
|
||||
|
||||
Safely removes files matching the specified glob pattern. Returns
|
||||
count of files removed for logging purposes.
|
||||
|
||||
Returns:
|
||||
Number of files removed.
|
||||
"""
|
||||
if not path.exists():
|
||||
return 0
|
||||
|
||||
files_removed = 0
|
||||
for file_path in path.glob(pattern):
|
||||
if file_path.is_file():
|
||||
try:
|
||||
file_path.unlink()
|
||||
files_removed += 1
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to remove {file_path}: {e}")
|
||||
|
||||
return files_removed
|
210
helpers/services/gguf.py
Normal file
210
helpers/services/gguf.py
Normal file
|
@ -0,0 +1,210 @@
|
|||
"""GGUF file operations service.
|
||||
|
||||
Provides unified interface for creating, writing, and manipulating GGUF files.
|
||||
Consolidates GGUF-specific operations from conversion and quantisation workflows.
|
||||
Uses UK English spelling conventions throughout.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
import gguf
|
||||
import torch
|
||||
from safetensors import safe_open
|
||||
|
||||
from helpers.logger import logger
|
||||
from helpers.services.filesystem import FilesystemService
|
||||
from helpers.utils.config_parser import ConfigParser
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
|
||||
from helpers.models.conversion import ModelConfig
|
||||
|
||||
|
||||
class GGUFWriter:
|
||||
"""Manages GGUF file creation and metadata writing.
|
||||
|
||||
Provides high-level interface for GGUF file operations including metadata
|
||||
configuration, tensor addition, and tokeniser integration. Encapsulates
|
||||
low-level GGUF library interactions for consistent error handling.
|
||||
"""
|
||||
|
||||
def __init__(self, output_path: Path, architecture: str) -> None:
|
||||
"""Initialise GGUF writer with output path and architecture.
|
||||
|
||||
Creates the underlying GGUF writer instance and prepares for metadata
|
||||
and tensor addition. Sets up the file structure for the specified
|
||||
model architecture.
|
||||
"""
|
||||
self.output_path = output_path
|
||||
self.architecture = architecture
|
||||
self.writer = gguf.GGUFWriter(str(output_path), architecture)
|
||||
logger.info(f"Created GGUF writer for {architecture} architecture")
|
||||
|
||||
def add_metadata(self, model_config: ModelConfig, model_name: str) -> None:
|
||||
"""Add comprehensive metadata from model configuration.
|
||||
|
||||
Writes general model information, architectural parameters, and
|
||||
quantisation settings to the GGUF file header. Handles both standard
|
||||
and vision model configurations with appropriate parameter mapping.
|
||||
"""
|
||||
# General metadata
|
||||
self.writer.add_name(model_name)
|
||||
self.writer.add_description(f"Converted from {model_config.architectures[0]}")
|
||||
self.writer.add_file_type(gguf.LlamaFileType.ALL_F32)
|
||||
|
||||
# Model parameters from config
|
||||
params = model_config.to_gguf_params()
|
||||
self.writer.add_context_length(params.context_length)
|
||||
self.writer.add_embedding_length(params.embedding_length)
|
||||
self.writer.add_block_count(params.block_count)
|
||||
self.writer.add_feed_forward_length(params.feed_forward_length)
|
||||
self.writer.add_head_count(params.attention_head_count)
|
||||
self.writer.add_head_count_kv(params.attention_head_count_kv)
|
||||
self.writer.add_layer_norm_rms_eps(params.attention_layer_norm_rms_epsilon)
|
||||
self.writer.add_rope_freq_base(params.rope_freq_base)
|
||||
self.writer.add_rope_dimension_count(params.rope_dimension_count)
|
||||
|
||||
logger.info(f"Added metadata: {params.block_count} layers, {params.context_length} context")
|
||||
|
||||
def add_vision_metadata(self, vision_config: Any) -> None:
|
||||
"""Add vision model parameters to GGUF metadata.
|
||||
|
||||
Configures vision-specific parameters for multimodal models including
|
||||
embedding dimensions, attention heads, and spatial processing settings.
|
||||
"""
|
||||
if not vision_config:
|
||||
return
|
||||
|
||||
logger.info("Adding vision model parameters...")
|
||||
self.writer.add_vision_embedding_length(vision_config.hidden_size)
|
||||
self.writer.add_vision_block_count(vision_config.num_hidden_layers)
|
||||
self.writer.add_vision_head_count(vision_config.num_attention_heads)
|
||||
self.writer.add_vision_feed_forward_length(vision_config.intermediate_size)
|
||||
self.writer.add_vision_patch_size(vision_config.patch_size)
|
||||
self.writer.add_vision_spatial_merge_size(vision_config.spatial_merge_size)
|
||||
|
||||
if hasattr(vision_config, "rms_norm_eps") and vision_config.rms_norm_eps:
|
||||
self.writer.add_vision_attention_layernorm_eps(vision_config.rms_norm_eps)
|
||||
|
||||
def add_tokeniser(self, tokeniser_config: dict[str, Any]) -> None:
|
||||
"""Add tokeniser metadata to GGUF file.
|
||||
|
||||
Writes special token IDs and tokeniser model type to enable proper
|
||||
text processing during inference. Uses sensible defaults for missing
|
||||
configuration values.
|
||||
"""
|
||||
self.writer.add_bos_token_id(tokeniser_config.get("bos_token_id", 1))
|
||||
self.writer.add_eos_token_id(tokeniser_config.get("eos_token_id", 2))
|
||||
self.writer.add_unk_token_id(tokeniser_config.get("unk_token_id", 0))
|
||||
self.writer.add_pad_token_id(tokeniser_config.get("pad_token_id", 0))
|
||||
self.writer.add_tokenizer_model(tokeniser_config.get("model_type", "llama"))
|
||||
|
||||
logger.info("Added tokeniser configuration")
|
||||
|
||||
def add_tensor(self, name: str, data: np.ndarray) -> None:
|
||||
"""Add a tensor to the GGUF file.
|
||||
|
||||
Writes tensor data with the specified name to the file. Handles
|
||||
data type conversions and validates tensor shapes.
|
||||
"""
|
||||
self.writer.add_tensor(name, data)
|
||||
|
||||
def finalise(self) -> None:
|
||||
"""Write all data to file and close writer.
|
||||
|
||||
Completes the GGUF file creation by writing headers, key-value data,
|
||||
and tensor data in the correct order. Ensures proper file closure.
|
||||
"""
|
||||
logger.info(f"Writing GGUF file to {self.output_path}")
|
||||
self.writer.write_header_to_file()
|
||||
self.writer.write_kv_data_to_file()
|
||||
self.writer.write_tensors_to_file()
|
||||
self.writer.close()
|
||||
logger.info("GGUF file written successfully")
|
||||
|
||||
|
||||
class GGUFConverter:
|
||||
"""High-level GGUF conversion orchestrator.
|
||||
|
||||
Coordinates the complete conversion workflow from source models to GGUF
|
||||
format, managing metadata extraction, tensor mapping, and file writing.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def convert_safetensors(
|
||||
model_path: Path,
|
||||
output_path: Path,
|
||||
model_config: ModelConfig,
|
||||
architecture: str,
|
||||
tensor_mapper: Any,
|
||||
) -> bool:
|
||||
"""Convert SafeTensors model to GGUF format.
|
||||
|
||||
Orchestrates the conversion process including metadata setup, tensor
|
||||
loading with BFloat16 support, name mapping, and tokeniser integration.
|
||||
|
||||
Returns:
|
||||
True if conversion successful, False otherwise.
|
||||
"""
|
||||
logger.info(f"Converting {model_path.name} to GGUF...")
|
||||
|
||||
# Create writer
|
||||
writer_wrapper = GGUFWriter(output_path, architecture)
|
||||
|
||||
# Add metadata
|
||||
writer_wrapper.add_metadata(model_config, model_path.name)
|
||||
|
||||
# Add vision metadata if present
|
||||
if model_config.vision_config:
|
||||
writer_wrapper.add_vision_metadata(model_config.vision_config)
|
||||
|
||||
# Load and add tensors
|
||||
fs = FilesystemService()
|
||||
tensor_files = fs.find_safetensor_files(model_path)
|
||||
logger.info(f"Found {len(tensor_files)} tensor file(s)")
|
||||
|
||||
tensor_count = 0
|
||||
for tensor_file in tensor_files:
|
||||
logger.info(f"Loading {tensor_file.name}...")
|
||||
with safe_open(tensor_file, framework="pt") as f:
|
||||
for tensor_name in f:
|
||||
tensor_data = f.get_tensor(tensor_name)
|
||||
|
||||
# Convert BFloat16 to Float32
|
||||
if hasattr(tensor_data, "numpy"):
|
||||
if torch and tensor_data.dtype == torch.bfloat16:
|
||||
tensor_data = tensor_data.float()
|
||||
tensor_data = tensor_data.numpy()
|
||||
|
||||
# Map tensor name
|
||||
gguf_name = tensor_mapper.map_tensor_name(tensor_name)
|
||||
|
||||
if gguf_name:
|
||||
writer_wrapper.add_tensor(gguf_name, tensor_data)
|
||||
tensor_count += 1
|
||||
|
||||
if tensor_count % 100 == 0:
|
||||
logger.info(f" Processed {tensor_count} tensors...")
|
||||
|
||||
logger.info(f"Total tensors processed: {tensor_count}")
|
||||
|
||||
# Add tokeniser
|
||||
try:
|
||||
tok_config = ConfigParser.load_tokeniser_config(model_path)
|
||||
writer_wrapper.add_tokeniser(tok_config)
|
||||
logger.info("Tokeniser added")
|
||||
except Exception as e:
|
||||
logger.warning(f"Could not add tokeniser: {e}")
|
||||
|
||||
# Finalise file
|
||||
writer_wrapper.finalise()
|
||||
|
||||
file_size = fs.get_file_size(output_path)
|
||||
logger.info(f"Conversion complete! Output: {output_path} ({file_size})")
|
||||
|
||||
return True
|
454
helpers/services/huggingface.py
Normal file
454
helpers/services/huggingface.py
Normal file
|
@ -0,0 +1,454 @@
|
|||
"""HuggingFace operations service.
|
||||
|
||||
Handles all interactions with HuggingFace including model downloads,
|
||||
uploads, README generation, and repository management. Uses UK English
|
||||
spelling conventions throughout.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import subprocess
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from helpers.logger import logger
|
||||
from helpers.models.quantisation import QuantisationType
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from helpers.models.quantisation import ModelSource, QuantisationResult
|
||||
|
||||
|
||||
class HuggingFaceService:
|
||||
"""Manages HuggingFace repository operations.
|
||||
|
||||
Provides methods for downloading models, uploading files, and managing
|
||||
repositories. Handles authentication, error recovery, and progress tracking
|
||||
for robust interaction with HuggingFace services.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def get_username() -> str:
|
||||
"""Get authenticated HuggingFace username.
|
||||
|
||||
Retrieves the current user's HuggingFace username using the CLI.
|
||||
Requires prior authentication via `huggingface-cli login`.
|
||||
|
||||
Returns:
|
||||
HuggingFace username.
|
||||
|
||||
Raises:
|
||||
RuntimeError: If not authenticated or CLI not available.
|
||||
"""
|
||||
try:
|
||||
result = subprocess.run(
|
||||
["huggingface-cli", "whoami"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
check=True,
|
||||
)
|
||||
return result.stdout.strip()
|
||||
except (subprocess.CalledProcessError, FileNotFoundError) as err:
|
||||
msg = "Please log in to HuggingFace first: huggingface-cli login"
|
||||
raise RuntimeError(msg) from err
|
||||
|
||||
@staticmethod
|
||||
def download_model(
|
||||
model_name: str, output_dir: Path, include_pattern: str | None = None
|
||||
) -> None:
|
||||
"""Download model from HuggingFace.
|
||||
|
||||
Downloads a complete model or specific files matching a pattern.
|
||||
Creates the output directory if it doesn't exist. Supports filtered
|
||||
downloads for efficient bandwidth usage when only certain files are needed.
|
||||
"""
|
||||
logger.info(f"Downloading {model_name} to {output_dir}")
|
||||
|
||||
cmd = [
|
||||
"huggingface-cli",
|
||||
"download",
|
||||
model_name,
|
||||
"--local-dir",
|
||||
str(output_dir),
|
||||
]
|
||||
|
||||
if include_pattern:
|
||||
cmd.extend(["--include", include_pattern])
|
||||
|
||||
subprocess.run(cmd, check=True)
|
||||
logger.info("Download complete")
|
||||
|
||||
@staticmethod
|
||||
def upload_file(
|
||||
repo_id: str,
|
||||
local_path: Path,
|
||||
repo_path: str | None = None,
|
||||
create_repo: bool = False,
|
||||
) -> None:
|
||||
"""Upload a file to HuggingFace repository.
|
||||
|
||||
Uploads a single file to the specified repository path. Can create
|
||||
the repository if it doesn't exist. Handles repository creation conflicts
|
||||
gracefully by retrying without the create flag when needed.
|
||||
|
||||
Raises:
|
||||
CalledProcessError: If upload fails.
|
||||
"""
|
||||
repo_path = repo_path or local_path.name
|
||||
logger.info(f"Uploading {local_path.name} to {repo_id}/{repo_path}")
|
||||
|
||||
cmd = [
|
||||
"huggingface-cli",
|
||||
"upload",
|
||||
repo_id,
|
||||
str(local_path),
|
||||
repo_path,
|
||||
]
|
||||
|
||||
if create_repo:
|
||||
cmd.append("--create")
|
||||
|
||||
try:
|
||||
subprocess.run(cmd, check=True, capture_output=True)
|
||||
logger.info(f"Uploaded {repo_path}")
|
||||
except subprocess.CalledProcessError:
|
||||
if create_repo:
|
||||
# Repository might already exist, retry without --create
|
||||
cmd = cmd[:-1] # Remove --create flag
|
||||
subprocess.run(cmd, check=True)
|
||||
logger.info(f"Updated {repo_path}")
|
||||
else:
|
||||
raise
|
||||
|
||||
|
||||
class ReadmeGenerator:
|
||||
"""Generates README files for quantised models.
|
||||
|
||||
Creates comprehensive README documentation including model cards,
|
||||
quantisation details, and status tracking. Supports both initial
|
||||
planning documentation and final result summaries.
|
||||
"""
|
||||
|
||||
def generate(
|
||||
self,
|
||||
model_source: ModelSource,
|
||||
results: dict[QuantisationType, QuantisationResult],
|
||||
models_dir: Path,
|
||||
output_repo: str | None = None,
|
||||
) -> Path:
|
||||
"""Generate README file for quantised model repository.
|
||||
|
||||
Creates a comprehensive README with frontmatter, quantisation table,
|
||||
and original model information. Handles status tracking for planned,
|
||||
processing, and completed quantisations.
|
||||
|
||||
Returns:
|
||||
Path to generated README file.
|
||||
"""
|
||||
logger.info("Creating model card...")
|
||||
|
||||
model_dir = models_dir / model_source.model_name
|
||||
readme_path = model_dir / "README.md"
|
||||
|
||||
# Get original README content
|
||||
original_content = self._get_original_readme(model_source, model_dir)
|
||||
|
||||
# Generate new README
|
||||
readme_content = self._generate_readme_content(
|
||||
model_source, results, original_content, output_repo
|
||||
)
|
||||
|
||||
readme_path.write_text(readme_content)
|
||||
return readme_path
|
||||
|
||||
def _get_original_readme(self, model_source: ModelSource, model_dir: Path) -> dict[str, str]:
|
||||
"""Extract original README and metadata.
|
||||
|
||||
Downloads or reads the original model's README for inclusion in the
|
||||
quantised model documentation. Parses YAML frontmatter if present.
|
||||
|
||||
Returns:
|
||||
Dictionary with readme content, licence, tags, and frontmatter.
|
||||
"""
|
||||
content = {"readme": "", "licence": "apache-2.0", "tags": "", "frontmatter": ""}
|
||||
|
||||
# Try local file first
|
||||
readme_path = model_dir / "README.md"
|
||||
if readme_path.exists():
|
||||
content["readme"] = readme_path.read_text(encoding="utf-8")
|
||||
logger.info(f"Found original README ({len(content['readme'])} characters)")
|
||||
else:
|
||||
# Download separately
|
||||
content = self._download_readme(model_source)
|
||||
|
||||
# Parse frontmatter if present
|
||||
if content["readme"].startswith("---\n"):
|
||||
content = self._parse_frontmatter(content["readme"])
|
||||
|
||||
return content
|
||||
|
||||
def _download_readme(self, model_source: ModelSource) -> dict[str, str]:
|
||||
"""Download README from HuggingFace repository.
|
||||
|
||||
Attempts to download just the README.md file from the source repository
|
||||
for efficient documentation extraction.
|
||||
|
||||
Returns:
|
||||
Dictionary with readme content and default metadata.
|
||||
"""
|
||||
content = {"readme": "", "licence": "apache-2.0", "tags": "", "frontmatter": ""}
|
||||
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
try:
|
||||
logger.info(f"Downloading README from {model_source.source_model}...")
|
||||
subprocess.run(
|
||||
[
|
||||
"huggingface-cli",
|
||||
"download",
|
||||
model_source.source_model,
|
||||
"--include",
|
||||
"README.md",
|
||||
"--local-dir",
|
||||
temp_dir,
|
||||
],
|
||||
check=True,
|
||||
capture_output=True,
|
||||
)
|
||||
|
||||
readme_path = Path(temp_dir) / "README.md"
|
||||
if readme_path.exists():
|
||||
content["readme"] = readme_path.read_text(encoding="utf-8")
|
||||
logger.info(f"Downloaded README ({len(content['readme'])} characters)")
|
||||
except subprocess.CalledProcessError as e:
|
||||
logger.warning(f"Failed to download README: {e}")
|
||||
|
||||
return content
|
||||
|
||||
def _parse_frontmatter(self, readme_text: str) -> dict[str, str]:
|
||||
"""Parse YAML frontmatter from README.
|
||||
|
||||
Extracts metadata from YAML frontmatter including licence, tags,
|
||||
and other model card fields.
|
||||
|
||||
Returns:
|
||||
Dictionary with separated content and metadata.
|
||||
"""
|
||||
lines = readme_text.split("\n")
|
||||
if lines[0] != "---":
|
||||
return {
|
||||
"readme": readme_text,
|
||||
"licence": "apache-2.0",
|
||||
"tags": "",
|
||||
"frontmatter": "",
|
||||
}
|
||||
|
||||
frontmatter_end = -1
|
||||
for i, line in enumerate(lines[1:], 1):
|
||||
if line == "---":
|
||||
frontmatter_end = i
|
||||
break
|
||||
|
||||
if frontmatter_end == -1:
|
||||
return {
|
||||
"readme": readme_text,
|
||||
"licence": "apache-2.0",
|
||||
"tags": "",
|
||||
"frontmatter": "",
|
||||
}
|
||||
|
||||
frontmatter = "\n".join(lines[1:frontmatter_end])
|
||||
content = "\n".join(lines[frontmatter_end + 1 :])
|
||||
|
||||
# Extract licence
|
||||
licence_match = re.search(r"^license:\s*(.+)$", frontmatter, re.MULTILINE)
|
||||
licence_val = licence_match.group(1).strip().strip('"') if licence_match else "apache-2.0"
|
||||
|
||||
# Extract tags
|
||||
tags = []
|
||||
in_tags = False
|
||||
for line in frontmatter.split("\n"):
|
||||
if line.startswith("tags:"):
|
||||
in_tags = True
|
||||
continue
|
||||
if in_tags:
|
||||
if line.startswith("- "):
|
||||
tags.append(line[2:].strip())
|
||||
elif line and not line.startswith(" "):
|
||||
break
|
||||
|
||||
return {
|
||||
"readme": content,
|
||||
"licence": licence_val,
|
||||
"tags": ",".join(tags),
|
||||
"frontmatter": frontmatter,
|
||||
}
|
||||
|
||||
def _generate_readme_content(
|
||||
self,
|
||||
model_source: ModelSource,
|
||||
results: dict[QuantisationType, QuantisationResult],
|
||||
original_content: dict[str, str],
|
||||
output_repo: str | None = None,
|
||||
) -> str:
|
||||
"""Generate complete README content with quantisation details.
|
||||
|
||||
Creates the full README including YAML frontmatter, quantisation status
|
||||
table, and original model information.
|
||||
|
||||
Returns:
|
||||
Complete README markdown content.
|
||||
"""
|
||||
# Build tags
|
||||
our_tags = [
|
||||
"quantised",
|
||||
"gguf",
|
||||
"q4_k_m",
|
||||
"q4_k_l",
|
||||
"q4_k_xl",
|
||||
"q4_k_xxl",
|
||||
"bartowski-method",
|
||||
]
|
||||
original_tags = original_content["tags"].split(",") if original_content["tags"] else []
|
||||
all_tags = sorted(set(our_tags + original_tags))
|
||||
|
||||
# Build frontmatter
|
||||
frontmatter = f"""---
|
||||
license: {original_content["licence"]}
|
||||
library_name: gguf
|
||||
base_model: {model_source.source_model}
|
||||
tags:
|
||||
"""
|
||||
for tag in all_tags:
|
||||
if tag.strip():
|
||||
frontmatter += f"- {tag.strip()}\n"
|
||||
|
||||
frontmatter += "---\n\n"
|
||||
|
||||
# Build main content
|
||||
hf_url = f"https://huggingface.co/{model_source.source_model}"
|
||||
content = f"""# {model_source.original_author}-{model_source.model_name}-GGUF
|
||||
|
||||
GGUF quantisations of [{model_source.source_model}]({hf_url}) using Bartowski's method.
|
||||
|
||||
| Quantisation | Embeddings/Output | Attention | Feed-Forward | Status |
|
||||
|--------------|-------------------|-----------|--------------|--------|
|
||||
"""
|
||||
|
||||
# Add results table
|
||||
for quant_type in [
|
||||
QuantisationType.Q4_K_M,
|
||||
QuantisationType.Q4_K_L,
|
||||
QuantisationType.Q4_K_XL,
|
||||
QuantisationType.Q4_K_XXL,
|
||||
]:
|
||||
result = results.get(quant_type)
|
||||
if not result:
|
||||
result = type("Result", (), {"status": "planned", "success": False})()
|
||||
|
||||
layers = self._get_layers_config(quant_type)
|
||||
status = self._format_status(result, model_source, quant_type, output_repo)
|
||||
|
||||
content += (
|
||||
f"| {quant_type.value} | {layers['embeddings']} | "
|
||||
f"{layers['attention']} | {layers['ffn']} | {status} |\n"
|
||||
)
|
||||
|
||||
content += "\n---\n\n"
|
||||
|
||||
# Add original content
|
||||
if original_content["readme"]:
|
||||
content += "# Original Model Information\n\n" + original_content["readme"]
|
||||
else:
|
||||
content += f"## Original Model\n\nQuantisation of [{model_source.source_model}](https://huggingface.co/{model_source.source_model}).\n"
|
||||
|
||||
return frontmatter + content
|
||||
|
||||
def _get_layers_config(self, quant_type: QuantisationType) -> dict[str, str]:
|
||||
"""Get layer configuration for quantisation type.
|
||||
|
||||
Returns layer precision specifications for the quantisation table.
|
||||
|
||||
Returns:
|
||||
Dictionary with embeddings, attention, and ffn precision labels.
|
||||
"""
|
||||
configs = {
|
||||
QuantisationType.Q4_K_M: {
|
||||
"embeddings": "Q4_K_M",
|
||||
"attention": "Q4_K_M",
|
||||
"ffn": "Q4_K_M",
|
||||
},
|
||||
QuantisationType.Q4_K_L: {"embeddings": "Q6_K", "attention": "Q6_K", "ffn": "Q4_K_M"},
|
||||
QuantisationType.Q4_K_XL: {"embeddings": "Q8_0", "attention": "Q6_K", "ffn": "Q4_K_M"},
|
||||
QuantisationType.Q4_K_XXL: {"embeddings": "Q8_0", "attention": "Q8_0", "ffn": "Q4_K_M"},
|
||||
}
|
||||
return configs.get(
|
||||
quant_type, {"embeddings": "Unknown", "attention": "Unknown", "ffn": "Unknown"}
|
||||
)
|
||||
|
||||
def _format_status(
|
||||
self,
|
||||
result: QuantisationResult,
|
||||
model_source: ModelSource,
|
||||
quant_type: QuantisationType,
|
||||
output_repo: str | None,
|
||||
) -> str:
|
||||
"""Format status indicator for README table.
|
||||
|
||||
Creates appropriate status indicator based on quantisation state
|
||||
including progress indicators, file sizes, and download links.
|
||||
|
||||
Returns:
|
||||
Formatted status string for table cell.
|
||||
"""
|
||||
status_map = {
|
||||
"planned": "⏳ Planned",
|
||||
"processing": "🔄 Processing...",
|
||||
"uploading": "⬆️ Uploading...",
|
||||
"failed": "❌ Failed",
|
||||
}
|
||||
|
||||
if hasattr(result, "status") and result.status in status_map:
|
||||
base_status = status_map[result.status]
|
||||
|
||||
if result.status == "uploading" and hasattr(result, "file_size") and result.file_size:
|
||||
return f"{base_status} ({result.file_size})"
|
||||
if result.status == "completed" or (hasattr(result, "success") and result.success):
|
||||
return self._format_success_status(result, model_source, quant_type, output_repo)
|
||||
return base_status
|
||||
|
||||
# Legacy support
|
||||
if hasattr(result, "success") and result.success:
|
||||
return self._format_success_status(result, model_source, quant_type, output_repo)
|
||||
return "❌ Failed"
|
||||
|
||||
def _format_success_status(
|
||||
self,
|
||||
result: QuantisationResult,
|
||||
model_source: ModelSource,
|
||||
quant_type: QuantisationType,
|
||||
output_repo: str | None,
|
||||
) -> str:
|
||||
"""Format successful quantisation status with download link.
|
||||
|
||||
Creates a download link if repository information is available,
|
||||
otherwise shows file size.
|
||||
|
||||
Returns:
|
||||
Formatted success status string.
|
||||
"""
|
||||
if not output_repo:
|
||||
return (
|
||||
f"✅ {result.file_size}"
|
||||
if hasattr(result, "file_size") and result.file_size
|
||||
else "✅ Available"
|
||||
)
|
||||
|
||||
filename = (
|
||||
f"{model_source.original_author}-{model_source.model_name}-{quant_type.value}.gguf"
|
||||
)
|
||||
url = f"https://huggingface.co/{output_repo}?show_file_info={filename}"
|
||||
|
||||
if hasattr(result, "file_size") and result.file_size:
|
||||
return f"[✅ {result.file_size}]({url})"
|
||||
return f"[✅ Available]({url})"
|
417
helpers/services/llama_cpp.py
Normal file
417
helpers/services/llama_cpp.py
Normal file
|
@ -0,0 +1,417 @@
|
|||
"""llama.cpp environment and operations service.
|
||||
|
||||
Manages llama.cpp binary discovery, environment setup, and imatrix generation.
|
||||
Provides consistent interface for interacting with llama.cpp tools across
|
||||
different installation methods.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
from helpers.logger import logger
|
||||
from helpers.models.quantisation import LlamaCppEnvironment
|
||||
from helpers.services.filesystem import FilesystemService
|
||||
|
||||
|
||||
class EnvironmentManager:
|
||||
"""Manages llama.cpp environment setup and binary discovery.
|
||||
|
||||
Handles detection of local binaries, repository setup, and conversion
|
||||
script location. Provides fallback strategies for different installation
|
||||
scenarios including local builds and repository-based setups.
|
||||
"""
|
||||
|
||||
def __init__(self, work_dir: Path) -> None:
|
||||
"""Initialise EnvironmentManager."""
|
||||
self.work_dir = work_dir
|
||||
self.llama_cpp_dir = work_dir / "llama.cpp"
|
||||
self.fs = FilesystemService()
|
||||
|
||||
def setup(self) -> LlamaCppEnvironment:
|
||||
"""Set up llama.cpp environment with automatic detection.
|
||||
|
||||
Checks for local llama.cpp binaries first, then falls back to
|
||||
repository-based setup if needed. Handles conversion script location,
|
||||
dependency installation, and path resolution.
|
||||
|
||||
Returns:
|
||||
Configured LlamaCppEnvironment instance.
|
||||
"""
|
||||
# Check for local binaries first
|
||||
local_env = self._check_local_binaries()
|
||||
if local_env:
|
||||
return local_env
|
||||
|
||||
# Setup repository if needed
|
||||
return self.setup_repository()
|
||||
|
||||
def _check_local_binaries(self) -> LlamaCppEnvironment | None:
|
||||
"""Check for existing llama.cpp binaries in current directory.
|
||||
|
||||
Searches for quantise and CLI binaries in the current directory
|
||||
and standard installation paths. Also locates conversion scripts.
|
||||
|
||||
Returns:
|
||||
LlamaCppEnvironment if binaries found, None otherwise.
|
||||
"""
|
||||
quantise_bin = Path("./llama-quantize")
|
||||
cli_bin = Path("./llama-cli")
|
||||
|
||||
if not (quantise_bin.exists() and cli_bin.exists()):
|
||||
return None
|
||||
|
||||
logger.info("Found llama.cpp binaries in current directory")
|
||||
|
||||
# Check for conversion script
|
||||
convert_script = self._find_convert_script()
|
||||
if convert_script:
|
||||
logger.info(f"Found conversion script: {convert_script}")
|
||||
return LlamaCppEnvironment(
|
||||
quantise_binary=quantise_bin.resolve(),
|
||||
cli_binary=cli_bin.resolve(),
|
||||
convert_script=convert_script,
|
||||
use_repo=False,
|
||||
)
|
||||
|
||||
logger.warning("No conversion script found in current directory")
|
||||
logger.info("Will use llama.cpp repository method for conversion")
|
||||
return LlamaCppEnvironment(
|
||||
quantise_binary=quantise_bin.resolve(),
|
||||
cli_binary=cli_bin.resolve(),
|
||||
convert_script=f"python3 {self.llama_cpp_dir}/convert_hf_to_gguf.py",
|
||||
use_repo=True,
|
||||
)
|
||||
|
||||
def _find_convert_script(self) -> str | None:
|
||||
"""Find conversion script in current directory.
|
||||
|
||||
Searches for various naming conventions of the HF to GGUF
|
||||
conversion script.
|
||||
|
||||
Returns:
|
||||
Command to run conversion script, or None if not found.
|
||||
"""
|
||||
scripts = [
|
||||
"./llama-convert-hf-to-gguf",
|
||||
"python3 ./convert_hf_to_gguf.py",
|
||||
"python3 ./convert-hf-to-gguf.py",
|
||||
]
|
||||
|
||||
for script in scripts:
|
||||
if script.startswith("python3"):
|
||||
script_path = script.split(" ", 1)[1]
|
||||
if Path(script_path).exists():
|
||||
return script
|
||||
elif Path(script).exists():
|
||||
return script
|
||||
return None
|
||||
|
||||
def setup_repository(self) -> LlamaCppEnvironment:
|
||||
"""Setup llama.cpp repository for conversion scripts.
|
||||
|
||||
Clones the llama.cpp repository if not present and installs
|
||||
Python dependencies for model conversion.
|
||||
|
||||
Returns:
|
||||
LlamaCppEnvironment configured with repository paths.
|
||||
"""
|
||||
if not self.llama_cpp_dir.exists():
|
||||
logger.info("Cloning llama.cpp for conversion script...")
|
||||
subprocess.run(
|
||||
[
|
||||
"git",
|
||||
"clone",
|
||||
"https://github.com/ggerganov/llama.cpp.git",
|
||||
str(self.llama_cpp_dir),
|
||||
],
|
||||
check=True,
|
||||
)
|
||||
|
||||
# Install Python requirements
|
||||
logger.info("Installing Python requirements...")
|
||||
subprocess.run(
|
||||
[
|
||||
"pip3",
|
||||
"install",
|
||||
"-r",
|
||||
"requirements.txt",
|
||||
"--break-system-packages",
|
||||
"--root-user-action=ignore",
|
||||
],
|
||||
cwd=self.llama_cpp_dir,
|
||||
check=True,
|
||||
)
|
||||
|
||||
# Install additional conversion dependencies
|
||||
logger.info("Installing additional conversion dependencies...")
|
||||
subprocess.run(
|
||||
[
|
||||
"pip3",
|
||||
"install",
|
||||
"transformers",
|
||||
"sentencepiece",
|
||||
"protobuf",
|
||||
"--break-system-packages",
|
||||
"--root-user-action=ignore",
|
||||
],
|
||||
check=True,
|
||||
)
|
||||
else:
|
||||
logger.info("llama.cpp repository already exists")
|
||||
|
||||
# Use local binaries but repo conversion script
|
||||
return LlamaCppEnvironment(
|
||||
quantise_binary=Path("./llama-quantize").resolve(),
|
||||
cli_binary=Path("./llama-cli").resolve(),
|
||||
convert_script=f"python3 {self.llama_cpp_dir}/convert_hf_to_gguf.py",
|
||||
use_repo=False,
|
||||
)
|
||||
|
||||
|
||||
class IMatrixGenerator:
|
||||
"""Handles importance matrix generation for quantisation guidance.
|
||||
|
||||
Generates or locates importance matrices that guide quantisation
|
||||
decisions, helping preserve model quality by identifying critical
|
||||
tensors requiring higher precision.
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
"""Initialise IMatrixGenerator."""
|
||||
self.fs = FilesystemService()
|
||||
|
||||
def generate_imatrix(
|
||||
self, f16_model_path: Path, llama_env: LlamaCppEnvironment, model_dir: Path
|
||||
) -> Path | None:
|
||||
"""Generate importance matrix for quantisation guidance.
|
||||
|
||||
Searches for existing imatrix files first, provides interactive
|
||||
prompts for user-supplied matrices, then generates new matrices
|
||||
using calibration data if necessary.
|
||||
|
||||
Returns:
|
||||
Path to imatrix file, or None if generation fails.
|
||||
"""
|
||||
imatrix_path = model_dir / "imatrix.dat"
|
||||
|
||||
# Check for existing imatrix
|
||||
if imatrix_path.exists():
|
||||
logger.info(f"Found existing imatrix: {imatrix_path.name}")
|
||||
return imatrix_path
|
||||
|
||||
# Try user-provided imatrix
|
||||
user_imatrix = self._prompt_for_user_imatrix(model_dir, imatrix_path)
|
||||
if user_imatrix:
|
||||
return user_imatrix
|
||||
|
||||
# Generate new imatrix
|
||||
calibration_file = self._get_calibration_file()
|
||||
if not calibration_file:
|
||||
return None
|
||||
|
||||
return self._generate_new_imatrix(f16_model_path, llama_env, imatrix_path, calibration_file)
|
||||
|
||||
def _prompt_for_user_imatrix(self, model_dir: Path, imatrix_path: Path) -> Path | None:
|
||||
"""Prompt user for existing imatrix file.
|
||||
|
||||
Returns:
|
||||
Path to user-provided imatrix, or None if not available.
|
||||
"""
|
||||
logger.info(f"Model directory: {model_dir}")
|
||||
logger.info(f"Looking for imatrix file at: {imatrix_path}")
|
||||
logger.info(
|
||||
"Tip: You can download pre-computed imatrix files from Bartowski's repositories!"
|
||||
)
|
||||
logger.info(
|
||||
" Example: https://huggingface.co/bartowski/MODEL-NAME-GGUF/resolve/main/MODEL-NAME.imatrix"
|
||||
)
|
||||
|
||||
response = (
|
||||
input("\n❓ Do you have an imatrix file to place in the model directory? (y/N): ")
|
||||
.strip()
|
||||
.lower()
|
||||
)
|
||||
|
||||
if response != "y":
|
||||
return None
|
||||
|
||||
logger.info(f"Please place your imatrix.dat file in: {model_dir}")
|
||||
input("⏳ Press Enter when you've placed the imatrix.dat file (or Ctrl+C to cancel)...")
|
||||
|
||||
if imatrix_path.exists():
|
||||
file_size = self.fs.get_file_size(imatrix_path)
|
||||
logger.info(f"Found imatrix file! ({file_size})")
|
||||
return imatrix_path
|
||||
|
||||
logger.warning("No imatrix.dat file found - continuing with automatic generation")
|
||||
return None
|
||||
|
||||
def _get_calibration_file(self) -> Path | None:
|
||||
"""Get calibration data file for imatrix generation.
|
||||
|
||||
Returns:
|
||||
Path to calibration file, or None if not found.
|
||||
"""
|
||||
calibration_file = Path(__file__).parent.parent.parent / "resources" / "imatrix_data.txt"
|
||||
if not calibration_file.exists():
|
||||
logger.warning("resources/imatrix_data.txt not found - skipping imatrix generation")
|
||||
logger.info(
|
||||
"Download from: https://gist.githubusercontent.com/bartowski1182/"
|
||||
"eb213dccb3571f863da82e99418f81e8/raw/calibration_datav3.txt"
|
||||
)
|
||||
return None
|
||||
return calibration_file
|
||||
|
||||
def _generate_new_imatrix(
|
||||
self,
|
||||
f16_model_path: Path,
|
||||
llama_env: LlamaCppEnvironment,
|
||||
imatrix_path: Path,
|
||||
calibration_file: Path,
|
||||
) -> Path | None:
|
||||
"""Generate new importance matrix using calibration data.
|
||||
|
||||
Returns:
|
||||
Path to generated imatrix, or None if generation fails.
|
||||
"""
|
||||
logger.info("Generating importance matrix (this may take 1-4 hours for large models)...")
|
||||
logger.info(f"Model: {f16_model_path.name}")
|
||||
logger.info(f"Calibration: {calibration_file}")
|
||||
logger.info(f"Output: {imatrix_path}")
|
||||
|
||||
# Find imatrix binary
|
||||
imatrix_binary = self._find_imatrix_binary(llama_env)
|
||||
if not imatrix_binary:
|
||||
logger.warning("llama-imatrix binary not found - skipping imatrix generation")
|
||||
logger.info("Make sure llama-imatrix is in the same directory as llama-quantize")
|
||||
return None
|
||||
|
||||
# Build and execute command
|
||||
cmd = self._build_imatrix_command(
|
||||
imatrix_binary, f16_model_path, calibration_file, imatrix_path
|
||||
)
|
||||
return self._execute_imatrix_generation(cmd, imatrix_path)
|
||||
|
||||
def _build_imatrix_command(
|
||||
self, binary: Path, model_path: Path, calibration_file: Path, output_path: Path
|
||||
) -> list[str]:
|
||||
"""Build imatrix generation command.
|
||||
|
||||
Returns:
|
||||
Command arguments as list.
|
||||
"""
|
||||
return [
|
||||
str(binary),
|
||||
"-m",
|
||||
str(model_path),
|
||||
"-f",
|
||||
str(calibration_file),
|
||||
"-o",
|
||||
str(output_path),
|
||||
"--process-output",
|
||||
"--output-frequency",
|
||||
"10",
|
||||
"--save-frequency",
|
||||
"50",
|
||||
"-t",
|
||||
"8",
|
||||
"-c",
|
||||
"2048",
|
||||
"-b",
|
||||
"512",
|
||||
]
|
||||
|
||||
def _execute_imatrix_generation(self, cmd: list[str], imatrix_path: Path) -> Path | None:
|
||||
"""Execute imatrix generation command with real-time output.
|
||||
|
||||
Returns:
|
||||
Path to generated imatrix file, or None if generation fails.
|
||||
"""
|
||||
logger.info(f"Running: {' '.join(cmd)}")
|
||||
logger.info("Starting imatrix generation... (progress will be shown)")
|
||||
|
||||
try:
|
||||
process = subprocess.Popen(
|
||||
cmd,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.STDOUT,
|
||||
universal_newlines=True,
|
||||
bufsize=1,
|
||||
)
|
||||
|
||||
self._stream_imatrix_output(process)
|
||||
|
||||
return_code = process.poll()
|
||||
if return_code == 0:
|
||||
return self._validate_imatrix_output(imatrix_path)
|
||||
|
||||
except KeyboardInterrupt:
|
||||
logger.info("imatrix generation cancelled by user")
|
||||
process.terminate()
|
||||
return None
|
||||
except Exception as e:
|
||||
logger.error(f"imatrix generation failed with exception: {e}")
|
||||
return None
|
||||
else:
|
||||
logger.error(f"imatrix generation failed with return code {return_code}")
|
||||
return None
|
||||
|
||||
def _stream_imatrix_output(self, process: subprocess.Popen) -> None:
|
||||
"""Stream imatrix generation output in real-time."""
|
||||
while True:
|
||||
if process.stdout is not None:
|
||||
output = process.stdout.readline()
|
||||
else:
|
||||
break
|
||||
if not output and process.poll() is not None:
|
||||
break
|
||||
if output:
|
||||
line = output.strip()
|
||||
if self._should_log_imatrix_line(line):
|
||||
logger.info(line)
|
||||
|
||||
def _should_log_imatrix_line(self, line: str) -> bool:
|
||||
"""Determine if imatrix output line should be logged.
|
||||
|
||||
Returns:
|
||||
True if line should be logged, False otherwise.
|
||||
"""
|
||||
keywords = ["Computing imatrix", "perplexity:", "save_imatrix", "entries =", "ETA"]
|
||||
return any(keyword in line for keyword in keywords) or line.startswith("[")
|
||||
|
||||
def _validate_imatrix_output(self, imatrix_path: Path) -> Path | None:
|
||||
"""Validate generated imatrix file.
|
||||
|
||||
Returns:
|
||||
Path to imatrix if valid, None otherwise.
|
||||
"""
|
||||
if imatrix_path.exists():
|
||||
file_size = self.fs.get_file_size(imatrix_path)
|
||||
logger.info(f"imatrix generation successful! ({file_size})")
|
||||
return imatrix_path
|
||||
logger.error("imatrix generation completed but file not found")
|
||||
return None
|
||||
|
||||
def _find_imatrix_binary(self, llama_env: LlamaCppEnvironment) -> Path | None:
|
||||
"""Find llama-imatrix binary in common locations.
|
||||
|
||||
Searches for the imatrix binary in the current directory and
|
||||
standard installation paths.
|
||||
|
||||
Returns:
|
||||
Path to imatrix binary, or None if not found.
|
||||
"""
|
||||
candidates = [
|
||||
Path("./llama-imatrix"),
|
||||
llama_env.quantise_binary.parent / "llama-imatrix",
|
||||
Path("/usr/local/bin/llama-imatrix"),
|
||||
Path("/usr/bin/llama-imatrix"),
|
||||
]
|
||||
|
||||
for candidate in candidates:
|
||||
if candidate.exists() and candidate.is_file():
|
||||
return candidate
|
||||
|
||||
return None
|
397
helpers/services/orchestrator.py
Normal file
397
helpers/services/orchestrator.py
Normal file
|
@ -0,0 +1,397 @@
|
|||
"""Quantisation orchestration service.
|
||||
|
||||
High-level orchestration of the complete quantisation workflow from model
|
||||
acquisition through processing to upload. Manages parallel processing,
|
||||
status tracking, and cleanup operations for efficient resource utilisation.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from concurrent.futures import Future, ThreadPoolExecutor
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from helpers.config.quantisation_configs import QUANTISATION_CONFIGS, SUPPORTED_QUANTISATION_TYPES
|
||||
from helpers.logger import logger
|
||||
from helpers.models.quantisation import (
|
||||
ModelSource,
|
||||
QuantisationContext,
|
||||
QuantisationResult,
|
||||
QuantisationType,
|
||||
)
|
||||
from helpers.services.huggingface import ReadmeGenerator
|
||||
from helpers.services.llama_cpp import EnvironmentManager, IMatrixGenerator
|
||||
from helpers.services.quantisation import HuggingFaceUploader, ModelManager, QuantisationEngine
|
||||
from helpers.utils.tensor_mapping import URLParser
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class QuantisationOrchestrator:
|
||||
"""Orchestrates the complete quantisation workflow.
|
||||
|
||||
Uses dataclass with slots for efficient memory usage and dependency injection
|
||||
for modular service interaction following SOLID principles.
|
||||
"""
|
||||
|
||||
work_dir: Path = field(default_factory=lambda: Path.cwd() / "quantisation_work")
|
||||
use_imatrix: bool = True
|
||||
imatrix_base: str = "Q4_K_M"
|
||||
no_upload: bool = False
|
||||
|
||||
# Service dependencies with factory defaults
|
||||
url_parser: URLParser = field(default_factory=URLParser)
|
||||
quantisation_engine: QuantisationEngine = field(default_factory=QuantisationEngine)
|
||||
imatrix_generator: IMatrixGenerator = field(default_factory=IMatrixGenerator)
|
||||
readme_generator: ReadmeGenerator = field(default_factory=ReadmeGenerator)
|
||||
uploader: HuggingFaceUploader = field(default_factory=HuggingFaceUploader)
|
||||
|
||||
# Computed properties
|
||||
models_dir: Path = field(init=False)
|
||||
environment_manager: EnvironmentManager = field(init=False)
|
||||
model_manager: ModelManager = field(init=False)
|
||||
|
||||
def __post_init__(self) -> None:
|
||||
"""Initialise computed properties after dataclass construction."""
|
||||
self.models_dir = self.work_dir / "models"
|
||||
self.environment_manager = EnvironmentManager(self.work_dir)
|
||||
self.model_manager = ModelManager(self.models_dir, self.environment_manager)
|
||||
|
||||
def quantise(self, url: str) -> dict[QuantisationType, QuantisationResult]:
|
||||
"""Main quantisation workflow orchestrating model processing from URL to upload.
|
||||
|
||||
Returns:
|
||||
dict[QuantisationType, QuantisationResult]: Quantisation results for each type.
|
||||
"""
|
||||
logger.info("Starting Bartowski quantisation process...")
|
||||
|
||||
# Setup and preparation
|
||||
model_source, llama_env, f16_model_path, imatrix_path, output_repo = (
|
||||
self._setup_environment(url)
|
||||
)
|
||||
|
||||
# Create initial repository
|
||||
self._create_initial_repository(model_source, output_repo)
|
||||
|
||||
# Execute all quantisations
|
||||
results = self._execute_quantisations(
|
||||
model_source, llama_env, f16_model_path, imatrix_path, output_repo
|
||||
)
|
||||
|
||||
# Cleanup
|
||||
self._cleanup_files(f16_model_path, model_source)
|
||||
|
||||
self._print_completion_summary(model_source, results, output_repo)
|
||||
return results
|
||||
|
||||
def _setup_environment(self, url: str) -> tuple[ModelSource, Any, Path, Path | None, str]:
|
||||
"""Setup environment and prepare model for quantisation.
|
||||
|
||||
Returns:
|
||||
Tuple of (model_source, llama_env, f16_model_path, imatrix_path, output_repo).
|
||||
"""
|
||||
model_source = self.url_parser.parse(url)
|
||||
self._print_model_info(model_source)
|
||||
|
||||
self.models_dir.mkdir(parents=True, exist_ok=True)
|
||||
llama_env = self.environment_manager.setup()
|
||||
|
||||
f16_model_path = self.model_manager.prepare_model(model_source, llama_env)
|
||||
|
||||
imatrix_path = None
|
||||
if self.use_imatrix:
|
||||
logger.info("Generating importance matrix (imatrix)...")
|
||||
imatrix_path = self.imatrix_generator.generate_imatrix(
|
||||
f16_model_path, llama_env, self.models_dir / model_source.model_name
|
||||
)
|
||||
|
||||
output_repo = (
|
||||
f"{self.uploader.get_username()}/"
|
||||
f"{model_source.original_author}-{model_source.model_name}-GGUF"
|
||||
)
|
||||
|
||||
return model_source, llama_env, f16_model_path, imatrix_path, output_repo
|
||||
|
||||
def _create_initial_repository(self, model_source: ModelSource, output_repo: str) -> None:
|
||||
"""Create initial repository with planned quantisations."""
|
||||
logger.info("Creating initial README with planned quantisations...")
|
||||
planned_results = {
|
||||
qt: QuantisationResult(quantisation_type=qt, success=False, status="planned")
|
||||
for qt in SUPPORTED_QUANTISATION_TYPES
|
||||
}
|
||||
readme_path = self.readme_generator.generate(
|
||||
model_source, planned_results, self.models_dir, output_repo
|
||||
)
|
||||
|
||||
if not self.no_upload:
|
||||
logger.info("Creating repository with planned quantisations...")
|
||||
self.uploader.upload_readme(output_repo, readme_path)
|
||||
else:
|
||||
logger.info("Skipping repository creation (--no-upload specified)")
|
||||
|
||||
def _execute_quantisations(
|
||||
self,
|
||||
model_source: ModelSource,
|
||||
llama_env: Any,
|
||||
f16_model_path: Path,
|
||||
imatrix_path: Path | None,
|
||||
output_repo: str,
|
||||
) -> dict[QuantisationType, QuantisationResult]:
|
||||
"""Execute all quantisation types with parallel uploads.
|
||||
|
||||
Returns:
|
||||
dict[QuantisationType, QuantisationResult]: Quantisation results for each type.
|
||||
"""
|
||||
results: dict[QuantisationType, QuantisationResult] = {}
|
||||
upload_futures: list[Future[None]] = []
|
||||
|
||||
with ThreadPoolExecutor(max_workers=1, thread_name_prefix="uploader") as upload_executor:
|
||||
for quant_type in SUPPORTED_QUANTISATION_TYPES:
|
||||
result = self._process_single_quantisation(
|
||||
quant_type,
|
||||
model_source,
|
||||
llama_env,
|
||||
f16_model_path,
|
||||
imatrix_path,
|
||||
output_repo,
|
||||
results,
|
||||
upload_executor,
|
||||
upload_futures,
|
||||
)
|
||||
results[quant_type] = result
|
||||
|
||||
self._wait_for_uploads(upload_futures)
|
||||
|
||||
return results
|
||||
|
||||
def _process_single_quantisation(
|
||||
self,
|
||||
quant_type: QuantisationType,
|
||||
model_source: ModelSource,
|
||||
llama_env: Any,
|
||||
f16_model_path: Path,
|
||||
imatrix_path: Path | None,
|
||||
output_repo: str,
|
||||
results: dict[QuantisationType, QuantisationResult],
|
||||
upload_executor: ThreadPoolExecutor,
|
||||
upload_futures: list,
|
||||
) -> QuantisationResult:
|
||||
"""Process a single quantisation type.
|
||||
|
||||
Returns:
|
||||
QuantisationResult: Result of the quantisation attempt.
|
||||
"""
|
||||
try:
|
||||
logger.info(f"Starting {quant_type.value} quantisation...")
|
||||
config = QUANTISATION_CONFIGS[quant_type]
|
||||
|
||||
# Update status to processing
|
||||
result = QuantisationResult(quantisation_type=quant_type, success=False)
|
||||
result.status = "processing"
|
||||
results[quant_type] = result
|
||||
|
||||
self._update_readme_status(model_source, results, output_repo)
|
||||
|
||||
# Perform quantisation
|
||||
context = QuantisationContext(
|
||||
f16_model_path=f16_model_path,
|
||||
model_source=model_source,
|
||||
config=config,
|
||||
llama_env=llama_env,
|
||||
models_dir=self.models_dir,
|
||||
imatrix_path=imatrix_path,
|
||||
base_quant=self.imatrix_base,
|
||||
)
|
||||
result = self.quantisation_engine.quantise(context)
|
||||
|
||||
self._handle_quantisation_result(
|
||||
result,
|
||||
quant_type,
|
||||
model_source,
|
||||
results,
|
||||
output_repo,
|
||||
upload_executor,
|
||||
upload_futures,
|
||||
)
|
||||
except Exception as e:
|
||||
return self._handle_quantisation_error(
|
||||
e, quant_type, model_source, results, output_repo
|
||||
)
|
||||
else:
|
||||
return result
|
||||
|
||||
def _handle_quantisation_result(
|
||||
self,
|
||||
result: QuantisationResult,
|
||||
quant_type: QuantisationType,
|
||||
model_source: ModelSource,
|
||||
results: dict[QuantisationType, QuantisationResult],
|
||||
output_repo: str,
|
||||
upload_executor: ThreadPoolExecutor,
|
||||
upload_futures: list,
|
||||
) -> None:
|
||||
"""Handle successful or failed quantisation result."""
|
||||
if result.success and result.file_path:
|
||||
quant_str = getattr(result.quantisation_type, "value", result.quantisation_type)
|
||||
logger.info(f"Starting parallel upload of {quant_str}...")
|
||||
upload_future = upload_executor.submit(
|
||||
self._upload_and_cleanup,
|
||||
output_repo,
|
||||
result.file_path,
|
||||
quant_type,
|
||||
model_source,
|
||||
results,
|
||||
)
|
||||
upload_futures.append(upload_future)
|
||||
result.file_path = None # Mark as being uploaded
|
||||
result.status = "uploading"
|
||||
else:
|
||||
result.status = "failed"
|
||||
|
||||
self._update_readme_status(model_source, results, output_repo)
|
||||
|
||||
def _handle_quantisation_error(
|
||||
self,
|
||||
error: Exception,
|
||||
quant_type: QuantisationType,
|
||||
model_source: ModelSource,
|
||||
results: dict[QuantisationType, QuantisationResult],
|
||||
output_repo: str,
|
||||
) -> QuantisationResult:
|
||||
"""Handle quantisation processing error.
|
||||
|
||||
Returns:
|
||||
QuantisationResult: Failed quantisation result with error information.
|
||||
"""
|
||||
logger.error(f"Error processing {quant_type.value}: {error}")
|
||||
result = QuantisationResult(quantisation_type=quant_type, success=False)
|
||||
result.status = "failed"
|
||||
result.error_message = str(error)
|
||||
|
||||
try:
|
||||
self._update_readme_status(model_source, results, output_repo)
|
||||
except Exception as readme_error:
|
||||
logger.error(f"Failed to update README after error: {readme_error}")
|
||||
|
||||
return result
|
||||
|
||||
def _update_readme_status(
|
||||
self,
|
||||
model_source: ModelSource,
|
||||
results: dict[QuantisationType, QuantisationResult],
|
||||
output_repo: str,
|
||||
) -> None:
|
||||
"""Update README with current quantisation status."""
|
||||
if not self.no_upload:
|
||||
updated_readme_path = self.readme_generator.generate(
|
||||
model_source, results, self.models_dir, output_repo
|
||||
)
|
||||
self.uploader.upload_readme(output_repo, updated_readme_path)
|
||||
|
||||
def _wait_for_uploads(self, upload_futures: list) -> None:
|
||||
"""Wait for all parallel uploads to complete."""
|
||||
logger.info("Waiting for any remaining uploads to complete...")
|
||||
for future in upload_futures:
|
||||
try:
|
||||
future.result(timeout=300) # 5 minute timeout per upload
|
||||
except Exception as e:
|
||||
logger.warning(f"Upload error: {e}")
|
||||
|
||||
    def _cleanup_files(self, f16_model_path: Path, model_source: ModelSource) -> None:
        """Clean up temporary files after processing."""
        if f16_model_path.exists():
            logger.info(f"Removing F16 model {f16_model_path.name} to save disk space...")
            f16_model_path.unlink()

        if not model_source.is_gguf_repo:
            self._cleanup_original_model(model_source)

    def _cleanup_original_model(self, model_source: ModelSource) -> None:
        """Clean up original safetensors/PyTorch files after successful conversion."""
        model_dir = self.models_dir / model_source.model_name

        pytorch_files = list(model_dir.glob("pytorch_model*.bin"))
        if pytorch_files:
            logger.info(f"Removing {len(pytorch_files)} PyTorch model files to save disk space...")
            for file in pytorch_files:
                file.unlink()

        logger.info("Keeping config files, tokeniser, and metadata for reference")

    def _upload_and_cleanup(
        self,
        output_repo: str,
        file_path: Path,
        quant_type: QuantisationType,
        model_source: ModelSource,
        results: dict[QuantisationType, QuantisationResult],
    ) -> None:
        """Upload file and clean up (runs in background thread)."""
        try:
            logger.info(f"[PARALLEL] Uploading {quant_type}...")
            self.uploader.upload_model_file(output_repo, file_path)

            logger.info(f"[PARALLEL] Removing {file_path.name} to save disk space...")
            file_path.unlink()

            results[quant_type].status = "completed"
            updated_readme_path = self.readme_generator.generate(
                model_source, results, self.models_dir, output_repo
            )
            self.uploader.upload_readme(output_repo, updated_readme_path)

            logger.info(f"[PARALLEL] {quant_type} upload and cleanup complete")
        except Exception as e:
            logger.error(f"[PARALLEL] Failed to upload {quant_type}: {e}")
            results[quant_type].status = "failed"
            results[quant_type].error_message = str(e)

            updated_readme_path = self.readme_generator.generate(
                model_source, results, self.models_dir, output_repo
            )
            self.uploader.upload_readme(output_repo, updated_readme_path)
            raise

    def _print_model_info(self, model_source: ModelSource) -> None:
        """Print model information."""
        logger.info(f"Source URL: {model_source.url}")
        logger.info(f"Source model: {model_source.source_model}")
        logger.info(f"Original author: {model_source.original_author}")
        logger.info(f"Model name: {model_source.model_name}")
        logger.info(f"Your HF username: {self.uploader.get_username()}")
        logger.info(f"Working directory: {self.work_dir}")

    def _print_completion_summary(
        self,
        model_source: ModelSource,
        results: dict[QuantisationType, QuantisationResult],
        output_repo: str,
    ) -> None:
        """Print completion summary."""
        successful_results = [r for r in results.values() if r.success]

        if successful_results:
            logger.info("Complete! Your quantised models are available at:")
            logger.info(f"  https://huggingface.co/{output_repo}")
            logger.info("Model info:")
            logger.info(f"  - Source URL: {model_source.url}")
            logger.info(f"  - Original: {model_source.source_model}")
            logger.info(
                "  - Method: "
                f"{'Direct GGUF download' if model_source.is_gguf_repo else 'HF model conversion'}"
            )
            logger.info(f"  - Quantised: {output_repo}")

            for result in successful_results:
                if result.file_size:
                    filename = (
                        f"{model_source.original_author}-{model_source.model_name}-"
                        f"{result.quantisation_type}.gguf"
                    )
                    logger.info(f"  - {result.quantisation_type}: {filename} ({result.file_size})")
        else:
            logger.error(
                "All quantisations failed - repository created with documentation "
                "but no model files"
            )
            logger.error(f"  Repository: https://huggingface.co/{output_repo}")
486
helpers/services/quantisation.py
Normal file
@@ -0,0 +1,486 @@
"""Quantisation operations service.
|
||||
|
||||
Provides modular quantisation engine, model management, and upload capabilities
|
||||
for GGUF model processing. Consolidates quantisation logic from various tools
|
||||
into reusable components following SOLID principles.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import shutil
|
||||
import subprocess
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from helpers.logger import logger
|
||||
from helpers.models.quantisation import (
|
||||
ModelSource,
|
||||
QuantisationContext,
|
||||
QuantisationResult,
|
||||
QuantisationType,
|
||||
)
|
||||
from helpers.services.filesystem import FilesystemService
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pathlib import Path
|
||||
|
||||
from helpers.models.quantisation import LlamaCppEnvironment
|
||||
from helpers.services.llama_cpp import EnvironmentManager
|
||||
|
||||
|
||||
class QuantisationEngine:
|
||||
"""Handles the actual quantisation process with configurable methods.
|
||||
|
||||
Provides flexible quantisation execution supporting multiple tensor
|
||||
precision configurations, importance matrices, and fallback strategies.
|
||||
Encapsulates llama-quantize binary interactions with real-time output.
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
"""Initialise quantisation engine."""
|
||||
self.fs = FilesystemService()
|
||||
|
||||
def quantise(self, context: QuantisationContext) -> QuantisationResult:
|
||||
"""Perform quantisation using the specified configuration.
|
||||
|
||||
Executes quantisation with primary and fallback methods, handling
|
||||
tensor-specific precision overrides and importance matrix guidance.
|
||||
|
||||
Returns:
|
||||
QuantisationResult with success status and file information.
|
||||
"""
|
||||
logger.info(
|
||||
f"⚙️ Creating {context.config.name} quantisation ({context.config.description})..."
|
||||
)
|
||||
|
||||
output_path = context.get_output_path()
|
||||
|
||||
logger.info(f"🎯 Attempting {context.config.name} quantisation...")
|
||||
logger.info(f"📝 Source: {context.f16_model_path}")
|
||||
logger.info(f"📝 Target: {output_path}")
|
||||
|
||||
# Try primary method
|
||||
if self._try_quantisation_method(
|
||||
context, output_path, context.config.tensor_types, "method 1"
|
||||
):
|
||||
return self._create_success_result(context.config.name, output_path, "method 1")
|
||||
|
||||
# Try fallback methods
|
||||
for i, fallback_method in enumerate(context.config.fallback_methods, 2):
|
||||
method_name = f"method {i}"
|
||||
if self._try_quantisation_method(context, output_path, fallback_method, method_name):
|
||||
return self._create_success_result(context.config.name, output_path, method_name)
|
||||
|
||||
logger.error("All %s quantisation methods failed", context.config.name)
|
||||
return QuantisationResult(
|
||||
quantisation_type=QuantisationType(context.config.name),
|
||||
success=False,
|
||||
error_message="All quantisation methods failed",
|
||||
)
|
||||
|
||||
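    # NOTE: illustrative sketch only. Driving the engine from calling code;
    # the QuantisationContext fields shown are assumed from how this class
    # accesses them (config, f16_model_path, imatrix_path, llama_env,
    # base_quant, get_output_path()).
    #
    #     engine = QuantisationEngine()
    #     result = engine.quantise(context)
    #     if result.success:
    #         print(result.file_path, result.file_size, result.method_used)
    #     else:
    #         print(result.error_message)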
    def _try_quantisation_method(
        self,
        context: QuantisationContext,
        output_path: Path,
        tensor_config: dict[str, str],
        method_name: str,
    ) -> bool:
        """Try a specific quantisation method with real-time output.

        Builds and executes llama-quantize command with appropriate parameters,
        streaming output for progress monitoring.

        Returns:
            True if quantisation successful, False otherwise.
        """
        logger.info(f"🔍 Trying {method_name}...")

        cmd = self._build_quantisation_command(context, output_path, tensor_config)
        return self._execute_quantisation_command(cmd, method_name)

    def _build_quantisation_command(
        self, context: QuantisationContext, output_path: Path, tensor_config: dict[str, str]
    ) -> list[str]:
        """Build quantisation command with all required parameters.

        Returns:
            List of command arguments.
        """
        cmd = [str(context.llama_env.quantise_binary)]

        # Add imatrix if available
        if context.imatrix_path and context.imatrix_path.exists():
            cmd.extend(["--imatrix", str(context.imatrix_path)])
            logger.info(f"🧮 Using imatrix: {context.imatrix_path.name}")

        # Add tensor type arguments
        self._add_tensor_type_arguments(cmd, tensor_config)

        cmd.extend([str(context.f16_model_path), str(output_path), context.base_quant])
        return cmd

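    # NOTE: illustrative sketch only. A command assembled above might look
    # roughly like the following; the binary name, file names, and Q4_K_M
    # base type are assumed values rather than anything fixed by this module.
    #
    #     llama-quantize --imatrix model.imatrix \
    #         --output-tensor-type Q6_K --tensor-type attn_v=Q6_K \
    #         model-f16.gguf model-Q4_K_M.gguf Q4_K_M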
    def _add_tensor_type_arguments(self, cmd: list[str], tensor_config: dict[str, str]) -> None:
        """Add tensor type arguments to command."""
        if not tensor_config:
            return

        for tensor_name, quant_type in tensor_config.items():
            if tensor_name.startswith(("token-embedding-type", "output-tensor-type")):
                cmd.extend([f"--{tensor_name}", quant_type])
            else:
                cmd.extend(["--tensor-type", f"{tensor_name}={quant_type}"])

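    # NOTE: illustrative sketch only. An assumed tensor_config and the flags
    # it would expand to under the rule above:
    #
    #     {"token-embedding-type": "Q8_0", "attn_v": "Q6_K"}
    #     -> ["--token-embedding-type", "Q8_0", "--tensor-type", "attn_v=Q6_K"]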
    def _execute_quantisation_command(self, cmd: list[str], method_name: str) -> bool:
        """Execute quantisation command with real-time output.

        Returns:
            True if quantisation successful, False otherwise.
        """
        logger.info(f"💻 Running: {' '.join(cmd)}")
        logger.info("⏳ Quantisation in progress... (this may take several minutes)")

        try:
            process = subprocess.Popen(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                universal_newlines=True,
                bufsize=1,
            )

            self._stream_quantisation_output(process)

            return_code = process.poll()
            if return_code == 0:
                logger.info(f"✅ {method_name} quantisation successful!")
                return True
        except Exception as e:
            logger.info(f"❌ {method_name} failed with exception: {e}")
            return False
        else:
            logger.info(f"❌ {method_name} failed with return code {return_code}")
            return False

    def _stream_quantisation_output(self, process: subprocess.Popen) -> None:
        """Stream quantisation output in real-time."""
        while True:
            if process.stdout is not None:
                output = process.stdout.readline()
            else:
                break
            if not output and process.poll() is not None:
                break
            if output:
                logger.info(f"📊 {output.strip()}")

    def _create_success_result(
        self, quant_type: str, output_path: Path, method_used: str
    ) -> QuantisationResult:
        """Create successful quantisation result with file metadata.

        Returns:
            QuantisationResult with file path and size information.
        """
        file_size = self.fs.get_file_size(output_path)
        return QuantisationResult(
            quantisation_type=QuantisationType(quant_type),
            success=True,
            file_path=output_path,
            file_size=file_size,
            method_used=method_used,
        )


class ModelManager:
    """Handles model downloading and preparation for quantisation.

    Manages both GGUF repository downloads and HuggingFace model conversions,
    providing unified interface for model acquisition and preparation.
    """

    def __init__(self, models_dir: Path, environment_manager: EnvironmentManager) -> None:
        """Initialise model manager with storage and environment configuration.

        Sets up model storage directory and links to environment manager for
        conversion script access and llama.cpp tool discovery.
        """
        self.models_dir = models_dir
        self.environment_manager = environment_manager
        self.fs = FilesystemService()

    def prepare_model(self, model_source: ModelSource, llama_env: LlamaCppEnvironment) -> Path:
        """Prepare model for quantisation and return F16 model path.

        Handles both GGUF repository downloads and regular HuggingFace model
        conversion workflows with automatic format detection.

        Returns:
            Path to F16 GGUF model ready for quantisation.
        """
        model_dir = self.models_dir / model_source.model_name

        if model_source.is_gguf_repo:
            return self._handle_gguf_repo(model_source, model_dir)
        return self._handle_regular_repo(model_source, model_dir, llama_env)

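    # NOTE: illustrative sketch only. Assumed usage from an orchestrator, with
    # hypothetical `env_manager`, `source`, and `llama_env` objects.
    #
    #     manager = ModelManager(Path("./models"), env_manager)
    #     f16_path = manager.prepare_model(source, llama_env)
    #     # f16_path points at <models>/<model_name>-f16.gguf, or at an
    #     # existing GGUF downloaded from the source repository.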
    def _handle_gguf_repo(self, model_source: ModelSource, model_dir: Path) -> Path:
        """Handle GGUF repository download with pattern matching.

        Downloads GGUF files matching specified patterns, prioritising
        multi-part files and F16 variants.

        Returns:
            Path to downloaded or existing GGUF file.
        """
        logger.info(f"⬇️ Downloading GGUF file from repository: {model_source.source_model}")
        logger.info(f"🔍 Looking for file pattern: *{model_source.gguf_file_pattern}*")

        f16_model = model_dir / f"{model_source.model_name}-f16.gguf"

        if f16_model.exists():
            logger.info(f"✅ Found existing F16 file: {f16_model.name}")
            return f16_model

        # Check for existing GGUF files
        model_dir.mkdir(parents=True, exist_ok=True)
        existing_gguf = self.fs.find_gguf_files(model_dir)

        if existing_gguf:
            logger.info(f"✅ Found existing GGUF file: {existing_gguf[0].name}")
            return existing_gguf[0]

        # Download with patterns
        downloaded_file = self._download_gguf_with_patterns(
            model_source.source_model, model_source.gguf_file_pattern, model_dir
        )

        if downloaded_file:
            # Handle multi-part files
            if "00001-of-" in downloaded_file.name:
                return downloaded_file
            if "-00002-of-" in downloaded_file.name or "-00003-of-" in downloaded_file.name:
                base_name = downloaded_file.name.replace("-00002-of-", "-00001-of-").replace(
                    "-00003-of-", "-00001-of-"
                )
                first_part = downloaded_file.parent / base_name
                if first_part.exists():
                    logger.info(f"🔄 Using first part: {first_part.name}")
                    return first_part

            # Rename single file to standard name
            downloaded_file.rename(f16_model)
            return f16_model

        # Fallback to regular conversion
        logger.info("💡 Falling back to downloading full repository and converting...")
        return self._handle_regular_repo(
            ModelSource(**{**model_source.dict(), "is_gguf_repo": False}),
            model_dir,
            None,
        )

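    # NOTE: illustrative sketch only. How the multi-part handling above behaves
    # for an assumed split GGUF download:
    #
    #     "model-Q8_0-00002-of-00003.gguf"
    #     -> returns "model-Q8_0-00001-of-00003.gguf" when that first part
    #        exists locally, since split GGUFs are loaded from part 00001.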
    def _download_gguf_with_patterns(
        self, source_model: str, pattern: str | None, model_dir: Path
    ) -> Path | None:
        """Download GGUF file using various pattern strategies.

        Tries multiple pattern variations to find and download appropriate
        GGUF files, handling timeouts and temporary directories.

        Returns:
            Path to downloaded file, or None if all patterns fail.
        """
        if pattern:
            patterns = [
                f"*{pattern}*",
                f"*{pattern.lower()}*",
                f"*{pattern.upper()}*",
                "*f16*",
                "*F16*",
                "*fp16*",
            ]
        else:
            patterns = ["*f16*", "*F16*", "*fp16*"]

        temp_dir = model_dir / "gguf_temp"

        for search_pattern in patterns:
            logger.info(f"🔍 Trying pattern: {search_pattern}")
            temp_dir.mkdir(exist_ok=True)

            try:
                subprocess.run(
                    [
                        "timeout",
                        "300",
                        "huggingface-cli",
                        "download",
                        source_model,
                        "--include",
                        search_pattern,
                        "--local-dir",
                        str(temp_dir),
                    ],
                    check=True,
                    capture_output=True,
                )

                # Find downloaded GGUF files
                gguf_files = self.fs.find_gguf_files(temp_dir, pattern)
                if gguf_files:
                    found_file = gguf_files[0]
                    logger.info(f"✅ Found GGUF file: {found_file.name}")

                    # Move to parent directory
                    final_path = model_dir / found_file.name
                    shutil.move(str(found_file), str(final_path))
                    shutil.rmtree(temp_dir)
                    return final_path

            except subprocess.CalledProcessError:
                logger.info(f"⚠️ Pattern {search_pattern} failed or timed out")
                continue
            finally:
                if temp_dir.exists():
                    shutil.rmtree(temp_dir, ignore_errors=True)

        return None

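    # NOTE: illustrative sketch only. The shell command the loop above
    # effectively runs for each pattern; repository name and paths are
    # assumed values.
    #
    #     timeout 300 huggingface-cli download org/some-model-GGUF \
    #         --include "*f16*" --local-dir models/some-model/gguf_temp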
    def _handle_regular_repo(
        self,
        model_source: ModelSource,
        model_dir: Path,
        llama_env: LlamaCppEnvironment | None,
    ) -> Path:
        """Handle regular HuggingFace repository conversion.

        Downloads full model repository and converts to F16 GGUF format
        using llama.cpp conversion scripts.

        Returns:
            Path to converted F16 GGUF model.
        """
        logger.info(f"⬇️ Downloading source model: {model_source.source_model}")

        if not model_dir.exists():
            subprocess.run(
                [
                    "huggingface-cli",
                    "download",
                    model_source.source_model,
                    "--local-dir",
                    str(model_dir),
                ],
                check=True,
            )
        else:
            logger.info("✅ Model already downloaded")

        logger.info("🔄 Converting to GGUF F16 format...")
        f16_model = model_dir / f"{model_source.model_name}-f16.gguf"

        if not f16_model.exists():
            if not llama_env:
                llama_env = self.environment_manager.setup()

            # Ensure conversion script is available
            if llama_env.use_repo or not self.environment_manager.llama_cpp_dir.exists():
                logger.info("Getting conversion script from llama.cpp repository...")
                llama_env = self.environment_manager.setup_repository()

            subprocess.run(
                [
                    *llama_env.convert_script.split(),
                    str(model_dir),
                    "--outtype",
                    "f16",
                    "--outfile",
                    str(f16_model),
                ],
                check=True,
            )
        else:
            logger.info("✅ F16 model already exists")

        return f16_model

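    # NOTE: illustrative sketch only. The conversion step above typically
    # expands to something like the following, assuming convert_script points
    # at llama.cpp's convert_hf_to_gguf.py.
    #
    #     python llama.cpp/convert_hf_to_gguf.py models/some-model \
    #         --outtype f16 --outfile models/some-model/some-model-f16.gguf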

class HuggingFaceUploader:
    """Handles uploading models and documentation to HuggingFace.

    Provides methods for repository creation, file uploads, and README
    updates with proper error handling and retry logic.
    """

    @staticmethod
    def get_username() -> str:
        """Get authenticated HuggingFace username.

        Returns:
            HuggingFace username from CLI authentication.

        Raises:
            RuntimeError: If not authenticated.
        """
        try:
            result = subprocess.run(
                ["huggingface-cli", "whoami"],
                capture_output=True,
                text=True,
                check=True,
            )
            return result.stdout.strip()
        except (subprocess.CalledProcessError, FileNotFoundError) as err:
            msg = "Please log in to HuggingFace first: huggingface-cli login"
            raise RuntimeError(msg) from err

    def upload_readme(self, output_repo: str, readme_path: Path) -> None:
        """Upload or update README file to repository.

        Creates repository if needed, handles existing repository updates.
        """
        logger.info("Uploading README...")
        try:
            subprocess.run(
                [
                    "huggingface-cli",
                    "upload",
                    output_repo,
                    str(readme_path),
                    "README.md",
                    "--create",
                ],
                check=True,
                capture_output=True,
            )
            logger.info("README uploaded")
        except subprocess.CalledProcessError:
            # Repository exists, update without --create
            subprocess.run(
                [
                    "huggingface-cli",
                    "upload",
                    output_repo,
                    str(readme_path),
                    "README.md",
                ],
                check=True,
            )
            logger.info("README updated")

    def upload_model_file(self, output_repo: str, model_path: Path) -> None:
        """Upload model file to repository.

        Uploads GGUF model file to specified repository path.
        """
        logger.info(f"Uploading {model_path.name}...")
        subprocess.run(
            [
                "huggingface-cli",
                "upload",
                output_repo,
                str(model_path),
                model_path.name,
            ],
            check=True,
        )
        logger.info(f"{model_path.name} uploaded")