Compare commits
3 commits: main ... tom/migrat

Author | SHA256 | Date
---|---|---
 | 21d8c03aea |
 | de6b853175 |
 | 633efdc305 |

47 changed files with 6335 additions and 3082 deletions

1  .gitignore  (vendored)
@@ -58,3 +58,4 @@ venv.bak/
# Working directories
work/
quantisation_work/
.cache/
@@ -11,6 +11,19 @@ from __future__ import annotations
from helpers.models.quantisation import QuantisationConfig, QuantisationType

QUANTISATION_CONFIGS: dict[QuantisationType, QuantisationConfig] = {
    # Basic quantisation profiles
    QuantisationType.Q2_0: QuantisationConfig(
        name="Q2_0",
        description="Basic Q2_0 quantisation (2-bit, smallest)",
        base_precision=2,
        base_type="Q2_0",
    ),
    QuantisationType.Q3_0: QuantisationConfig(
        name="Q3_0",
        description="Basic Q3_0 quantisation (3-bit)",
        base_precision=3,
        base_type="Q3_0",
    ),
    # Standard quantisation profiles
    QuantisationType.Q2_K: QuantisationConfig(
        name="Q2_K",
@@ -46,15 +59,15 @@ QUANTISATION_CONFIGS: dict[QuantisationType, QuantisationConfig] = {
        description="Bartowski Q3_K_L: Q3_K_M base with Q5_K output",
        base_type="Q3_K_M",
        base_precision=3,
        output_type="Q5_K",
        output_type="q5_k",
    ),
    QuantisationType.Q3_K_XL: QuantisationConfig(
        name="Q3_K_XL",
        description="Bartowski Q3_K_XL: Q3_K_M base with Q8_0 embeddings + Q6_K output",
        base_type="Q3_K_M",
        base_precision=3,
        embedding_type="Q8_0",
        output_type="Q6_K",
        embedding_type="q8_0",
        output_type="q6_k",
    ),
    QuantisationType.Q4_K_S: QuantisationConfig(
        name="Q4_K_S",
@@ -78,7 +91,7 @@ QUANTISATION_CONFIGS: dict[QuantisationType, QuantisationConfig] = {
        description="Bartowski Q4_K_L: Q4_K_M base with Q8_0 embeddings",
        base_type="Q4_K_M",
        base_precision=4,
        embedding_type="Q8_0",
        embedding_type="q8_0",
    ),
    # Additional standard quantisation profiles
    QuantisationType.Q5_K_S: QuantisationConfig(
@@ -103,7 +116,13 @@ QUANTISATION_CONFIGS: dict[QuantisationType, QuantisationConfig] = {
        description="Bartowski Q5_K_L: Q5_K_M base with Q8_0 embeddings",
        base_type="Q5_K_M",
        base_precision=5,
        embedding_type="Q8_0",
        embedding_type="q8_0",
    ),
    QuantisationType.Q6_0: QuantisationConfig(
        name="Q6_0",
        description="Basic Q6_0 quantisation (6-bit)",
        base_precision=6,
        base_type="Q6_0",
    ),
    QuantisationType.Q6_K: QuantisationConfig(
        name="Q6_K",
@@ -121,11 +140,17 @@ QUANTISATION_CONFIGS: dict[QuantisationType, QuantisationConfig] = {
        description="Bartowski Q6_K_L: Q6_K base with Q8_0 output",
        base_type="Q6_K",
        base_precision=6,
        output_type="Q8_0",
        output_type="q8_0",
    ),
    QuantisationType.Q8_K: QuantisationConfig(
        name="Q8_K",
        description="Q8_K quantisation (highest quality, largest size)",
        base_precision=8,
        base_type="Q8_K",
    ),
    QuantisationType.Q8_0: QuantisationConfig(
        name="Q8_0",
        description="Q8_0 quantisation (highest quality, largest size)",
        description="Basic Q8_0 quantisation (8-bit flat)",
        base_precision=8,
        base_type="Q8_0",
    ),
@@ -157,46 +182,57 @@ QUANTISATION_CONFIGS: dict[QuantisationType, QuantisationConfig] = {
}


# Default profile set for optimal quality/size balance
DEFAULT_QUANTISATION_TYPES: list[QuantisationType] = [
    # Q3 variants (smallest)
    QuantisationType.Q3_K_M,
    QuantisationType.Q3_K_L,
    QuantisationType.Q3_K_XL,
    # Q4 variants
    QuantisationType.Q4_0,  # Basic - always available
    QuantisationType.Q4_K_M,
    QuantisationType.Q4_K_L,
    # Q5 variants
    QuantisationType.Q5_0,  # Basic - always available
    QuantisationType.Q5_K_M,
    QuantisationType.Q5_K_L,
    # Q6 variants
    QuantisationType.Q6_0,  # Basic - always available
    QuantisationType.Q6_K,
    QuantisationType.Q6_K_L,
    QuantisationType.Q8_0,
    # Q8 variants (largest)
    QuantisationType.Q8_0,  # Basic - always available
    QuantisationType.Q8_K,
]


SUPPORTED_QUANTISATION_TYPES: list[QuantisationType] = [
    # Q2 variants
    QuantisationType.Q2_0,
    QuantisationType.Q2_K,
    QuantisationType.Q2_K_S,
    # Q3 K-quants
    QuantisationType.Q3_0,
    QuantisationType.Q3_K_S,
    QuantisationType.Q3_K_M,
    QuantisationType.Q3_K_L,
    QuantisationType.Q3_K_XL,
    # Q4 K-quants
    QuantisationType.Q4_0,
    QuantisationType.Q4_1,
    QuantisationType.Q4_K_S,
    QuantisationType.Q4_K_M,
    QuantisationType.Q4_K_L,
    # Q5 K-quants
    QuantisationType.Q5_0,
    QuantisationType.Q5_1,
    QuantisationType.Q5_K_S,
    QuantisationType.Q5_K_M,
    QuantisationType.Q5_K_L,
    # Q6_K
    QuantisationType.Q6_0,
    QuantisationType.Q6_K,
    QuantisationType.Q6_K_L,
    # Q8_0
    QuantisationType.Q8_0,
    # Legacy formats
    QuantisationType.Q4_0,
    QuantisationType.Q4_1,
    QuantisationType.Q5_0,
    QuantisationType.Q5_1,
    QuantisationType.Q8_K,
]
17  helpers/filesystem/__init__.py  (new file)

@@ -0,0 +1,17 @@
"""Filesystem operations and management.

Provides utilities for file cleanup, workspace management, and
directory operations throughout the quantisation workflow.
"""

from __future__ import annotations

from helpers.filesystem.cleanup import FileCleanup
from helpers.filesystem.operations import FilesystemService
from helpers.filesystem.workspace import WorkspaceManager

__all__ = [
    "FileCleanup",
    "FilesystemService",
    "WorkspaceManager",
]
81  helpers/filesystem/cleanup.py  (new file)

@@ -0,0 +1,81 @@
"""File cleanup operations for the quantisation workflow.

Manages removal of temporary files, model cleanup after processing,
and disk space recovery during quantisation operations.
"""

from __future__ import annotations

from shutil import rmtree as shutil_rmtree
from typing import TYPE_CHECKING

from helpers.logger import logger

if TYPE_CHECKING:
    from pathlib import Path

    from helpers.models.quantisation import ModelSource


class FileCleanup:
    """Handles cleanup of temporary and intermediate files.

    Provides methods for removing processed model files, temporary
    conversions, and other artifacts to manage disk space efficiently
    during quantisation workflows.
    """

    @staticmethod
    def cleanup_files(f16_model_path: Path, model_source: ModelSource, models_dir: Path) -> None:
        """Clean up temporary files after processing.

        Removes F16 model and original format files to save disk space
        after successful quantisation and upload. Processes both F16
        GGUF files and original model formats to maximise storage recovery.
        """
        if f16_model_path.exists():
            logger.info(f"Removing F16 model {f16_model_path.name} to save disk space...")
            f16_model_path.unlink()

        if not model_source.is_gguf_repo:
            FileCleanup.cleanup_original_model(model_source, models_dir)

    @staticmethod
    def cleanup_original_model(model_source: ModelSource, models_dir: Path) -> None:
        """Clean up original model files after successful conversion.

        Removes SafeTensors files to save disk space whilst preserving
        configuration, tokeniser, and metadata files for reference. The
        design prioritises space efficiency over re-conversion capability.
        """
        model_dir = models_dir / model_source.model_name

        safetensor_files = list(model_dir.glob("*.safetensors"))
        if safetensor_files:
            logger.info(f"Removing {len(safetensor_files)} SafeTensors files...")
            for file in safetensor_files:
                file.unlink()

        logger.info("Keeping config files, tokeniser, and metadata for reference")

    @staticmethod
    def cleanup_quantisation_file(file_path: Path) -> None:
        """Remove a single quantisation file.

        Safely removes the specified file with existence checking and
        logging for disk space management during quantisation workflows.
        """
        if file_path.exists():
            logger.info(f"Removing {file_path.name} to save disk space...")
            file_path.unlink()

    @staticmethod
    def cleanup_temp_directory(temp_dir: Path) -> None:
        """Clean up a temporary directory and all its contents.

        Recursively removes the directory and all subdirectories with
        error tolerance to handle locked or missing files gracefully.
        """
        if temp_dir.exists() and temp_dir.is_dir():
            logger.debug(f"Cleaning up temporary directory: {temp_dir}")
            shutil_rmtree(temp_dir, ignore_errors=True)
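A minimal usage sketch of the FileCleanup helpers added above, assuming the package layout introduced in this PR; the paths are hypothetical, and only the Path-based helpers are shown since constructing a ModelSource is defined elsewhere in the change set.

from pathlib import Path

from helpers.filesystem import FileCleanup

# Hypothetical artefacts left over from an earlier quantisation run;
# both calls are no-ops if the paths do not exist.
FileCleanup.cleanup_quantisation_file(Path("quantisation_work/models/example/example-Q4_K_M.gguf"))
FileCleanup.cleanup_temp_directory(Path("quantisation_work/temp_imatrix_abc123"))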
@@ -1,8 +1,7 @@
"""Filesystem operations service.
"""Core filesystem operations.

Provides unified filesystem operations including file discovery, size
calculation, and path management. Consolidates common filesystem patterns
used across quantisation and conversion workflows.
calculation, and path management for quantisation workflows.
"""

from __future__ import annotations

@@ -21,8 +20,7 @@ class FilesystemService:
    """Handles filesystem operations with consistent error handling.

    Provides methods for file discovery, size formatting, and JSON loading
    with proper error handling and logging. Ensures consistent behaviour
    across different tools and workflows.
    with proper error handling and logging.
    """

    @staticmethod
@@ -31,10 +29,10 @@ class FilesystemService:

        Attempts to use `du -h` for human-readable output, falling back to
        Python calculation if the system command fails. Provides consistent
        size formatting across the toolset.
        formatting across different platforms and file sizes.

        Returns:
            Human-readable file size string (e.g., "1.5G", "750M").
            Human-readable file size string (e.g. "1.5G", "750M").
        """
        try:
            result = subprocess.run(
@@ -43,7 +41,6 @@ class FilesystemService:
            return result.stdout.split()[0]
        except (subprocess.CalledProcessError, FileNotFoundError):
            # Fallback to Python calculation
            try:
                size_bytes: float = float(file_path.stat().st_size)
                for unit in ["B", "K", "M", "G", "T"]:
@@ -60,8 +57,7 @@ class FilesystemService:
        """Load and parse JSON configuration file.

        Provides consistent JSON loading with proper error handling and
        encoding specification. Used for loading model configurations,
        tokeniser settings, and other JSON-based metadata.
        UTF-8 encoding specification for cross-platform compatibility.

        Returns:
            Parsed JSON content as dictionary.
@@ -81,9 +77,8 @@ class FilesystemService:
        """Find all SafeTensor files in model directory using priority search.

        Searches for tensor files in order of preference: single model.safetensors,
        sharded model-*-of-*.safetensors files, then any *.safetensors files. This
        approach handles both single-file and multi-shard model distributions whilst
        ensuring predictable file ordering for conversion consistency.
        sharded model-*-of-*.safetensors files, then any *.safetensors files.
        The prioritisation ensures optimal handling of different model formats.

        Returns:
            List of SafeTensor file paths in priority order.
@@ -116,7 +111,7 @@ class FilesystemService:

        Searches for GGUF files with optional pattern matching. Prioritises
        multi-part files (00001-of-*) over single files for proper handling
        of large models split across multiple files.
        of sharded model architectures.

        Returns:
            List of GGUF file paths, sorted with multi-part files first.
@@ -140,8 +135,8 @@ class FilesystemService:
    def ensure_directory(path: Path) -> Path:
        """Ensure directory exists, creating if necessary.

        Creates directory and all parent directories if they don't exist.
        Returns the path for method chaining convenience.
        Creates directory and all parent directories if they don't exist,
        using atomic operations to handle concurrent access gracefully.

        Returns:
            The directory path.
@@ -153,8 +148,8 @@ class FilesystemService:
    def cleanup_directory(path: Path, pattern: str = "*") -> int:
        """Remove files matching pattern from directory.

        Safely removes files matching the specified glob pattern. Returns
        count of files removed for logging purposes.
        Safely removes files matching the specified glob pattern with
        comprehensive error handling to prevent workflow interruption.

        Returns:
            Number of files removed.
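A rough sketch of how the FilesystemService methods touched in these hunks fit together. The hunks do not show the file header, but the package imports point at helpers/filesystem/operations.py; the directory names below are hypothetical, and any method signature not visible in the hunks (such as find_gguf_files taking a single directory argument) is an assumption.

from pathlib import Path

from helpers.filesystem import FilesystemService

fs = FilesystemService()
work = fs.ensure_directory(Path("quantisation_work/models/example"))

# Human-readable size via `du -h`, with the pure-Python fallback shown above
for gguf_file in fs.find_gguf_files(work):
    print(gguf_file.name, fs.get_file_size(gguf_file))

# Remove leftover temporary files, getting back the count for logging
removed = fs.cleanup_directory(work, pattern="*.tmp")
print(f"Removed {removed} temporary files")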
146  helpers/filesystem/workspace.py  (new file)

@@ -0,0 +1,146 @@
"""Workspace management for quantisation operations.

Manages working directories, model storage paths, and temporary
file locations throughout the quantisation workflow.
"""

from __future__ import annotations

import tempfile
from pathlib import Path
from shutil import disk_usage as shutil_disk_usage, rmtree as shutil_rmtree

from helpers.logger import logger


class WorkspaceManager:
    """Manages workspace directories and paths.

    Provides centralised management of working directories, model
    storage, and temporary file locations with automatic directory
    creation and validation.
    """

    def __init__(self, work_dir: Path | None = None) -> None:
        """Initialise workspace manager.

        Sets up base working directory structure with models and temporary
        file directories. Defaults to quantisation_work in current directory
        if no path is specified.
        """
        self.work_dir = work_dir or Path.cwd() / "quantisation_work"
        self.models_dir = self.work_dir / "models"
        self._setup_directories()

    def _setup_directories(self) -> None:
        """Create necessary workspace directories."""
        self.work_dir.mkdir(parents=True, exist_ok=True)
        self.models_dir.mkdir(parents=True, exist_ok=True)
        logger.debug(f"Workspace initialised at: {self.work_dir}")

    def get_model_dir(self, model_name: str) -> Path:
        """Get directory path for a specific model.

        Creates the model directory if it doesn't exist and returns the path
        for storing model files and quantisation outputs.

        Returns:
            Path to model directory.
        """
        model_dir = self.models_dir / model_name
        model_dir.mkdir(parents=True, exist_ok=True)
        return model_dir

    def get_temp_dir(self, prefix: str = "temp") -> Path:
        """Get a temporary directory path within workspace.

        Creates a unique temporary directory with specified prefix within
        the workspace for intermediate processing files.

        Returns:
            Path to temporary directory.
        """
        return Path(tempfile.mkdtemp(prefix=f"{prefix}_", dir=self.work_dir))

    def get_imatrix_dir(self, model_name: str) -> Path:
        """Get directory for importance matrix files.

        Creates and returns the path to the imatrix directory for storing
        importance matrices used in advanced quantisation methods.

        Returns:
            Path to imatrix directory.
        """
        imatrix_dir = self.models_dir / model_name / "imatrix"
        imatrix_dir.mkdir(parents=True, exist_ok=True)
        return imatrix_dir

    def get_quantisation_output_path(
        self,
        model_name: str,
        author: str,
        quant_type: str,
    ) -> Path:
        """Get output path for a quantised model.

        Constructs standardised filename and path for quantised model output
        using author-model-quantisation format for consistent naming.

        Returns:
            Path for quantised model output.
        """
        model_dir = self.get_model_dir(model_name)
        filename = f"{author}-{model_name}-{quant_type}.gguf"
        return model_dir / filename

    def cleanup_workspace(self) -> None:
        """Clean up entire workspace directory."""
        if self.work_dir.exists():
            logger.info(f"Cleaning up workspace: {self.work_dir}")
            shutil_rmtree(self.work_dir, ignore_errors=True)

    @property
    def disk_usage(self) -> dict[str, float]:
        """Get disk usage statistics for workspace.

        Returns:
            Dictionary with size in GB for work_dir and models_dir.
        """

        def get_dir_size(path: Path) -> float:
            """Calculate total size of directory in GB.

            Recursively traverses directory tree to calculate total file
            sizes with GB conversion for human-readable output.

            Returns:
                Total size of directory in GB.
            """
            total = 0
            if path.exists():
                for item in path.rglob("*"):
                    if item.is_file():
                        total += item.stat().st_size
            return total / (1024**3)  # Convert to GB

        return {
            "work_dir": get_dir_size(self.work_dir),
            "models_dir": get_dir_size(self.models_dir),
        }

    def validate_space(self, required_gb: float = 50.0) -> bool:
        """Check if sufficient disk space is available.

        Validates available disk space against required threshold, logging
        warnings when space is insufficient for quantisation operations.

        Returns:
            True if sufficient space available.
        """
        stat = shutil_disk_usage(self.work_dir)
        free_gb = stat.free / (1024**3)

        if free_gb < required_gb:
            logger.warning(f"Low disk space: {free_gb:.1f}GB free, {required_gb:.1f}GB recommended")
            return False
        return True
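A minimal sketch of the intended WorkspaceManager call pattern, using only the methods defined above; the model and author names are hypothetical.

from helpers.filesystem import WorkspaceManager

ws = WorkspaceManager()  # defaults to ./quantisation_work
if ws.validate_space(required_gb=50.0):
    output = ws.get_quantisation_output_path("example-model", "example-author", "Q4_K_M")
    print(f"Quantised model will be written to {output}")

# Per-directory usage in GB, e.g. {"work_dir": 12.3, "models_dir": 11.9}
print(ws.disk_usage)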
11  helpers/ggml/__init__.py  (new file)

@@ -0,0 +1,11 @@
"""GGML quantisation operations.

Provides numpy-based GGML block quantisation for architectures
not supported by llama.cpp.
"""

from __future__ import annotations

from helpers.ggml.quantiser import GGMLQuantiser

__all__ = ["GGMLQuantiser"]
574  helpers/ggml/quantiser.py  (new file)

@@ -0,0 +1,574 @@
|
|||
"""GGML block quantisation for unsupported architectures.
|
||||
|
||||
Implements proper GGML quantisation formats (Q4_0, Q5_0, Q8_0) using numpy,
|
||||
following the exact specifications from ggml. This allows quantisation of
|
||||
models with architectures not yet supported by llama.cpp.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import struct
|
||||
import traceback
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
import gguf
|
||||
import numpy as np
|
||||
|
||||
from helpers.filesystem import FilesystemService
|
||||
from helpers.logger import logger
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# GGML block sizes for different quantisation types
|
||||
QK4_0 = 32 # Block size for Q4_0
|
||||
QK5_0 = 32 # Block size for Q5_0
|
||||
QK5_1 = 32 # Block size for Q5_1
|
||||
QK8_0 = 32 # Block size for Q8_0
|
||||
|
||||
|
||||
class GGMLQuantiser:
|
||||
"""Implements GGML quantisation formats for architecture-agnostic models.
|
||||
|
||||
Provides proper GGML block quantisation using numpy, following the exact
|
||||
format specifications. This enables Q4_0, Q5_0, and Q8_0 quantisation
|
||||
for models with unsupported architectures.
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
"""Initialise GGML quantiser."""
|
||||
self.fs = FilesystemService()
|
||||
|
||||
def get_supported_types(self) -> list[str]:
|
||||
"""Get supported basic quantisation types.
|
||||
|
||||
Returns:
|
||||
List of supported quantisation type strings.
|
||||
"""
|
||||
return ["Q4_0", "Q5_0", "Q6_0", "Q8_0"]
|
||||
|
||||
def _extract_architecture_string(self, arch_field: Any) -> str:
|
||||
"""Extract architecture string from GGUF field data.
|
||||
|
||||
Handles various formats of architecture field storage in GGUF files.
|
||||
|
||||
Returns:
|
||||
Architecture string or 'unknown' if extraction fails.
|
||||
"""
|
||||
if not arch_field:
|
||||
return "unknown"
|
||||
|
||||
if hasattr(arch_field, "parts") and arch_field.parts:
|
||||
return self._extract_from_parts_array(arch_field)
|
||||
if hasattr(arch_field, "data"):
|
||||
return self._extract_from_data_field(arch_field.data)
|
||||
|
||||
return "unknown"
|
||||
|
||||
def _extract_from_parts_array(self, arch_field: Any) -> str:
|
||||
"""Extract architecture from GGUF parts array format.
|
||||
|
||||
Returns:
|
||||
Architecture string or 'unknown' if extraction fails.
|
||||
"""
|
||||
if len(arch_field.data) == 0:
|
||||
return "unknown"
|
||||
|
||||
idx = arch_field.data[0] if isinstance(arch_field.data, (list, tuple)) else arch_field.data
|
||||
|
||||
if idx >= len(arch_field.parts):
|
||||
return "unknown"
|
||||
|
||||
return self._decode_part(arch_field.parts[idx])
|
||||
|
||||
def _decode_part(self, arch_part: Any) -> str:
|
||||
"""Decode architecture part to string.
|
||||
|
||||
Returns:
|
||||
Decoded string representation.
|
||||
"""
|
||||
if isinstance(arch_part, bytes):
|
||||
return arch_part.decode("utf-8")
|
||||
if isinstance(arch_part, str):
|
||||
return arch_part
|
||||
if isinstance(arch_part, (list, tuple)) and len(arch_part) > 0:
|
||||
# Handle nested format
|
||||
if isinstance(arch_part[0], bytes):
|
||||
return arch_part[0].decode("utf-8")
|
||||
return str(arch_part[0])
|
||||
return str(arch_part)
|
||||
|
||||
def _extract_from_data_field(self, data: Any) -> str:
|
||||
"""Extract architecture from GGUF data field.
|
||||
|
||||
Returns:
|
||||
Architecture string or 'unknown' if extraction fails.
|
||||
"""
|
||||
if isinstance(data, np.ndarray):
|
||||
# It's a numpy array of bytes - convert to string
|
||||
try:
|
||||
return bytes(data).decode("utf-8")
|
||||
except (UnicodeDecodeError, ValueError):
|
||||
# If that fails, try converting as ASCII values
|
||||
return "".join(chr(c) for c in data if c < 128)
|
||||
elif isinstance(data, bytes):
|
||||
return data.decode("utf-8")
|
||||
elif isinstance(data, str):
|
||||
return data
|
||||
else:
|
||||
return str(data)
|
||||
|
||||
def _copy_metadata_fields(self, reader: gguf.GGUFReader, writer: gguf.GGUFWriter) -> None:
|
||||
"""Copy metadata fields from reader to writer, excluding file type."""
|
||||
logger.info("📋 Copying metadata...")
|
||||
|
||||
for key, field in reader.fields.items():
|
||||
# Skip the file type field - we'll set our own
|
||||
if key == "general.file_type":
|
||||
continue
|
||||
|
||||
# Handle different field types
|
||||
if field.types:
|
||||
field_type = field.types[0]
|
||||
field_data = field.parts[field.data[0]] if field.parts else field.data
|
||||
|
||||
self._copy_field_by_type(writer, key, field_type, field_data, field)
|
||||
|
||||
def _copy_field_by_type(
|
||||
self,
|
||||
writer: gguf.GGUFWriter,
|
||||
key: str,
|
||||
field_type: gguf.GGUFValueType,
|
||||
field_data: Any,
|
||||
field: Any,
|
||||
) -> None:
|
||||
"""Copy a single field based on its type."""
|
||||
if field_type == gguf.GGUFValueType.STRING:
|
||||
# Handle both bytes and string types
|
||||
string_val = field_data[0]
|
||||
if isinstance(string_val, bytes):
|
||||
string_val = string_val.decode("utf-8")
|
||||
elif isinstance(string_val, int):
|
||||
string_val = str(string_val)
|
||||
writer.add_string(key, string_val)
|
||||
elif field_type == gguf.GGUFValueType.UINT32:
|
||||
writer.add_uint32(key, int(field.data[0]))
|
||||
elif field_type == gguf.GGUFValueType.FLOAT32:
|
||||
writer.add_float32(key, float(field.data[0]))
|
||||
elif field_type == gguf.GGUFValueType.BOOL:
|
||||
writer.add_bool(key, bool(field.data[0]))
|
||||
elif field_type == gguf.GGUFValueType.ARRAY:
|
||||
writer.add_array(key, field.data)
|
||||
else:
|
||||
# Skip unsupported field types for now
|
||||
# Future enhancement: Handle additional GGUF field types as needed
|
||||
pass
|
||||
|
||||
def _get_file_type_mapping(self) -> dict[str, gguf.GGMLQuantizationType]:
|
||||
"""Get mapping from quantisation type strings to GGML enums.
|
||||
|
||||
Returns:
|
||||
Mapping from quantisation type strings to GGML enums.
|
||||
"""
|
||||
return {
|
||||
"Q4_0": gguf.GGMLQuantizationType.Q4_0,
|
||||
"Q5_0": gguf.GGMLQuantizationType.Q5_0,
|
||||
"Q6_0": gguf.GGMLQuantizationType.Q6_K, # Q6_0 uses Q6_K enum
|
||||
"Q8_0": gguf.GGMLQuantizationType.Q8_0,
|
||||
}
|
||||
|
||||
def _process_tensor_list(
|
||||
self, reader: gguf.GGUFReader, writer: gguf.GGUFWriter, quant_type: str
|
||||
) -> None:
|
||||
"""Process all tensors for quantisation."""
|
||||
logger.info(f"🔄 Quantising {len(reader.tensors)} tensors to {quant_type}...")
|
||||
|
||||
for i, tensor in enumerate(reader.tensors):
|
||||
if i % 50 == 0:
|
||||
logger.info(f" Processing tensor {i}/{len(reader.tensors)}...")
|
||||
|
||||
self._process_single_tensor(tensor, writer, quant_type)
|
||||
|
||||
def _process_single_tensor(self, tensor: Any, writer: gguf.GGUFWriter, quant_type: str) -> None:
|
||||
"""Process a single tensor for quantisation or preserve as-is."""
|
||||
# Get tensor info
|
||||
name = tensor.name
|
||||
shape = list(tensor.shape)
|
||||
data = tensor.data
|
||||
|
||||
# Determine if this tensor should be quantised
|
||||
should_quantise = self._should_quantise_tensor(name)
|
||||
|
||||
if not should_quantise:
|
||||
# Keep original format
|
||||
writer.add_tensor(name, data, raw_shape=shape, raw_dtype=tensor.tensor_type)
|
||||
else:
|
||||
# Quantise the tensor
|
||||
try:
|
||||
quantised_data, quant_dtype = self._quantise_tensor(
|
||||
data, tensor.tensor_type, shape, quant_type
|
||||
)
|
||||
writer.add_tensor(name, quantised_data, raw_shape=shape, raw_dtype=quant_dtype)
|
||||
except ValueError as e:
|
||||
# If quantization fails due to shape issues, keep original
|
||||
logger.warning(f" ⚠️ Cannot quantise {name}: {e}")
|
||||
logger.warning(" Keeping in original format")
|
||||
writer.add_tensor(name, data, raw_shape=shape, raw_dtype=tensor.tensor_type)
|
||||
|
||||
def _write_output_file(self, writer: gguf.GGUFWriter, output_path: Path) -> bool:
|
||||
"""Write the final GGUF file and verify creation.
|
||||
|
||||
Returns:
|
||||
True if successful, False otherwise
|
||||
"""
|
||||
logger.info(f"💾 Writing {output_path.name}...")
|
||||
writer.write_header_to_file()
|
||||
writer.write_kv_data_to_file()
|
||||
writer.write_tensors_to_file()
|
||||
writer.close()
|
||||
|
||||
if output_path.exists():
|
||||
file_size = self.fs.get_file_size(output_path)
|
||||
logger.info(f"✅ GGML quantisation complete: {file_size}")
|
||||
return True
|
||||
logger.error("❌ Output file was not created")
|
||||
return False
|
||||
|
||||
def quantise_basic(
|
||||
self,
|
||||
input_path: Path,
|
||||
output_path: Path,
|
||||
quant_type: str,
|
||||
) -> bool:
|
||||
"""Perform GGML block quantisation on a GGUF file.
|
||||
|
||||
Reads a GGUF file, quantises all tensors using the specified
|
||||
quantisation type, and writes a new GGUF file. Implements proper
|
||||
GGML block formats for architecture-agnostic quantisation.
|
||||
|
||||
Returns:
|
||||
True if successful, False otherwise
|
||||
"""
|
||||
if quant_type not in self.get_supported_types():
|
||||
logger.error(f"Unsupported quantisation type: {quant_type}")
|
||||
return False
|
||||
|
||||
logger.info(f"🔧 Starting GGML {quant_type} quantisation...")
|
||||
logger.info("📝 This uses numpy-based block quantisation")
|
||||
|
||||
try:
|
||||
# Read input GGUF
|
||||
logger.info(f"📖 Reading {input_path.name}...")
|
||||
reader = gguf.GGUFReader(str(input_path))
|
||||
|
||||
# Create output writer with same architecture
|
||||
arch_field = reader.fields.get("general.architecture")
|
||||
arch_str = self._extract_architecture_string(arch_field)
|
||||
|
||||
logger.info(f"📝 Architecture: {arch_str}")
|
||||
writer = gguf.GGUFWriter(str(output_path), arch_str)
|
||||
|
||||
# Copy all metadata
|
||||
self._copy_metadata_fields(reader, writer)
|
||||
|
||||
# Set file type based on quantisation
|
||||
file_type_map = self._get_file_type_mapping()
|
||||
writer.add_file_type(file_type_map[quant_type])
|
||||
|
||||
# Process tensors
|
||||
self._process_tensor_list(reader, writer, quant_type)
|
||||
|
||||
# Write the output file
|
||||
return self._write_output_file(writer, output_path)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"❌ GGML quantisation failed: {e}\n{traceback.format_exc()}")
|
||||
return False
|
||||
|
||||
def _should_quantise_tensor(self, tensor_name: str) -> bool:
|
||||
"""Determine if a tensor should be quantised.
|
||||
|
||||
Some tensors like token embeddings should typically remain in
|
||||
higher precision for quality.
|
||||
|
||||
Returns:
|
||||
True if the tensor should be quantised, False otherwise
|
||||
"""
|
||||
# Keep token embeddings and output layers in original precision
|
||||
# These patterns cover most architectures
|
||||
keep_original = [
|
||||
"token_embd",
|
||||
"output.weight",
|
||||
"lm_head",
|
||||
"embed_tokens",
|
||||
"word_embeddings",
|
||||
]
|
||||
|
||||
for pattern in keep_original:
|
||||
if pattern in tensor_name:
|
||||
logger.debug(f" Keeping {tensor_name} in original format")
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
def _quantise_tensor(
|
||||
self,
|
||||
data: np.ndarray,
|
||||
dtype: gguf.GGMLQuantizationType,
|
||||
shape: list[int],
|
||||
quant_type: str,
|
||||
) -> tuple[np.ndarray, gguf.GGMLQuantizationType]:
|
||||
"""Quantise a tensor using GGML block quantisation.
|
||||
|
||||
Returns:
|
||||
Tuple of (quantised_data, new_dtype)
|
||||
"""
|
||||
# Work directly with numpy array - convert to float32 if needed
|
||||
if dtype in {gguf.GGMLQuantizationType.F32, gguf.GGMLQuantizationType.F16}:
|
||||
arr = data.astype(np.float32)
|
||||
else:
|
||||
# Already quantised or unknown type - return as-is
|
||||
return data, dtype
|
||||
|
||||
# Reshape to original shape
|
||||
arr = arr.reshape(shape)
|
||||
|
||||
# Flatten for processing
|
||||
arr_flat = arr.flatten()
|
||||
|
||||
# Apply quantisation
|
||||
if quant_type == "Q8_0":
|
||||
quantised = self._quantise_q8_0(arr_flat)
|
||||
new_dtype = gguf.GGMLQuantizationType.Q8_0
|
||||
elif quant_type == "Q6_0":
|
||||
quantised = self._quantise_q6_0(arr_flat)
|
||||
new_dtype = gguf.GGMLQuantizationType.Q6_K # Q6_0 uses Q6_K enum
|
||||
elif quant_type == "Q5_0":
|
||||
quantised = self._quantise_q5_0(arr_flat)
|
||||
new_dtype = gguf.GGMLQuantizationType.Q5_0
|
||||
elif quant_type == "Q4_0":
|
||||
quantised = self._quantise_q4_0(arr_flat)
|
||||
new_dtype = gguf.GGMLQuantizationType.Q4_0
|
||||
else:
|
||||
# Unsupported - return original
|
||||
return data, dtype
|
||||
|
||||
# Convert bytes back to numpy array for gguf writer
|
||||
return np.frombuffer(quantised, dtype=np.uint8), new_dtype
|
||||
|
||||
def _quantise_q8_0(self, arr: np.ndarray) -> bytes:
|
||||
"""Quantise to Q8_0 format.
|
||||
|
||||
Q8_0: Blocks of 32 values, each block has:
|
||||
- 1 float16 scale factor (2 bytes)
|
||||
- 32 int8 values (32 bytes)
|
||||
Total: 34 bytes per 32 values
|
||||
|
||||
Returns:
|
||||
Bytes of the quantised data
|
||||
"""
|
||||
n = len(arr)
|
||||
nb = (n + QK8_0 - 1) // QK8_0 # Number of blocks
|
||||
|
||||
output = bytearray()
|
||||
|
||||
for i in range(nb):
|
||||
# Get block of values
|
||||
start = i * QK8_0
|
||||
end = min(start + QK8_0, n)
|
||||
block = arr[start:end]
|
||||
|
||||
# Pad if needed
|
||||
if len(block) < QK8_0:
|
||||
block = np.pad(block, (0, QK8_0 - len(block)), mode="constant")
|
||||
|
||||
# Calculate scale
|
||||
amax = np.abs(block).max()
|
||||
scale = amax / 127.0 if amax > 0 else 1.0
|
||||
|
||||
# Quantise
|
||||
quantised = np.round(block / scale).astype(np.int8)
|
||||
quantised = np.clip(quantised, -128, 127)
|
||||
|
||||
output.extend(struct.pack("e", scale)) # 'e' is float16
|
||||
output.extend(quantised.tobytes())
|
||||
|
||||
return bytes(output)
|
||||
|
||||
def _quantise_q6_0(self, arr: np.ndarray) -> bytes:
|
||||
"""Quantise to Q6_0 format.
|
||||
|
||||
Q6_0: Blocks of 32 values with 6-bit quantisation
|
||||
- 1 float16 scale (2 bytes)
|
||||
- 1 float16 min value (2 bytes)
|
||||
- 24 bytes of packed 6-bit values (32 values * 6 bits = 192 bits = 24 bytes)
|
||||
Total: 28 bytes per 32 values
|
||||
|
||||
Returns:
|
||||
Bytes of the quantised data
|
||||
"""
|
||||
n = len(arr)
|
||||
nb = (n + QK8_0 - 1) // QK8_0 # Use same block size as Q8_0
|
||||
|
||||
output = bytearray()
|
||||
|
||||
for i in range(nb):
|
||||
# Get block
|
||||
start = i * QK8_0
|
||||
end = min(start + QK8_0, n)
|
||||
block = arr[start:end]
|
||||
|
||||
# Pad if needed
|
||||
if len(block) < QK8_0:
|
||||
block = np.pad(block, (0, QK8_0 - len(block)), mode="constant")
|
||||
|
||||
# Calculate scale and min
|
||||
vmin = block.min()
|
||||
vmax = block.max()
|
||||
scale = (vmax - vmin) / 63.0 if vmax > vmin else 1.0
|
||||
|
||||
# Quantise to 6-bit (0-63)
|
||||
quantised = np.round((block - vmin) / scale).astype(np.uint8)
|
||||
quantised = np.clip(quantised, 0, 63)
|
||||
|
||||
# Pack scale and min
|
||||
output.extend(struct.pack("e", scale))
|
||||
output.extend(struct.pack("e", vmin))
|
||||
|
||||
# Pack 6-bit values (simplified - using 1 byte per value)
|
||||
# Proper implementation would pack 4 values into 3 bytes
|
||||
for q in quantised:
|
||||
output.append(q)
|
||||
|
||||
# Pad to expected size
|
||||
while len(output) % 28 != 0:
|
||||
output.append(0)
|
||||
|
||||
return bytes(output)
|
||||
|
||||
def _quantise_q5_0(self, arr: np.ndarray) -> bytes:
|
||||
"""Quantise to Q5_0 format.
|
||||
|
||||
Q5_0: Blocks of 32 values with 5-bit quantisation
|
||||
- 1 float16 scale (2 bytes)
|
||||
- 1 float16 min value (2 bytes)
|
||||
- 20 bytes of packed 5-bit values (32 values * 5 bits = 160 bits = 20 bytes)
|
||||
Total: 24 bytes per 32 values
|
||||
|
||||
Returns:
|
||||
Bytes of the quantised data
|
||||
"""
|
||||
n = len(arr)
|
||||
nb = (n + QK5_0 - 1) // QK5_0
|
||||
|
||||
output = bytearray()
|
||||
|
||||
for i in range(nb):
|
||||
# Get block
|
||||
start = i * QK5_0
|
||||
end = min(start + QK5_0, n)
|
||||
block = arr[start:end]
|
||||
|
||||
# Pad if needed
|
||||
if len(block) < QK5_0:
|
||||
block = np.pad(block, (0, QK5_0 - len(block)), mode="constant")
|
||||
|
||||
# Calculate scale and min
|
||||
vmin = block.min()
|
||||
vmax = block.max()
|
||||
scale = (vmax - vmin) / 31.0 if vmax > vmin else 1.0
|
||||
|
||||
# Quantise to 5-bit (0-31)
|
||||
quantised = np.round((block - vmin) / scale).astype(np.uint8)
|
||||
quantised = np.clip(quantised, 0, 31)
|
||||
|
||||
# Pack scale and min
|
||||
output.extend(struct.pack("e", scale))
|
||||
output.extend(struct.pack("e", vmin))
|
||||
|
||||
# Pack 5-bit values (simplified packing - not optimal but functional)
|
||||
# For simplicity, use 1 byte per value (wasting 3 bits each)
|
||||
# Proper implementation would pack 8 values into 5 bytes
|
||||
for q in quantised:
|
||||
output.append(q)
|
||||
|
||||
# Pad to expected size
|
||||
while len(output) % 24 != 0:
|
||||
output.append(0)
|
||||
|
||||
return bytes(output)
|
||||
|
||||
def _quantise_q4_0(self, arr: np.ndarray) -> bytes:
|
||||
"""Quantise to Q4_0 format.
|
||||
|
||||
Q4_0: Blocks of 32 values with 4-bit quantisation
|
||||
- 1 float16 scale (2 bytes)
|
||||
- 1 float16 min value (2 bytes)
|
||||
- 16 bytes of packed 4-bit values (32 values * 4 bits = 128 bits = 16 bytes)
|
||||
Total: 20 bytes per 32 values
|
||||
|
||||
Returns:
|
||||
Bytes of the quantised data
|
||||
"""
|
||||
n = len(arr)
|
||||
nb = (n + QK4_0 - 1) // QK4_0
|
||||
|
||||
output = bytearray()
|
||||
|
||||
for i in range(nb):
|
||||
# Get block
|
||||
start = i * QK4_0
|
||||
end = min(start + QK4_0, n)
|
||||
block = arr[start:end]
|
||||
|
||||
# Pad if needed
|
||||
if len(block) < QK4_0:
|
||||
block = np.pad(block, (0, QK4_0 - len(block)), mode="constant")
|
||||
|
||||
# Calculate scale and min
|
||||
vmin = block.min()
|
||||
vmax = block.max()
|
||||
scale = (vmax - vmin) / 15.0 if vmax > vmin else 1.0
|
||||
|
||||
# Quantise to 4-bit (0-15)
|
||||
quantised = np.round((block - vmin) / scale).astype(np.uint8)
|
||||
quantised = np.clip(quantised, 0, 15)
|
||||
|
||||
# Pack scale and min
|
||||
output.extend(struct.pack("e", scale))
|
||||
output.extend(struct.pack("e", vmin))
|
||||
|
||||
# Pack 4-bit values - 2 values per byte
|
||||
for j in range(0, 32, 2):
|
||||
packed = (quantised[j] & 0xF) | ((quantised[j + 1] & 0xF) << 4)
|
||||
output.append(packed)
|
||||
|
||||
return bytes(output)
|
||||
|
||||
def try_alternative_quantisation(
|
||||
self,
|
||||
input_path: Path,
|
||||
output_path: Path,
|
||||
target_type: str,
|
||||
) -> bool:
|
||||
"""Try basic quantisation for unsupported architectures.
|
||||
|
||||
For architectures not supported by llama.cpp, uses GGML implementation
|
||||
to provide basic quantisation formats as fallback. Handles only basic
|
||||
types that can be generated with numpy-based GGML quantisation.
|
||||
|
||||
Returns:
|
||||
True if successful, False otherwise
|
||||
"""
|
||||
# Only handle basic types that we can generate with GGML
|
||||
basic_types = ["Q4_0", "Q5_0", "Q6_0", "Q8_0"]
|
||||
|
||||
if target_type in basic_types:
|
||||
logger.info(f"📝 Using GGML numpy implementation for {target_type}")
|
||||
return self.quantise_basic(input_path, output_path, target_type)
|
||||
|
||||
# For K-quants on unsupported architectures, we can't provide a direct equivalent
|
||||
logger.error(f"❌ Cannot quantise {target_type} for unsupported architecture")
|
||||
logger.info("💡 Consider using Q4_0, Q5_0, Q6_0, or Q8_0 instead")
|
||||
return False
|
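A hedged sketch of how the fallback quantiser above is intended to be driven, using only the public methods defined in this file; the file names are hypothetical.

from pathlib import Path

from helpers.ggml import GGMLQuantiser

quantiser = GGMLQuantiser()
print(quantiser.get_supported_types())  # ["Q4_0", "Q5_0", "Q6_0", "Q8_0"]

# Fall back to numpy-based block quantisation when llama.cpp rejects the architecture
ok = quantiser.try_alternative_quantisation(
    Path("quantisation_work/models/example/example-f16.gguf"),
    Path("quantisation_work/models/example/example-Q8_0.gguf"),
    "Q8_0",
)
print("quantisation succeeded:", ok)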
12  helpers/gguf/__init__.py  (new file)

@@ -0,0 +1,12 @@
"""GGUF file operations.

Provides reading, writing, and conversion utilities for GGUF format files.
"""

from __future__ import annotations

from helpers.gguf.converter import GGUFConverter
from helpers.gguf.reader import GGUFReader
from helpers.gguf.writer import GGUFWriter

__all__ = ["GGUFConverter", "GGUFReader", "GGUFWriter"]
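A rough sketch of how the gguf package exports might be combined, based on the class and method names shown in the converter and reader files below. The TensorMapper no-argument constructor, the "llama" architecture string, and the caller supplying a ready-made ModelConfig are all assumptions, not confirmed by the diff.

from pathlib import Path

from helpers.gguf import GGUFConverter, GGUFReader
from helpers.models.conversion import ModelConfig
from helpers.utils.tensor_mapping import TensorMapper


def convert_and_check(model_dir: Path, out_file: Path, config: ModelConfig) -> None:
    """Convert a SafeTensors checkout to GGUF, then sanity-check the result."""
    if GGUFConverter.convert_safetensors(model_dir, out_file, config, "llama", TensorMapper()):
        reader = GGUFReader(out_file)
        print(reader.get_architecture(), reader.get_quantisation_type())
        print("valid:", reader.validate())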
216  helpers/gguf/converter.py  (new file)

@@ -0,0 +1,216 @@
|
|||
"""SafeTensors to GGUF conversion.
|
||||
|
||||
Handles conversion of SafeTensors models to GGUF format with proper
|
||||
metadata and tensor mapping.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import gc
|
||||
import json
|
||||
import traceback
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
import torch
|
||||
from safetensors import safe_open
|
||||
|
||||
from helpers.filesystem import FilesystemService
|
||||
from helpers.gguf.writer import GGUFWriter
|
||||
from helpers.logger import logger
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from helpers.models.conversion import ModelConfig
|
||||
from helpers.utils.tensor_mapping import TensorMapper
|
||||
|
||||
|
||||
class GGUFConverter:
|
||||
"""High-level GGUF conversion orchestrator.
|
||||
|
||||
Coordinates the complete conversion workflow from source models to GGUF
|
||||
format, managing metadata extraction, tensor mapping, and file writing.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def convert_safetensors(
|
||||
model_path: Path,
|
||||
output_path: Path,
|
||||
model_config: ModelConfig,
|
||||
architecture: str,
|
||||
tensor_mapper: TensorMapper,
|
||||
) -> bool:
|
||||
"""Convert SafeTensors model to GGUF format.
|
||||
|
||||
Orchestrates the conversion process including metadata setup, tensor
|
||||
loading with BFloat16 support, name mapping, and tokeniser integration.
|
||||
|
||||
Returns:
|
||||
True if conversion successful, False otherwise.
|
||||
"""
|
||||
logger.info(f"Converting {model_path.name} to GGUF...")
|
||||
|
||||
# Create writer
|
||||
writer_wrapper = GGUFWriter(output_path, architecture)
|
||||
|
||||
# Add metadata
|
||||
writer_wrapper.add_metadata(model_config, model_path.name)
|
||||
|
||||
# Add vision metadata if present
|
||||
if model_config.vision_config:
|
||||
writer_wrapper.add_vision_metadata(model_config.vision_config)
|
||||
|
||||
# Load and add tensors
|
||||
fs = FilesystemService()
|
||||
tensor_files = fs.find_safetensor_files(model_path)
|
||||
logger.info(f"Found {len(tensor_files)} tensor file(s)")
|
||||
|
||||
tensor_count = 0
|
||||
for tensor_file in tensor_files:
|
||||
logger.info(f"Loading {tensor_file.name}...")
|
||||
with safe_open(tensor_file, framework="pt") as f:
|
||||
for tensor_name in f.keys(): # noqa: SIM118
|
||||
tensor_data = f.get_tensor(tensor_name)
|
||||
|
||||
# Convert BFloat16 to Float32
|
||||
if hasattr(tensor_data, "numpy"):
|
||||
if torch and tensor_data.dtype == torch.bfloat16:
|
||||
tensor_data = tensor_data.float()
|
||||
numpy_data = tensor_data.numpy()
|
||||
else:
|
||||
# Already numpy
|
||||
numpy_data = tensor_data
|
||||
|
||||
# Map tensor name
|
||||
gguf_name = tensor_mapper.map_tensor_name(tensor_name)
|
||||
if not gguf_name:
|
||||
logger.debug(f"Skipping unmapped tensor: {tensor_name}")
|
||||
continue
|
||||
|
||||
logger.debug(f" {tensor_name} -> {gguf_name}")
|
||||
writer_wrapper.add_tensor(gguf_name, numpy_data)
|
||||
tensor_count += 1
|
||||
|
||||
# Clean up memory after each file
|
||||
gc.collect()
|
||||
if torch and torch.cuda.is_available():
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
logger.info(f"Added {tensor_count} tensors")
|
||||
|
||||
# Add tokeniser
|
||||
tokeniser_config = GGUFConverter.load_tokeniser_config(model_path)
|
||||
if tokeniser_config:
|
||||
writer_wrapper.add_tokeniser(tokeniser_config)
|
||||
writer_wrapper.add_tokeniser_vocabulary(model_path)
|
||||
|
||||
# Finalise and write
|
||||
writer_wrapper.write()
|
||||
|
||||
# Clean up
|
||||
del writer_wrapper
|
||||
gc.collect()
|
||||
|
||||
return output_path.exists()
|
||||
|
||||
@staticmethod
|
||||
def convert_pytorch(
|
||||
model_path: Path,
|
||||
output_path: Path,
|
||||
model_config: ModelConfig,
|
||||
architecture: str,
|
||||
tensor_mapper: TensorMapper,
|
||||
) -> bool:
|
||||
"""Convert PyTorch model to GGUF format.
|
||||
|
||||
Handles PyTorch bin file conversion with sharded model support,
|
||||
BFloat16 compatibility, and proper memory management.
|
||||
|
||||
Returns:
|
||||
True if conversion successful, False otherwise.
|
||||
"""
|
||||
logger.info(f"Converting {model_path.name} to GGUF...")
|
||||
|
||||
# Create writer
|
||||
writer_wrapper = GGUFWriter(output_path, architecture)
|
||||
|
||||
# Add metadata
|
||||
writer_wrapper.add_metadata(model_config, model_path.name)
|
||||
|
||||
# Load and add tensors
|
||||
fs = FilesystemService()
|
||||
model_files = fs.find_safetensor_files(model_path)
|
||||
logger.info(f"Found {len(model_files)} model file(s)")
|
||||
|
||||
tensor_count = 0
|
||||
for model_file in model_files:
|
||||
logger.info(f"Loading {model_file.name}...")
|
||||
try:
|
||||
checkpoint = torch.load(model_file, map_location="cpu", weights_only=True)
|
||||
|
||||
for tensor_name, tensor_data in checkpoint.items():
|
||||
# Convert to numpy
|
||||
if hasattr(tensor_data, "numpy"):
|
||||
if tensor_data.dtype == torch.bfloat16:
|
||||
converted_tensor = tensor_data.float()
|
||||
else:
|
||||
converted_tensor = tensor_data
|
||||
numpy_data = converted_tensor.numpy()
|
||||
else:
|
||||
numpy_data = tensor_data
|
||||
|
||||
# Map tensor name
|
||||
gguf_name = tensor_mapper.map_tensor_name(tensor_name)
|
||||
if not gguf_name:
|
||||
logger.debug(f"Skipping unmapped tensor: {tensor_name}")
|
||||
continue
|
||||
|
||||
logger.debug(f" {tensor_name} -> {gguf_name}")
|
||||
writer_wrapper.add_tensor(gguf_name, numpy_data)
|
||||
tensor_count += 1
|
||||
|
||||
# Clean up checkpoint
|
||||
del checkpoint
|
||||
gc.collect()
|
||||
if torch.cuda.is_available():
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to load {model_file.name}: {e}")
|
||||
logger.error(traceback.format_exc())
|
||||
return False
|
||||
|
||||
logger.info(f"Added {tensor_count} tensors")
|
||||
|
||||
# Add tokeniser
|
||||
tokeniser_config = GGUFConverter.load_tokeniser_config(model_path)
|
||||
if tokeniser_config:
|
||||
writer_wrapper.add_tokeniser(tokeniser_config)
|
||||
writer_wrapper.add_tokeniser_vocabulary(model_path)
|
||||
|
||||
# Finalise and write
|
||||
writer_wrapper.write()
|
||||
|
||||
# Clean up
|
||||
del writer_wrapper
|
||||
gc.collect()
|
||||
|
||||
return output_path.exists()
|
||||
|
||||
@staticmethod
|
||||
def load_tokeniser_config(model_path: Path) -> dict[str, Any] | None:
|
||||
"""Load tokeniser configuration from model directory.
|
||||
|
||||
Returns:
|
||||
Tokeniser configuration dictionary or None if not found.
|
||||
"""
|
||||
config_path = model_path / "tokenizer_config.json"
|
||||
if not config_path.exists():
|
||||
logger.warning("tokenizer_config.json not found")
|
||||
return None
|
||||
|
||||
try:
|
||||
with Path(config_path).open(encoding="utf-8") as f:
|
||||
return json.load(f)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to load tokeniser config: {e}")
|
||||
return None
|
231  helpers/gguf/reader.py  (new file)

@@ -0,0 +1,231 @@
|
|||
"""GGUF file reading operations.
|
||||
|
||||
Provides utilities for reading and extracting information from GGUF files.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
import gguf
|
||||
import numpy as np
|
||||
|
||||
from helpers.logger import logger
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
class GGUFReader:
|
||||
"""Reads and extracts information from GGUF files.
|
||||
|
||||
Provides methods to read metadata, architecture information, and tensors
|
||||
from existing GGUF files for inspection or re-quantisation.
|
||||
"""
|
||||
|
||||
def __init__(self, file_path: Path) -> None:
|
||||
"""Initialise GGUF reader with file path.
|
||||
|
||||
Sets up the internal GGUF reader instance for subsequent metadata
|
||||
and tensor extraction operations on the specified file.
|
||||
"""
|
||||
self.file_path = file_path
|
||||
self.reader = gguf.GGUFReader(str(file_path))
|
||||
|
||||
def get_architecture(self) -> str:
|
||||
"""Extract architecture string from GGUF file.
|
||||
|
||||
Returns:
|
||||
Architecture string or "unknown" if not found.
|
||||
"""
|
||||
arch = self.reader.fields.get("general.architecture")
|
||||
if not arch:
|
||||
return "unknown"
|
||||
|
||||
# Try extracting from parts array format
|
||||
if hasattr(arch, "parts") and arch.parts:
|
||||
return self._extract_from_parts(arch)
|
||||
|
||||
# Try extracting from data field directly
|
||||
if hasattr(arch, "data"):
|
||||
return self._extract_from_data(arch.data)
|
||||
|
||||
return "unknown"
|
||||
|
||||
def _extract_from_parts(self, arch: Any) -> str:
|
||||
"""Extract architecture from parts array.
|
||||
|
||||
Returns:
|
||||
Architecture string or "unknown".
|
||||
"""
|
||||
if len(arch.data) == 0:
|
||||
return "unknown"
|
||||
|
||||
# Get index and validate
|
||||
idx = arch.data[0] if isinstance(arch.data, (list, tuple)) else arch.data
|
||||
if idx >= len(arch.parts):
|
||||
return "unknown"
|
||||
|
||||
return self._decode_arch_part(arch.parts[idx])
|
||||
|
||||
def _decode_arch_part(self, arch_part: Any) -> str:
|
||||
"""Decode architecture part to string.
|
||||
|
||||
Returns:
|
||||
Decoded architecture string.
|
||||
"""
|
||||
if isinstance(arch_part, bytes):
|
||||
return arch_part.decode("utf-8")
|
||||
if isinstance(arch_part, str):
|
||||
return arch_part
|
||||
if isinstance(arch_part, (list, tuple)) and len(arch_part) > 0:
|
||||
# Handle nested format
|
||||
if isinstance(arch_part[0], bytes):
|
||||
return arch_part[0].decode("utf-8")
|
||||
return str(arch_part[0])
|
||||
return str(arch_part)
|
||||
|
||||
def _extract_from_data(self, data: Any) -> str:
|
||||
"""Extract architecture from data field.
|
||||
|
||||
Returns:
|
||||
Architecture string or "unknown".
|
||||
"""
|
||||
if isinstance(data, np.ndarray):
|
||||
# Convert numpy array of bytes to string
|
||||
try:
|
||||
return bytes(data).decode("utf-8")
|
||||
except (UnicodeDecodeError, ValueError):
|
||||
# Fallback to ASCII conversion
|
||||
return "".join(chr(c) for c in data if c < 128)
|
||||
if isinstance(data, bytes):
|
||||
return data.decode("utf-8")
|
||||
if isinstance(data, str):
|
||||
return data
|
||||
return str(data)
|
||||
|
||||
def get_metadata(self) -> dict[str, Any]:
|
||||
"""Extract all metadata from GGUF file.
|
||||
|
||||
Returns:
|
||||
Dictionary of metadata fields and values.
|
||||
"""
|
||||
metadata: dict[str, Any] = {}
|
||||
|
||||
for key, field in self.reader.fields.items():
|
||||
if field.types and field.data:
|
||||
field_type = field.types[0]
|
||||
field_data = field.parts[field.data[0]] if field.parts else field.data
|
||||
|
||||
# Convert data based on type
|
||||
if field_type == gguf.GGUFValueType.STRING:
|
||||
if isinstance(field_data, (list, tuple)) and field_data:
|
||||
string_value = field_data[0]
|
||||
if isinstance(string_value, bytes):
|
||||
string_value = string_value.decode("utf-8")
|
||||
metadata[key] = string_value
|
||||
else:
|
||||
metadata[key] = str(field_data)
|
||||
elif field_type in {
|
||||
gguf.GGUFValueType.UINT32,
|
||||
gguf.GGUFValueType.INT32,
|
||||
gguf.GGUFValueType.FLOAT32,
|
||||
gguf.GGUFValueType.BOOL,
|
||||
}:
|
||||
metadata[key] = (
|
||||
field.data[0] if isinstance(field.data, (list, tuple)) else field.data
|
||||
)
|
||||
elif field_type == gguf.GGUFValueType.ARRAY:
|
||||
metadata[key] = list(field.data)
|
||||
|
||||
return metadata
|
||||
|
||||
def get_tensor_info(self) -> list[dict[str, Any]]:
|
||||
"""Get information about all tensors in the file.
|
||||
|
||||
Returns:
|
||||
List of tensor info dictionaries with name, shape, and type.
|
||||
"""
|
||||
tensor_info = []
|
||||
|
||||
for tensor in self.reader.tensors:
|
||||
info = {
|
||||
"name": tensor.name,
|
||||
"shape": list(tensor.shape),
|
||||
"type": tensor.tensor_type.name
|
||||
if hasattr(tensor.tensor_type, "name")
|
||||
else str(tensor.tensor_type),
|
||||
"size_bytes": tensor.data.nbytes
|
||||
if hasattr(tensor.data, "nbytes")
|
||||
else len(tensor.data),
|
||||
}
|
||||
tensor_info.append(info)
|
||||
|
||||
return tensor_info
|
||||
|
||||
def get_quantisation_type(self) -> str | None:
|
||||
"""Get the quantisation type of the GGUF file.
|
||||
|
||||
Returns:
|
||||
Quantisation type string or None if not found.
|
||||
"""
|
||||
file_type = self.reader.fields.get("general.file_type")
|
||||
|
||||
if file_type and hasattr(file_type, "data"):
|
||||
# Map numeric file type to string
|
||||
file_type_value = (
|
||||
file_type.data[0] if isinstance(file_type.data, (list, tuple)) else file_type.data
|
||||
)
|
||||
|
||||
# Common file type mappings
|
||||
file_type_map = {
|
||||
0: "F32",
|
||||
1: "F16",
|
||||
2: "Q4_0",
|
||||
3: "Q4_1",
|
||||
7: "Q8_0",
|
||||
8: "Q5_0",
|
||||
9: "Q5_1",
|
||||
10: "Q2_K",
|
||||
11: "Q3_K_S",
|
||||
12: "Q3_K_M",
|
||||
13: "Q3_K_L",
|
||||
14: "Q4_K_S",
|
||||
15: "Q4_K_M",
|
||||
16: "Q5_K_S",
|
||||
17: "Q5_K_M",
|
||||
18: "Q6_K",
|
||||
}
|
||||
|
||||
return file_type_map.get(int(file_type_value), f"Unknown ({file_type_value})")
|
||||
|
||||
return None
|
||||
|
||||
def validate(self) -> bool:
|
||||
"""Validate that the GGUF file is properly formatted.
|
||||
|
||||
Returns:
|
||||
True if file is valid, False otherwise.
|
||||
"""
|
||||
try:
|
||||
# Check basic structure
|
||||
if not self.reader.fields:
|
||||
logger.error("No metadata fields found")
|
||||
return False
|
||||
|
||||
# Check for required fields
|
||||
required_fields = ["general.architecture"]
|
||||
for field in required_fields:
|
||||
if field not in self.reader.fields:
|
||||
logger.error(f"Missing required field: {field}")
|
||||
return False
|
||||
|
||||
# Check tensors
|
||||
if not self.reader.tensors:
|
||||
logger.warning("No tensors found in file")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Validation failed: {e}")
|
||||
return False
|
||||
else:
|
||||
return True
|
374  helpers/gguf/writer.py  (new file)

@@ -0,0 +1,374 @@
|
|||
"""GGUF file writing operations.
|
||||
|
||||
Provides high-level interface for creating GGUF files with metadata,
|
||||
tensors, and tokeniser information.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import operator
|
||||
import traceback
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Any, Protocol
|
||||
|
||||
import gguf
|
||||
|
||||
from helpers.logger import logger
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import numpy as np
|
||||
|
||||
from helpers.models.conversion import ModelConfig
|
||||
|
||||
|
||||
class VisionConfig(Protocol):
|
||||
"""Protocol for vision model configuration."""
|
||||
|
||||
hidden_size: int
|
||||
num_hidden_layers: int
|
||||
num_attention_heads: int
|
||||
intermediate_size: int
|
||||
patch_size: int
|
||||
spatial_merge_size: int
|
||||
|
||||
|
||||
class GGUFWriter:
|
||||
"""Manages GGUF file creation and metadata writing.
|
||||
|
||||
Provides high-level interface for GGUF file operations including metadata
|
||||
configuration, tensor addition, and tokeniser integration. Encapsulates
|
||||
low-level GGUF library interactions for consistent error handling.
|
||||
"""
|
||||
|
||||
def __init__(self, output_path: Path, architecture: str) -> None:
|
||||
"""Initialise GGUF writer with output path and architecture.
|
||||
|
||||
Creates the underlying GGUF writer instance and prepares for metadata
|
||||
and tensor addition. Sets up the file structure for the specified
|
||||
model architecture.
|
||||
"""
|
||||
self.output_path = output_path
|
||||
self.architecture = architecture
|
||||
self.writer = gguf.GGUFWriter(str(output_path), architecture)
|
||||
logger.info(f"Created GGUF writer for {architecture} architecture")
|
||||
|
||||
def add_metadata(self, model_config: ModelConfig, model_name: str) -> None:
|
||||
"""Add comprehensive metadata from model configuration.
|
||||
|
||||
Writes general model information, architectural parameters, and
|
||||
quantisation settings to the GGUF file header. Handles both standard
|
||||
and vision model configurations with appropriate parameter mapping.
|
||||
"""
|
||||
# General metadata
|
||||
self.writer.add_name(model_name)
|
||||
self.writer.add_description(f"Converted from {model_config.architectures[0]}")
|
||||
self.writer.add_file_type(gguf.LlamaFileType.ALL_F32)
|
||||
|
||||
# Log architecture being used
|
||||
logger.info(f"Setting GGUF architecture: {self.architecture}")
|
||||
if self.architecture not in {"llama", "qwen2", "gemma", "phi3", "falcon", "gpt2"}:
|
||||
logger.warning(f"Architecture '{self.architecture}' may not be supported by llama.cpp")
|
||||
|
||||
# Model parameters from config
|
||||
params = model_config.to_gguf_params()
|
||||
self.writer.add_context_length(params.context_length)
|
||||
self.writer.add_embedding_length(params.embedding_length)
|
||||
self.writer.add_block_count(params.block_count)
|
||||
self.writer.add_feed_forward_length(params.feed_forward_length)
|
||||
self.writer.add_head_count(params.attention_head_count)
|
||||
self.writer.add_head_count_kv(params.attention_head_count_kv)
|
||||
self.writer.add_layer_norm_rms_eps(params.attention_layer_norm_rms_epsilon)
|
||||
self.writer.add_rope_freq_base(params.rope_freq_base)
|
||||
self.writer.add_rope_dimension_count(params.rope_dimension_count)
|
||||
|
||||
logger.info(f"Added metadata: {params.block_count} layers, {params.context_length} context")
|
||||
|
||||
def add_vision_metadata(self, vision_config: VisionConfig | None) -> None:
|
||||
"""Add vision model parameters to GGUF metadata.
|
||||
|
||||
Configures vision-specific parameters for multimodal models including
|
||||
embedding dimensions, attention heads, and spatial processing settings.
|
||||
"""
|
||||
if not vision_config:
|
||||
return
|
||||
|
||||
logger.info("Adding vision model parameters...")
|
||||
self.writer.add_vision_embedding_length(vision_config.hidden_size)
|
||||
self.writer.add_vision_block_count(vision_config.num_hidden_layers)
|
||||
self.writer.add_vision_head_count(vision_config.num_attention_heads)
|
||||
self.writer.add_vision_feed_forward_length(vision_config.intermediate_size)
|
||||
self.writer.add_vision_patch_size(vision_config.patch_size)
|
||||
self.writer.add_vision_spatial_merge_size(vision_config.spatial_merge_size)
|
||||
|
||||
if hasattr(vision_config, "rms_norm_eps") and vision_config.rms_norm_eps:
|
||||
self.writer.add_vision_attention_layernorm_eps(vision_config.rms_norm_eps)
|
||||
|
||||
def add_tokeniser(self, tokeniser_config: dict[str, Any]) -> None:
|
||||
"""Add tokeniser metadata to GGUF file.
|
||||
|
||||
Writes special token IDs and tokeniser model type to enable proper
|
||||
text processing during inference. Uses sensible defaults for missing
|
||||
configuration values.
|
||||
"""
|
||||
self.writer.add_bos_token_id(tokeniser_config.get("bos_token_id", 1))
|
||||
self.writer.add_eos_token_id(tokeniser_config.get("eos_token_id", 2))
|
||||
self.writer.add_unk_token_id(tokeniser_config.get("unk_token_id", 0))
|
||||
self.writer.add_pad_token_id(tokeniser_config.get("pad_token_id", 0))
|
||||
|
||||
# Add BOS/EOS token addition flags if available
|
||||
if "add_bos_token" in tokeniser_config:
|
||||
self.writer.add_add_bos_token(tokeniser_config["add_bos_token"])
|
||||
if "add_eos_token" in tokeniser_config:
|
||||
self.writer.add_add_eos_token(tokeniser_config["add_eos_token"])
|
||||
|
||||
# Note: tokenizer_model is set by add_tokeniser_vocabulary based on actual tokenizer type
|
||||
|
||||
logger.info("Added tokeniser configuration")
|
||||
|
||||
def add_tokeniser_vocabulary(self, model_path: Path) -> None:
|
||||
"""Add full tokeniser vocabulary to GGUF file.
|
||||
|
||||
Loads and embeds the complete tokeniser vocabulary including tokens,
|
||||
merges, and scores to enable standalone model usage without external
|
||||
tokeniser files. Supports BPE, Unigram, and WordPiece tokenizers.
|
||||
"""
|
||||
tokenizer_path = model_path / "tokenizer.json"
|
||||
if not tokenizer_path.exists():
|
||||
logger.warning("tokenizer.json not found, skipping vocabulary embedding")
|
||||
return
|
||||
|
||||
try:
|
||||
with Path(tokenizer_path).open(encoding="utf-8") as f:
|
||||
tokenizer_data = json.load(f)
|
||||
|
||||
model_data = tokenizer_data.get("model", {})
|
||||
model_type = model_data.get("type", "")
|
||||
|
||||
# Get pre-tokenizer information
|
||||
pre_tokenizer = tokenizer_data.get("pre_tokenizer", {})
|
||||
pre_tokenizer_type = self._get_pre_tokenizer_type(pre_tokenizer)
|
||||
|
||||
# Get added tokens
|
||||
added_tokens = tokenizer_data.get("added_tokens", [])
|
||||
|
||||
if model_type == "BPE":
|
||||
self._add_bpe_tokenizer(model_data, added_tokens, pre_tokenizer_type)
|
||||
elif model_type == "Unigram":
|
||||
self._add_unigram_tokenizer(model_data, added_tokens)
|
||||
elif model_type == "WordPiece":
|
||||
self._add_wordpiece_tokenizer(model_data, added_tokens)
|
||||
else:
|
||||
logger.warning(f"Unsupported tokenizer type: {model_type}")
|
||||
# Try to add as generic tokenizer
|
||||
self._add_generic_tokenizer(model_data, tokenizer_data)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to load tokeniser vocabulary: {e}")
|
||||
logger.error(traceback.format_exc())
|
||||
|
||||
def _get_pre_tokenizer_type(self, pre_tokenizer: dict[str, Any]) -> str:
|
||||
"""Determine pre-tokenizer type from configuration.
|
||||
|
||||
Returns:
|
||||
Pre-tokenizer type.
|
||||
"""
|
||||
if not pre_tokenizer:
|
||||
return "default"
|
||||
|
||||
# Check for various pre-tokenizer types
|
||||
pre_type = pre_tokenizer.get("type", "")
|
||||
if "ByteLevel" in str(pre_type):
|
||||
return "llama3"
|
||||
if "Metaspace" in str(pre_type):
|
||||
return "default"
|
||||
|
||||
return "default"
|
||||
|
||||
def _add_bpe_tokenizer(
|
||||
self,
|
||||
model_data: dict[str, Any],
|
||||
added_tokens: list[dict[str, Any]],
|
||||
pre_tokenizer_type: str,
|
||||
) -> None:
|
||||
"""Add BPE tokenizer to GGUF file."""
|
||||
vocab = model_data.get("vocab", {})
|
||||
merges = model_data.get("merges", [])
|
||||
|
||||
# Set tokenizer model based on pre-tokenizer type
|
||||
if pre_tokenizer_type == "llama3":
|
||||
self.writer.add_tokenizer_model("gpt2")
|
||||
self.writer.add_tokenizer_pre("llama3")
|
||||
else:
|
||||
self.writer.add_tokenizer_model("gpt2")
|
||||
|
||||
# Create token list with scores
|
||||
tokens = []
|
||||
scores = []
|
||||
toktypes = []
|
||||
|
||||
# Add vocabulary tokens
|
||||
for token_str, _token_id in sorted(vocab.items(), key=operator.itemgetter(1)):
|
||||
tokens.append(token_str)
|
||||
scores.append(0.0) # BPE doesn't use scores
|
||||
|
||||
# Determine token type
|
||||
is_added = any(t.get("content") == token_str for t in added_tokens)
|
||||
if is_added:
|
||||
toktypes.append(gguf.TokenType.USER_DEFINED)
|
||||
else:
|
||||
toktypes.append(gguf.TokenType.NORMAL)
|
||||
|
||||
# Add to writer
|
||||
self.writer.add_token_list(tokens)
|
||||
self.writer.add_token_scores(scores)
|
||||
self.writer.add_token_types(toktypes)
|
||||
|
||||
# Add merges
|
||||
if merges:
|
||||
self.writer.add_token_merges(merges)
|
||||
|
||||
logger.info(f"Added BPE tokenizer: {len(tokens)} tokens, {len(merges)} merges")
|
||||
|
||||
def _add_unigram_tokenizer(
|
||||
self,
|
||||
model_data: dict[str, Any],
|
||||
added_tokens: list[dict[str, Any]],
|
||||
) -> None:
|
||||
"""Add Unigram tokenizer to GGUF file."""
|
||||
vocab = model_data.get("vocab", [])
|
||||
|
||||
self.writer.add_tokenizer_model("unigram")
|
||||
|
||||
# Create token list with scores
|
||||
tokens = []
|
||||
scores = []
|
||||
toktypes = []
|
||||
|
||||
# Add vocabulary tokens
|
||||
for token_data in vocab:
|
||||
if isinstance(token_data, list) and len(token_data) >= 2:
|
||||
token_str, score = token_data[0], token_data[1]
|
||||
else:
|
||||
continue
|
||||
|
||||
tokens.append(token_str)
|
||||
scores.append(float(score))
|
||||
|
||||
# Determine token type
|
||||
is_added = any(t.get("content") == token_str for t in added_tokens)
|
||||
if is_added:
|
||||
toktypes.append(gguf.TokenType.USER_DEFINED)
|
||||
else:
|
||||
toktypes.append(gguf.TokenType.NORMAL)
|
||||
|
||||
# Add to writer
|
||||
self.writer.add_token_list(tokens)
|
||||
self.writer.add_token_scores(scores)
|
||||
self.writer.add_token_types(toktypes)
|
||||
|
||||
logger.info(f"Added Unigram tokenizer: {len(tokens)} tokens")
|
||||
|
||||
def _add_wordpiece_tokenizer(
|
||||
self,
|
||||
model_data: dict[str, Any],
|
||||
added_tokens: list[dict[str, Any]],
|
||||
) -> None:
|
||||
"""Add WordPiece tokenizer to GGUF file."""
|
||||
vocab = model_data.get("vocab", {})
|
||||
|
||||
self.writer.add_tokenizer_model("bert")
|
||||
|
||||
# Create token list
|
||||
tokens = []
|
||||
scores = []
|
||||
toktypes = []
|
||||
|
||||
# Add vocabulary tokens
|
||||
for token_str, _token_id in sorted(vocab.items(), key=operator.itemgetter(1)):
|
||||
tokens.append(token_str)
|
||||
scores.append(0.0) # WordPiece doesn't use scores
|
||||
|
||||
# Determine token type
|
||||
is_added = any(t.get("content") == token_str for t in added_tokens)
|
||||
if is_added:
|
||||
toktypes.append(gguf.TokenType.USER_DEFINED)
|
||||
else:
|
||||
toktypes.append(gguf.TokenType.NORMAL)
|
||||
|
||||
# Add to writer
|
||||
self.writer.add_token_list(tokens)
|
||||
self.writer.add_token_scores(scores)
|
||||
self.writer.add_token_types(toktypes)
|
||||
|
||||
logger.info(f"Added WordPiece tokenizer: {len(tokens)} tokens")
|
||||
|
||||
def _add_generic_tokenizer(
|
||||
self,
|
||||
model_data: dict[str, Any],
|
||||
tokenizer_data: dict[str, Any],
|
||||
) -> None:
|
||||
"""Add generic tokenizer as fallback."""
|
||||
logger.warning("Using generic tokenizer fallback")
|
||||
|
||||
# Try to extract vocabulary from various possible locations
|
||||
vocab = model_data.get("vocab", tokenizer_data.get("vocab", {}))
|
||||
|
||||
if not vocab:
|
||||
logger.error("No vocabulary found in tokenizer")
|
||||
return
|
||||
|
||||
self.writer.add_tokenizer_model("gpt2") # Default to GPT-2 style
|
||||
|
||||
# Create basic token list
|
||||
tokens = []
|
||||
scores = []
|
||||
toktypes = []
|
||||
|
||||
if isinstance(vocab, dict):
|
||||
# Dict-style vocab
|
||||
for token_str, _token_id in sorted(vocab.items(), key=operator.itemgetter(1)):
|
||||
tokens.append(token_str)
|
||||
scores.append(0.0)
|
||||
toktypes.append(gguf.TokenType.NORMAL)
|
||||
elif isinstance(vocab, list):
|
||||
# List-style vocab
|
||||
for item in vocab:
|
||||
if isinstance(item, str):
|
||||
tokens.append(item)
|
||||
scores.append(0.0)
|
||||
toktypes.append(gguf.TokenType.NORMAL)
|
||||
elif isinstance(item, list) and len(item) >= 1:
|
||||
tokens.append(str(item[0]))
|
||||
scores.append(float(item[1]) if len(item) > 1 else 0.0)
|
||||
toktypes.append(gguf.TokenType.NORMAL)
|
||||
|
||||
if tokens:
|
||||
self.writer.add_token_list(tokens)
|
||||
self.writer.add_token_scores(scores)
|
||||
self.writer.add_token_types(toktypes)
|
||||
logger.info(f"Added generic tokenizer: {len(tokens)} tokens")
|
||||
else:
|
||||
logger.error("Failed to extract tokens from vocabulary")
|
||||
|
||||
def add_tensor(self, name: str, data: np.ndarray) -> None:
|
||||
"""Add tensor to GGUF file.
|
||||
|
||||
Accepts a tensor name following GGUF naming conventions and its
|
||||
corresponding numpy array data. The tensor is stored for writing
|
||||
when the file is finalised.
|
||||
"""
|
||||
self.writer.add_tensor(name, data)
|
||||
|
||||
def write(self) -> None:
|
||||
"""Finalise and write GGUF file to disk.
|
||||
|
||||
Writes header, key-value data, and tensors to the output file,
|
||||
completing the GGUF creation process.
|
||||
"""
|
||||
logger.info(f"Writing GGUF file to {self.output_path}...")
|
||||
self.writer.write_header_to_file()
|
||||
self.writer.write_kv_data_to_file()
|
||||
self.writer.write_tensors_to_file()
|
||||
self.writer.close()
|
||||
logger.info("✅ GGUF file written successfully")
|
19 helpers/huggingface/__init__.py Normal file
@@ -0,0 +1,19 @@
"""HuggingFace operations and integrations.
|
||||
|
||||
Provides client operations, repository management, and file upload
|
||||
capabilities for HuggingFace repositories.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from helpers.huggingface.client import HuggingFaceClient
|
||||
from helpers.huggingface.repository import RepositoryManager
|
||||
from helpers.huggingface.uploader import FileUploader
|
||||
from helpers.huggingface.wrapper import HuggingFaceUploader
|
||||
|
||||
__all__ = [
|
||||
"FileUploader",
|
||||
"HuggingFaceClient",
|
||||
"HuggingFaceUploader",
|
||||
"RepositoryManager",
|
||||
]
|
124 helpers/huggingface/client.py Normal file
@@ -0,0 +1,124 @@
"""HuggingFace API client operations.
|
||||
|
||||
Provides basic HuggingFace API operations including authentication,
|
||||
model downloads, and user information retrieval.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import subprocess
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from helpers.logger import logger
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
class HuggingFaceClient:
|
||||
"""Manages basic HuggingFace API operations.
|
||||
|
||||
Provides methods for authentication verification, model downloads,
|
||||
and user information retrieval using the HuggingFace CLI.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def get_username() -> str:
|
||||
"""Get authenticated HuggingFace username.
|
||||
|
||||
Retrieves the current user's HuggingFace username using the CLI.
|
||||
Requires prior authentication via `huggingface-cli login`.
|
||||
|
||||
Returns:
|
||||
HuggingFace username.
|
||||
|
||||
Raises:
|
||||
RuntimeError: If not authenticated or CLI not available.
|
||||
"""
|
||||
try:
|
||||
result = subprocess.run(
|
||||
["huggingface-cli", "whoami"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
check=True,
|
||||
)
|
||||
return result.stdout.strip()
|
||||
except (subprocess.CalledProcessError, FileNotFoundError) as err:
|
||||
msg = "Please log in to HuggingFace first: huggingface-cli login"
|
||||
raise RuntimeError(msg) from err
|
||||
|
||||
@staticmethod
|
||||
def download_model(
|
||||
model_name: str,
|
||||
output_dir: Path,
|
||||
include_pattern: str | None = None,
|
||||
) -> None:
|
||||
"""Download model from HuggingFace.
|
||||
|
||||
Downloads a complete model or specific files matching a pattern.
|
||||
Creates the output directory if it doesn't exist. Supports filtered
|
||||
downloads for efficient bandwidth usage when only certain files are needed.
|
||||
The model identifier follows HuggingFace naming conventions (e.g. "meta-llama/Llama-2-7b").
|
||||
"""
|
||||
logger.info(f"Downloading {model_name} to {output_dir}")
|
||||
|
||||
cmd = [
|
||||
"huggingface-cli",
|
||||
"download",
|
||||
model_name,
|
||||
"--local-dir",
|
||||
str(output_dir),
|
||||
]
|
||||
|
||||
if include_pattern:
|
||||
cmd.extend(["--include", include_pattern])
|
||||
|
||||
subprocess.run(cmd, check=True, capture_output=True, text=True)
|
||||
logger.info("Download complete")
|
||||
|
||||
@staticmethod
|
||||
def check_authentication() -> bool:
|
||||
"""Check if user is authenticated with HuggingFace.
|
||||
|
||||
Returns:
|
||||
True if authenticated, False otherwise.
|
||||
"""
|
||||
try:
|
||||
result = subprocess.run(
|
||||
["huggingface-cli", "whoami"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
check=False,
|
||||
)
|
||||
except FileNotFoundError:
|
||||
logger.error(
|
||||
"huggingface-cli not found. Please install with: pip install huggingface-hub"
|
||||
)
|
||||
return False
|
||||
else:
|
||||
return result.returncode == 0
|
||||
|
||||
@staticmethod
|
||||
def get_model_info(model_id: str) -> dict | None:
|
||||
"""Get model information from HuggingFace.
|
||||
|
||||
Retrieves metadata about a model from the HuggingFace Hub using the
|
||||
CLI interface. Returns the model information as a dictionary if found.
|
||||
|
||||
Returns:
|
||||
Model information dictionary or None if not found.
|
||||
"""
|
||||
try:
|
||||
# Use huggingface-cli to get model info
|
||||
result = subprocess.run(
|
||||
["huggingface-cli", "model-info", model_id],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
check=True,
|
||||
)
|
||||
except subprocess.CalledProcessError:
|
||||
logger.warning(f"Could not get info for model: {model_id}")
|
||||
return None
|
||||
else:
|
||||
# Parse the output (this is simplified - actual implementation would parse JSON)
|
||||
return {"output": result.stdout}
|
167 helpers/huggingface/repository.py Normal file
@@ -0,0 +1,167 @@
"""HuggingFace repository management.
|
||||
|
||||
Handles repository creation, configuration, and management operations.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import subprocess
|
||||
import time
|
||||
|
||||
from helpers.logger import logger
|
||||
|
||||
|
||||
class RepositoryManager:
|
||||
"""Manages HuggingFace repository operations.
|
||||
|
||||
Provides methods for creating repositories, checking existence,
|
||||
and managing repository configuration.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def create_repository(
|
||||
repo_id: str,
|
||||
private: bool = False,
|
||||
repo_type: str = "model",
|
||||
) -> bool:
|
||||
"""Create a new HuggingFace repository.
|
||||
|
||||
Creates a repository with the specified identifier and settings. Repository
|
||||
identifiers follow the format "username/repo-name". Supports model, dataset,
|
||||
and space repository types with configurable visibility.
|
||||
|
||||
Returns:
|
||||
True if repository was created, False if it already exists.
|
||||
"""
|
||||
logger.info(f"Creating repository: {repo_id}")
|
||||
|
||||
cmd = [
|
||||
"huggingface-cli",
|
||||
"repo",
|
||||
"create",
|
||||
repo_id,
|
||||
"--type",
|
||||
repo_type,
|
||||
]
|
||||
|
||||
if private:
|
||||
cmd.append("--private")
|
||||
|
||||
try:
|
||||
result = subprocess.run(
|
||||
cmd,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
check=False,
|
||||
)
|
||||
|
||||
if result.returncode == 0:
|
||||
logger.info(f"Created repository: {repo_id}")
|
||||
return True
|
||||
if "already exists" in result.stderr.lower():
|
||||
logger.info(f"Repository already exists: {repo_id}")
|
||||
return False
|
||||
logger.error(f"Failed to create repository: {result.stderr}")
|
||||
except Exception as e:
|
||||
logger.error(f"Error creating repository: {e}")
|
||||
|
||||
return False
|
||||
|
||||
@staticmethod
|
||||
def ensure_repository_exists(repo_id: str) -> None:
|
||||
"""Ensure repository exists, creating if necessary.
|
||||
|
||||
Attempts to create the repository if it doesn't exist, then waits
|
||||
briefly to ensure the repository is ready for operations.
|
||||
"""
|
||||
# Try to create the repository
|
||||
RepositoryManager.create_repository(repo_id)
|
||||
|
||||
# Small delay to ensure repository is ready
|
||||
time.sleep(2)
|
||||
|
||||
@staticmethod
|
||||
def check_repository_exists(repo_id: str) -> bool:
|
||||
"""Check if a repository exists.
|
||||
|
||||
Queries the HuggingFace Hub to determine if a repository with the
|
||||
given identifier exists and is accessible.
|
||||
|
||||
Returns:
|
||||
True if repository exists, False otherwise.
|
||||
"""
|
||||
try:
|
||||
result = subprocess.run(
|
||||
["huggingface-cli", "repo", "ls-files", repo_id],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
check=False,
|
||||
)
|
||||
except Exception:
|
||||
return False
|
||||
else:
|
||||
return result.returncode == 0
|
||||
|
||||
@staticmethod
|
||||
def delete_repository(repo_id: str) -> bool:
|
||||
"""Delete a HuggingFace repository.
|
||||
|
||||
Permanently removes a repository from the HuggingFace Hub. This operation
|
||||
cannot be undone and requires appropriate permissions.
|
||||
|
||||
Returns:
|
||||
True if deleted successfully, False otherwise.
|
||||
"""
|
||||
logger.warning(f"Deleting repository: {repo_id}")
|
||||
|
||||
try:
|
||||
result = subprocess.run(
|
||||
["huggingface-cli", "repo", "delete", repo_id, "--yes"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
check=False,
|
||||
)
|
||||
|
||||
if result.returncode == 0:
|
||||
logger.info(f"Deleted repository: {repo_id}")
|
||||
return True
|
||||
logger.error(f"Failed to delete repository: {result.stderr}")
|
||||
except Exception as e:
|
||||
logger.error(f"Error deleting repository: {e}")
|
||||
return False
|
||||
else:
|
||||
return False
|
||||
|
||||
@staticmethod
|
||||
def get_repository_url(repo_id: str) -> str:
|
||||
"""Get the full URL for a repository.
|
||||
|
||||
Constructs the complete HuggingFace Hub URL for accessing the repository
|
||||
through a web browser.
|
||||
|
||||
Returns:
|
||||
Full HuggingFace URL for the repository.
|
||||
"""
|
||||
return f"https://huggingface.co/{repo_id}"
|
||||
|
||||
@staticmethod
|
||||
def set_repository_visibility(repo_id: str, private: bool) -> bool:
|
||||
"""Set repository visibility (public/private).
|
||||
|
||||
Changes the visibility setting of an existing repository. Private repositories
|
||||
require appropriate permissions and may have usage limitations.
|
||||
|
||||
Returns:
|
||||
True if visibility changed successfully.
|
||||
"""
|
||||
visibility = "private" if private else "public"
|
||||
logger.info(f"Setting {repo_id} visibility to {visibility}")
|
||||
|
||||
try:
|
||||
# Note: This would require using the HuggingFace API directly
|
||||
# as the CLI doesn't support changing visibility
|
||||
logger.warning("Changing repository visibility requires API access")
|
||||
except Exception as e:
|
||||
logger.error(f"Error changing visibility: {e}")
|
||||
|
||||
return False
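A short usage sketch of the repository helpers above; the repo id is a placeholder and an authenticated huggingface-cli is assumed:

repo_id = "your-username/example-model-GGUF"   # placeholder repository id
RepositoryManager.ensure_repository_exists(repo_id)
if RepositoryManager.check_repository_exists(repo_id):
    print(RepositoryManager.get_repository_url(repo_id))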
|
330 helpers/huggingface/uploader.py Normal file
@@ -0,0 +1,330 @@
"""HuggingFace file upload operations.
|
||||
|
||||
Handles uploading files to HuggingFace repositories with retry logic
|
||||
and error handling.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import shutil
|
||||
import subprocess
|
||||
import tempfile
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
from helpers.huggingface.repository import RepositoryManager
|
||||
from helpers.logger import logger
|
||||
|
||||
|
||||
class FileUploader:
|
||||
"""Manages file uploads to HuggingFace repositories.
|
||||
|
||||
Provides methods for uploading models, READMEs, and other files
|
||||
with proper error handling, retry logic, and git-based fallbacks.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def upload_file(
|
||||
repo_id: str,
|
||||
local_path: Path,
|
||||
repo_path: str | None = None,
|
||||
create_repo: bool = False,
|
||||
) -> None:
|
||||
"""Upload a file to HuggingFace repository.
|
||||
|
||||
Uploads a single file to the specified repository path. Can create
|
||||
the repository if it doesn't exist. Uses git directly when possible
|
||||
to avoid automatic PR creation. Repository identifiers follow the format
|
||||
"username/repo-name". Files are uploaded to the main branch by default.
|
||||
|
||||
Raises:
|
||||
subprocess.CalledProcessError: If the upload fails.
|
||||
"""
|
||||
repo_path = repo_path or local_path.name
|
||||
logger.info(f"Uploading {local_path.name} to {repo_id}/{repo_path}")
|
||||
|
||||
# Try git-based upload first to avoid PR creation
|
||||
if FileUploader._try_git_upload(repo_id, local_path, repo_path, create_repo=create_repo):
|
||||
logger.info(f"Uploaded {repo_path} via git")
|
||||
return
|
||||
|
||||
# Fallback to huggingface-cli
|
||||
logger.info("Git upload failed, trying huggingface-cli...")
|
||||
cmd = [
|
||||
"huggingface-cli",
|
||||
"upload",
|
||||
repo_id,
|
||||
str(local_path),
|
||||
repo_path,
|
||||
"--revision",
|
||||
"main", # Explicitly push to main branch
|
||||
"--commit-message",
|
||||
f"Add {repo_path}",
|
||||
]
|
||||
|
||||
if create_repo:
|
||||
cmd.append("--create")
|
||||
|
||||
try:
|
||||
subprocess.run(cmd, check=True, capture_output=True)
|
||||
logger.info(f"Uploaded {repo_path}")
|
||||
except subprocess.CalledProcessError:
|
||||
if create_repo:
|
||||
# Repository might already exist, retry without --create
|
||||
cmd = cmd[:-1] # Remove --create flag
|
||||
subprocess.run(cmd, check=True, capture_output=True, text=True)
|
||||
logger.info(f"Updated {repo_path}")
|
||||
else:
|
||||
raise
|
||||
|
||||
@staticmethod
|
||||
def _try_git_upload(
|
||||
repo_id: str,
|
||||
local_path: Path,
|
||||
repo_path: str,
|
||||
*,
|
||||
create_repo: bool = False,
|
||||
) -> bool:
|
||||
"""Try to upload file using git directly to avoid PR creation.
|
||||
|
||||
Returns:
|
||||
bool: True if upload successful, False if should fallback to CLI.
|
||||
"""
|
||||
try:
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
temp_path = Path(temp_dir)
|
||||
repo_url = f"https://huggingface.co/{repo_id}"
|
||||
|
||||
# Clone repository
|
||||
logger.info(f"Cloning {repo_url}...")
|
||||
result = subprocess.run(
|
||||
["git", "clone", repo_url, str(temp_path / "repo")],
|
||||
check=False,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
|
||||
if result.returncode != 0:
|
||||
if create_repo:
|
||||
# Repository doesn't exist, let huggingface-cli handle creation
|
||||
return False
|
||||
logger.warning(f"Clone failed: {result.stderr}")
|
||||
return False
|
||||
|
||||
repo_dir = temp_path / "repo"
|
||||
target_file = repo_dir / repo_path
|
||||
|
||||
# Ensure target directory exists
|
||||
target_file.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Copy file
|
||||
shutil.copy2(local_path, target_file)
|
||||
|
||||
# Check if there are any changes
|
||||
status_result = subprocess.run(
|
||||
["git", "status", "--porcelain"],
|
||||
cwd=repo_dir,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
check=True,
|
||||
)
|
||||
|
||||
if not status_result.stdout.strip():
|
||||
logger.info(f"No changes detected for {repo_path}, file already up-to-date")
|
||||
return True # File is already up-to-date, no need to push
|
||||
|
||||
# Git add, commit, push
|
||||
subprocess.run(
|
||||
["git", "add", repo_path],
|
||||
cwd=repo_dir,
|
||||
check=True,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
subprocess.run(
|
||||
["git", "commit", "-m", f"Update {repo_path}"],
|
||||
cwd=repo_dir,
|
||||
check=True,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
subprocess.run(
|
||||
["git", "push"],
|
||||
cwd=repo_dir,
|
||||
check=True,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
|
||||
return True
|
||||
|
||||
except subprocess.CalledProcessError as e:
|
||||
logger.warning(f"Git upload failed: {e}")
|
||||
return False
|
||||
except Exception as e:
|
||||
logger.warning(f"Git upload error: {e}")
|
||||
return False
|
||||
|
||||
@staticmethod
|
||||
def upload_readme(
|
||||
repo_id: str,
|
||||
readme_path: Path,
|
||||
ensure_repo: bool = True,
|
||||
) -> None:
|
||||
"""Upload or update README file to repository.
|
||||
|
||||
Creates repository if needed, handles existing repository updates.
|
||||
The README is uploaded as README.md in the repository root and will
|
||||
replace any existing README file.
|
||||
|
||||
Raises:
|
||||
RuntimeError: If the README upload fails.
|
||||
"""
|
||||
logger.info("Uploading README...")
|
||||
|
||||
# Add delay to prevent rate limiting
|
||||
time.sleep(2)
|
||||
|
||||
# First ensure the repository exists if requested
|
||||
if ensure_repo:
|
||||
RepositoryManager.ensure_repository_exists(repo_id)
|
||||
|
||||
# Upload without --create flag to avoid PR creation
|
||||
try:
|
||||
logger.debug(f"DEBUG: Uploading README to {repo_id}")
|
||||
subprocess.run(
|
||||
[
|
||||
"huggingface-cli",
|
||||
"upload",
|
||||
repo_id,
|
||||
str(readme_path),
|
||||
"README.md",
|
||||
"--commit-message",
|
||||
"Update README.md",
|
||||
],
|
||||
check=True,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
logger.info("README uploaded successfully")
|
||||
except subprocess.CalledProcessError as e:
|
||||
# Retry with delay in case of rate limiting
|
||||
if "429" in str(e.stderr):
|
||||
logger.warning("Rate limited, waiting 30 seconds...")
|
||||
time.sleep(30)
|
||||
subprocess.run(
|
||||
[
|
||||
"huggingface-cli",
|
||||
"upload",
|
||||
repo_id,
|
||||
str(readme_path),
|
||||
"README.md",
|
||||
"--commit-message",
|
||||
"Update README.md",
|
||||
],
|
||||
check=True,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
logger.info("README uploaded successfully (after retry)")
|
||||
else:
|
||||
msg = f"Failed to upload README: {e.stderr}"
|
||||
raise RuntimeError(msg) from e
|
||||
|
||||
@staticmethod
|
||||
def upload_model_file(
|
||||
repo_id: str,
|
||||
model_path: Path,
|
||||
repo_filename: str | None = None,
|
||||
) -> None:
|
||||
"""Upload a model file to repository.
|
||||
|
||||
Optimised for large model file uploads with progress tracking.
|
||||
The model file is uploaded to the repository root by default or
|
||||
to the specified filename if provided.
|
||||
|
||||
Raises:
|
||||
subprocess.CalledProcessError: If the upload fails.
|
||||
"""
|
||||
repo_filename = repo_filename or model_path.name
|
||||
logger.info(
|
||||
f"Uploading model file {model_path.name} "
|
||||
f"({model_path.stat().st_size / (1024**3):.1f}GB)..."
|
||||
)
|
||||
|
||||
cmd = [
|
||||
"huggingface-cli",
|
||||
"upload",
|
||||
repo_id,
|
||||
str(model_path),
|
||||
repo_filename,
|
||||
"--commit-message",
|
||||
f"Add {repo_filename}",
|
||||
]
|
||||
|
||||
try:
|
||||
# Run with output streaming for large files
|
||||
process = subprocess.Popen(
|
||||
cmd,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.STDOUT,
|
||||
text=True,
|
||||
bufsize=1,
|
||||
universal_newlines=True,
|
||||
)
|
||||
|
||||
# Stream output
|
||||
if process.stdout:
|
||||
for line in iter(process.stdout.readline, ""):
|
||||
if line and "upload" in line.lower():
|
||||
logger.debug(line.strip())
|
||||
|
||||
process.wait()
|
||||
|
||||
if process.returncode != 0:
|
||||
raise subprocess.CalledProcessError(process.returncode, cmd)
|
||||
|
||||
logger.info(f"Successfully uploaded {repo_filename}")
|
||||
|
||||
except subprocess.CalledProcessError as e:
|
||||
logger.error(f"Failed to upload model file: {e}")
|
||||
raise
|
||||
|
||||
@staticmethod
|
||||
def upload_folder(
|
||||
repo_id: str,
|
||||
folder_path: Path,
|
||||
path_in_repo: str = ".",
|
||||
ignore_patterns: list[str] | None = None,
|
||||
) -> None:
|
||||
"""Upload an entire folder to repository.
|
||||
|
||||
Recursively uploads all files from a local folder to the repository,
|
||||
preserving the directory structure. Supports ignore patterns for
|
||||
selective uploads.
|
||||
|
||||
Raises:
|
||||
subprocess.CalledProcessError: If the upload fails.
|
||||
"""
|
||||
logger.info(f"Uploading folder {folder_path} to {repo_id}/{path_in_repo}")
|
||||
|
||||
cmd = [
|
||||
"huggingface-cli",
|
||||
"upload",
|
||||
repo_id,
|
||||
str(folder_path),
|
||||
path_in_repo,
|
||||
"--commit-message",
|
||||
f"Upload {folder_path.name}",
|
||||
]
|
||||
|
||||
if ignore_patterns:
|
||||
for pattern in ignore_patterns:
|
||||
cmd.extend(["--exclude", pattern])
|
||||
|
||||
try:
|
||||
subprocess.run(cmd, check=True, capture_output=True, text=True)
|
||||
logger.info(f"Successfully uploaded folder {folder_path.name}")
|
||||
except subprocess.CalledProcessError as e:
|
||||
logger.error(f"Failed to upload folder: {e}")
|
||||
raise
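The upload helpers above are typically combined as follows; a sketch with placeholder paths and repo id, assuming an authenticated huggingface-cli:

from pathlib import Path

repo_id = "your-username/example-model-GGUF"   # placeholder
FileUploader.upload_readme(repo_id, Path("work/README.md"))
FileUploader.upload_model_file(repo_id, Path("work/example-Q4_K_M.gguf"))
FileUploader.upload_folder(repo_id, Path("work/extras"), ignore_patterns=["*.tmp"])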
|
57 helpers/huggingface/wrapper.py Normal file
@@ -0,0 +1,57 @@
"""Compatibility wrapper for HuggingFace operations.
|
||||
|
||||
Provides a compatible interface matching the old HuggingFaceUploader
|
||||
class for backward compatibility during refactoring.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from helpers.huggingface.client import HuggingFaceClient
|
||||
from helpers.huggingface.repository import RepositoryManager
|
||||
from helpers.huggingface.uploader import FileUploader
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
class HuggingFaceUploader:
|
||||
"""Compatibility wrapper for HuggingFace operations.
|
||||
|
||||
Maintains the same interface as the old HuggingFaceUploader class
|
||||
while using the new modular components internally.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def get_username() -> str:
|
||||
"""Get authenticated HuggingFace username.
|
||||
|
||||
Returns:
|
||||
HuggingFace username from CLI authentication.
|
||||
"""
|
||||
return HuggingFaceClient.get_username()
|
||||
|
||||
def upload_readme(self, output_repo: str, readme_path: Path) -> None:
|
||||
"""Upload or update README file to repository.
|
||||
|
||||
Creates repository if needed, handles existing repository updates.
|
||||
The README is uploaded to the repository root as README.md.
|
||||
"""
|
||||
FileUploader.upload_readme(output_repo, readme_path, ensure_repo=True)
|
||||
|
||||
def upload_model_file(self, output_repo: str, model_path: Path) -> None:
|
||||
"""Upload model file to repository.
|
||||
|
||||
Uploads GGUF model file to specified repository path. The file
|
||||
is uploaded with progress tracking suitable for large model files.
|
||||
"""
|
||||
FileUploader.upload_model_file(output_repo, model_path)
|
||||
|
||||
def _ensure_repo_exists(self, repo_id: str) -> None:
|
||||
"""Ensure the repository exists, creating it if necessary.
|
||||
|
||||
Creates the repository if it doesn't exist and waits briefly
|
||||
to ensure it's ready for subsequent operations.
|
||||
"""
|
||||
RepositoryManager.ensure_repository_exists(repo_id)
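Existing call sites can keep the old interface unchanged; a sketch with placeholder repo id and paths:

from pathlib import Path

uploader = HuggingFaceUploader()
uploader.upload_readme("your-username/example-model-GGUF", Path("work/README.md"))
uploader.upload_model_file("your-username/example-model-GGUF", Path("work/example-Q4_K_M.gguf"))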
|
20 helpers/llama_cpp/__init__.py Normal file
@@ -0,0 +1,20 @@
"""llama.cpp operations and binary management.
|
||||
|
||||
Provides interfaces to llama.cpp binaries for quantisation and
|
||||
importance matrix generation.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from helpers.llama_cpp.architecture import ArchitectureDetector
|
||||
from helpers.llama_cpp.binary_manager import BinaryManager
|
||||
from helpers.llama_cpp.imatrix import IMatrixGenerator, IMatrixHandler
|
||||
from helpers.llama_cpp.quantiser import QuantisationExecutor
|
||||
|
||||
__all__ = [
|
||||
"ArchitectureDetector",
|
||||
"BinaryManager",
|
||||
"IMatrixGenerator",
|
||||
"IMatrixHandler",
|
||||
"QuantisationExecutor",
|
||||
]
|
235 helpers/llama_cpp/architecture.py Normal file
@@ -0,0 +1,235 @@
"""Architecture detection and support checking.
|
||||
|
||||
Determines whether model architectures are supported by llama.cpp
|
||||
and provides fallback strategies for unsupported architectures.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import subprocess
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from helpers.logger import logger
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
class ArchitectureDetector:
|
||||
"""Detects and validates model architecture support.
|
||||
|
||||
Checks whether model architectures are supported by llama.cpp
|
||||
for K-quant generation and determines appropriate quantisation
|
||||
strategies for unsupported architectures.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def check_architecture_support(f16_model_path: Path) -> bool:
|
||||
"""Check if the model architecture is supported by llama.cpp.
|
||||
|
||||
Tests the model's compatibility by attempting a quantisation with
|
||||
llama.cpp. Returns true if the architecture is unsupported, indicating
|
||||
that K-quants should be skipped.
|
||||
|
||||
Returns:
|
||||
True if architecture is NOT supported (K-quants should be skipped)
|
||||
"""
|
||||
try:
|
||||
# Try a simple quantisation with llama.cpp to check support
|
||||
result = subprocess.run(
|
||||
[
|
||||
".cache/llm-gguf-tools/binaries/llama-quantize",
|
||||
str(f16_model_path),
|
||||
"/dev/null",
|
||||
"Q4_K_M",
|
||||
],
|
||||
check=False,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=5,
|
||||
)
|
||||
|
||||
# Check if it failed due to unknown architecture
|
||||
return bool(result.stderr and "unknown model architecture" in result.stderr.lower())
|
||||
except Exception:
|
||||
# If we can't determine, assume it might work
|
||||
return False
|
||||
|
||||
@staticmethod
|
||||
def get_supported_architectures() -> list[str]:
|
||||
"""Get list of architectures known to be supported by llama.cpp.
|
||||
|
||||
Returns:
|
||||
List of supported architecture names.
|
||||
"""
|
||||
return [
|
||||
"llama",
|
||||
"llama2",
|
||||
"llama3",
|
||||
"mistral",
|
||||
"mixtral",
|
||||
"qwen",
|
||||
"qwen2",
|
||||
"gemma",
|
||||
"gemma2",
|
||||
"phi",
|
||||
"phi2",
|
||||
"phi3",
|
||||
"falcon",
|
||||
"gpt2",
|
||||
"gptj",
|
||||
"gptneox",
|
||||
"mpt",
|
||||
"starcoder",
|
||||
"starcoder2",
|
||||
"baichuan",
|
||||
"bert",
|
||||
"bloom",
|
||||
"deepseek",
|
||||
"deepseek2",
|
||||
"chatglm",
|
||||
"orion",
|
||||
"internlm2",
|
||||
"minicpm",
|
||||
"stablelm",
|
||||
"cohere",
|
||||
"dbrx",
|
||||
"olmo",
|
||||
"arctic",
|
||||
"rwkv",
|
||||
]
|
||||
|
||||
@staticmethod
|
||||
def map_architecture(model_type: str, arch_name: str) -> str:
|
||||
"""Map model architecture to GGUF architecture string.
|
||||
|
||||
Translates model type and architecture names from HuggingFace config
|
||||
to GGUF-compatible architecture identifiers. Handles special cases like
|
||||
"gpt-oss" to "gptoss" conversion and provides fallback mapping.
|
||||
|
||||
Returns:
|
||||
GGUF architecture string to use.
|
||||
"""
|
||||
# Direct mappings from model_type
|
||||
type_mappings = {
|
||||
"llama": "llama",
|
||||
"mistral": "llama", # Mistral uses llama architecture
|
||||
"mixtral": "llama",
|
||||
"qwen": "qwen",
|
||||
"qwen2": "qwen2",
|
||||
"gemma": "gemma",
|
||||
"gemma2": "gemma2",
|
||||
"phi": "phi2",
|
||||
"phi3": "phi3",
|
||||
"phi-msft": "phi2",
|
||||
"falcon": "falcon",
|
||||
"gpt2": "gpt2",
|
||||
"gptj": "gptj",
|
||||
"gpt_neox": "gptneox",
|
||||
"gpt-oss": "gptoss",
|
||||
"mpt": "mpt",
|
||||
"starcoder": "starcoder",
|
||||
"starcoder2": "starcoder2",
|
||||
"baichuan": "baichuan",
|
||||
"bloom": "bloom",
|
||||
"chatglm": "chatglm",
|
||||
"deepseek": "llama", # DeepSeek uses llama architecture
|
||||
"stablelm": "stablelm",
|
||||
"cohere": "cohere",
|
||||
"dbrx": "dbrx",
|
||||
"olmo": "olmo",
|
||||
"arctic": "arctic",
|
||||
}
|
||||
|
||||
# Check model_type first
|
||||
if model_type in type_mappings:
|
||||
return type_mappings[model_type]
|
||||
|
||||
# Architecture name mappings as fallback
|
||||
arch_mappings = {
|
||||
"LlamaForCausalLM": "llama",
|
||||
"MistralForCausalLM": "llama",
|
||||
"MixtralForCausalLM": "llama",
|
||||
"Qwen2ForCausalLM": "qwen2",
|
||||
"QwenForCausalLM": "qwen",
|
||||
"GemmaForCausalLM": "gemma",
|
||||
"Gemma2ForCausalLM": "gemma2",
|
||||
"GptOssForCausalLM": "gptoss",
|
||||
"PhiForCausalLM": "phi2",
|
||||
"Phi3ForCausalLM": "phi3",
|
||||
"FalconForCausalLM": "falcon",
|
||||
"GPT2LMHeadModel": "gpt2",
|
||||
"GPTJForCausalLM": "gptj",
|
||||
"GPTNeoXForCausalLM": "gptneox",
|
||||
"MPTForCausalLM": "mpt",
|
||||
"BloomForCausalLM": "bloom",
|
||||
"ChatGLMForCausalLM": "chatglm",
|
||||
"StableLmForCausalLM": "stablelm",
|
||||
"CohereForCausalLM": "cohere",
|
||||
}
|
||||
|
||||
if arch_name in arch_mappings:
|
||||
return arch_mappings[arch_name]
|
||||
|
||||
# Default fallback
|
||||
logger.warning(f"Unknown architecture: {arch_name} (type: {model_type})")
|
||||
logger.warning("Defaulting to 'llama' architecture - may not work correctly")
|
||||
return "llama"
|
||||
|
||||
@staticmethod
|
||||
def get_quantisation_support(architecture: str) -> dict[str, bool]:
|
||||
"""Determine which quantisation types are supported for an architecture.
|
||||
|
||||
Evaluates architecture compatibility with different quantisation methods.
|
||||
Basic quantisations are always supported via GGML, while K-quants and
|
||||
imatrix require specific llama.cpp support.
|
||||
|
||||
Returns:
|
||||
Dictionary mapping quantisation type categories to support status.
|
||||
"""
|
||||
# Known unsupported architectures for K-quants
|
||||
unsupported_kquants = [
|
||||
"bert",
|
||||
"dotsocr", # Custom/unknown architectures
|
||||
]
|
||||
|
||||
is_supported = architecture not in unsupported_kquants
|
||||
|
||||
return {
|
||||
"basic": True, # Q4_0, Q5_0, Q6_0, Q8_0 always supported via GGML
|
||||
"k_quants": is_supported, # K-quants require llama.cpp support
|
||||
"imatrix": is_supported, # imatrix requires llama.cpp support
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def filter_quantisation_types(
|
||||
architecture: str,
|
||||
requested_types: list[str],
|
||||
) -> tuple[list[str], list[str]]:
|
||||
"""Filter quantisation types based on architecture support.
|
||||
|
||||
Separates requested quantisation types into supported and unsupported
|
||||
based on the model's architecture capabilities. Basic types are always
|
||||
supported, while K-quants depend on architecture compatibility.
|
||||
|
||||
Returns:
|
||||
Tuple of (supported_types, skipped_types).
|
||||
"""
|
||||
support = ArchitectureDetector.get_quantisation_support(architecture)
|
||||
basic_types = {"Q4_0", "Q5_0", "Q6_0", "Q8_0"}
|
||||
|
||||
supported = []
|
||||
skipped = []
|
||||
|
||||
for quant_type in requested_types:
|
||||
if quant_type in basic_types:
|
||||
# Basic types always supported
|
||||
supported.append(quant_type)
|
||||
elif support["k_quants"]:
|
||||
# K-quants supported for this architecture
|
||||
supported.append(quant_type)
|
||||
else:
|
||||
# K-quants not supported
|
||||
skipped.append(quant_type)
|
||||
|
||||
return supported, skipped
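A sketch of how the detector above feeds the quantisation plan; the model_type and architecture values are illustrative, as if read from a model's config.json:

arch = ArchitectureDetector.map_architecture("qwen2", "Qwen2ForCausalLM")
supported, skipped = ArchitectureDetector.filter_quantisation_types(
    arch, ["Q4_K_M", "Q6_K", "Q8_0"]
)
# For a supported architecture all three pass through; for an unsupported one
# only the basic type (Q8_0 here) survives and the K-quants land in `skipped`.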
|
494 helpers/llama_cpp/binary_manager.py Normal file
@@ -0,0 +1,494 @@
"""Binary manager for llama.cpp releases.
|
||||
|
||||
Downloads and manages llama.cpp binary releases from GitHub, handling
|
||||
platform detection, version checking, and caching.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import platform
|
||||
import shutil
|
||||
import subprocess
|
||||
import tarfile
|
||||
import time
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, ClassVar
|
||||
from urllib.request import urlopen, urlretrieve
|
||||
|
||||
from helpers.logger import logger
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from typing import Any
|
||||
|
||||
|
||||
class BinaryManager:
|
||||
"""Manages llama.cpp binary downloads and updates.
|
||||
|
||||
Automatically downloads appropriate llama.cpp releases based on platform,
|
||||
caches binaries locally, and checks for updates from GitHub releases.
|
||||
"""
|
||||
|
||||
GITHUB_API = "https://api.github.com/repos/ggml-org/llama.cpp/releases/latest"
|
||||
# Use local .cache directory in project
|
||||
BINARY_DIR = Path(".cache") / "llm-gguf-tools" / "binaries"
|
||||
|
||||
# Platform mappings to release asset patterns
|
||||
PLATFORM_PATTERNS: ClassVar[dict[tuple[str, str], list[str]]] = {
|
||||
("Linux", "x86_64"): ["linux-x64", "ubuntu-x64", "linux-amd64"],
|
||||
("Linux", "aarch64"): ["linux-arm64", "linux-aarch64"],
|
||||
("Darwin", "x86_64"): ["macos-x64", "darwin-x64", "macos-amd64"],
|
||||
("Darwin", "arm64"): ["macos-arm64", "darwin-arm64", "macos-aarch64"],
|
||||
("Windows", "AMD64"): ["win-x64", "windows-x64", "win64"],
|
||||
}
|
||||
|
||||
def __init__(self) -> None:
|
||||
"""Initialise binary manager."""
|
||||
self.BINARY_DIR.mkdir(parents=True, exist_ok=True)
|
||||
self.version_file = self.BINARY_DIR / "version.json"
|
||||
self.quantize_binary_path = self._get_binary_path("llama-quantize")
|
||||
self.imatrix_binary_path = self._get_binary_path("llama-imatrix")
|
||||
|
||||
def _get_binary_path(self, base_name: str) -> Path:
|
||||
"""Get path to binary.
|
||||
|
||||
Constructs the full path to a binary executable based on the base
|
||||
name, automatically adding the appropriate file extension for the
|
||||
current operating system platform.
|
||||
|
||||
Returns:
|
||||
Path where binary should be located.
|
||||
"""
|
||||
binary_name = f"{base_name}.exe" if platform.system() == "Windows" else base_name
|
||||
return self.BINARY_DIR / binary_name
|
||||
|
||||
def get_quantise_binary(self) -> Path | None:
|
||||
"""Get llama-quantize binary, downloading if necessary.
|
||||
|
||||
Returns:
|
||||
Path to binary if available, None if download fails.
|
||||
"""
|
||||
return self._get_binary("llama-quantize", self.quantize_binary_path)
|
||||
|
||||
def get_imatrix_binary(self) -> Path | None:
|
||||
"""Get llama-imatrix binary, downloading if necessary.
|
||||
|
||||
Returns:
|
||||
Path to binary if available, None if download fails.
|
||||
"""
|
||||
return self._get_binary("llama-imatrix", self.imatrix_binary_path)
|
||||
|
||||
def _get_binary(self, name: str, binary_path: Path) -> Path | None:
|
||||
"""Get a specific binary, downloading if necessary.
|
||||
|
||||
Checks for existing binaries and downloads the latest release if
|
||||
updates are needed. Falls back to existing binaries if download
|
||||
fails, ensuring robust binary availability for quantisation tasks.
|
||||
|
||||
Returns:
|
||||
Path to binary if available, None if download fails.
|
||||
"""
|
||||
# Check if we have a binary and if it needs updating
|
||||
if self._should_update():
|
||||
logger.info("🔄 Checking for llama.cpp updates...")
|
||||
if not self._download_latest():
|
||||
logger.warning("Failed to download latest llama.cpp release")
|
||||
# Fall back to existing binary if available
|
||||
if binary_path.exists():
|
||||
logger.info(f"Using existing {name} binary")
|
||||
return binary_path
|
||||
return None
|
||||
|
||||
if binary_path.exists():
|
||||
return binary_path
|
||||
|
||||
logger.info("📥 Downloading llama.cpp binaries...")
|
||||
if self._download_latest():
|
||||
return binary_path
|
||||
|
||||
return None
|
||||
|
||||
def _should_update(self) -> bool:
|
||||
"""Check if binary needs updating.
|
||||
|
||||
Returns:
|
||||
True if update needed, False otherwise.
|
||||
"""
|
||||
# If no binaries exist, we need to download
|
||||
if not self.quantize_binary_path.exists() or not self.imatrix_binary_path.exists():
|
||||
return True
|
||||
|
||||
# Check version file
|
||||
if not self.version_file.exists():
|
||||
return True
|
||||
|
||||
try:
|
||||
with Path(self.version_file).open(encoding="utf-8") as f:
|
||||
cached_version = json.load(f)
|
||||
|
||||
# Check if cached version is older than 7 days
|
||||
if time.time() - cached_version.get("timestamp", 0) > 7 * 24 * 3600:
|
||||
return True
|
||||
|
||||
except Exception:
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def _download_latest(self) -> bool:
|
||||
"""Download latest llama.cpp release.
|
||||
|
||||
Returns:
|
||||
True if successful, False otherwise.
|
||||
"""
|
||||
try:
|
||||
# Get latest release info
|
||||
release_info = self._get_latest_release()
|
||||
if not release_info:
|
||||
return False
|
||||
|
||||
# Find appropriate asset for platform
|
||||
asset_url = self._find_platform_asset(release_info["assets"])
|
||||
if not asset_url:
|
||||
logger.warning("No suitable binary found for this platform")
|
||||
return False
|
||||
|
||||
# Download and extract
|
||||
logger.info(f"📥 Downloading from: {asset_url}")
|
||||
if not self._download_and_extract(asset_url):
|
||||
return False
|
||||
|
||||
# Save version info
|
||||
self._save_version_info(release_info)
|
||||
|
||||
logger.info("✅ Successfully downloaded llama.cpp binary")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to download llama.cpp: {e}")
|
||||
return False
|
||||
else:
|
||||
return True
|
||||
|
||||
def _get_latest_release(self) -> dict[str, Any] | None:
|
||||
"""Get latest release info from GitHub API.
|
||||
|
||||
Returns:
|
||||
Release info dict or None if failed.
|
||||
"""
|
||||
try:
|
||||
with urlopen(self.GITHUB_API) as response: # noqa: S310
|
||||
return json.loads(response.read())
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to fetch release info: {e}")
|
||||
return None
|
||||
|
||||
def _find_platform_asset(self, assets: list[dict[str, Any]]) -> str | None:
|
||||
"""Find appropriate asset for current platform.
|
||||
|
||||
Returns:
|
||||
Download URL for appropriate asset or None.
|
||||
"""
|
||||
patterns = self._get_platform_patterns()
|
||||
if not patterns:
|
||||
return None
|
||||
|
||||
return self._select_best_asset(assets, patterns)
|
||||
|
||||
def _get_platform_patterns(self) -> list[str]:
|
||||
"""Get platform patterns for current system.
|
||||
|
||||
Returns:
|
||||
List of patterns to match in asset names.
|
||||
"""
|
||||
system = platform.system()
|
||||
machine = platform.machine()
|
||||
|
||||
# Get specific patterns for this platform
|
||||
patterns = self.PLATFORM_PATTERNS.get((system, machine), [])
|
||||
if patterns:
|
||||
return patterns
|
||||
|
||||
# Fall back to generic patterns
|
||||
generic_patterns = {
|
||||
"Linux": ["linux", "ubuntu"],
|
||||
"Darwin": ["macos", "darwin"],
|
||||
"Windows": ["win", "windows"],
|
||||
}
|
||||
return generic_patterns.get(system, [])
|
||||
|
||||
def _select_best_asset(self, assets: list[dict[str, Any]], patterns: list[str]) -> str | None:
|
||||
"""Select the best asset from available options.
|
||||
|
||||
Returns:
|
||||
Download URL for best matching asset or None.
|
||||
"""
|
||||
avoid_patterns = ["cuda", "rocm", "hip", "metal", "sycl"]
|
||||
prefer_patterns = ["cpu", "vulkan", "avx2", "avx"]
|
||||
|
||||
best_asset = None
|
||||
best_score = -1
|
||||
|
||||
for asset in assets:
|
||||
name = asset["name"].lower()
|
||||
|
||||
# Skip GPU-specific builds
|
||||
if any(pattern in name for pattern in avoid_patterns):
|
||||
continue
|
||||
|
||||
# Check platform match
|
||||
if not any(pattern in name for pattern in patterns):
|
||||
continue
|
||||
|
||||
score = self._score_asset(name, patterns, prefer_patterns)
|
||||
if score > best_score:
|
||||
best_score = score
|
||||
best_asset = asset
|
||||
|
||||
return best_asset["browser_download_url"] if best_asset else None
|
||||
|
||||
def _score_asset(self, name: str, patterns: list[str], prefer_patterns: list[str]) -> int:
|
||||
"""Score an asset based on platform and preference matching.
|
||||
|
||||
Returns:
|
||||
Numeric score for asset quality (higher is better).
|
||||
"""
|
||||
score = 0
|
||||
|
||||
# Platform match bonus
|
||||
if any(pattern in name for pattern in patterns):
|
||||
score += 10
|
||||
|
||||
# Preference bonuses
|
||||
for pattern in prefer_patterns:
|
||||
if pattern in name:
|
||||
score += 5
|
||||
|
||||
# Archive format preference
|
||||
system = platform.system()
|
||||
if (system == "Windows" and name.endswith(".zip")) or (
|
||||
system != "Windows" and name.endswith(".tar.gz")
|
||||
):
|
||||
score += 2
|
||||
|
||||
return score
|
||||
|
||||
def _download_and_extract(self, url: str) -> bool:
|
||||
"""Download and extract binary archive.
|
||||
|
||||
Downloads the binary archive from the specified URL and extracts
|
||||
the necessary binaries and shared libraries. Handles both ZIP and
|
||||
TAR.GZ formats with appropriate platform-specific permissions.
|
||||
|
||||
Returns:
|
||||
True if successful, False otherwise.
|
||||
"""
|
||||
try:
|
||||
# Download to temp file
|
||||
temp_file = self.BINARY_DIR / "temp_download"
|
||||
logger.info("⬇️ Downloading archive...")
|
||||
urlretrieve(url, temp_file) # noqa: S310
|
||||
|
||||
# Extract based on file type
|
||||
if url.endswith(".zip"):
|
||||
with zipfile.ZipFile(temp_file, "r") as zf:
|
||||
self._extract_binary_from_archive(zf)
|
||||
elif url.endswith((".tar.gz", ".tgz")):
|
||||
with tarfile.open(temp_file, "r:gz") as tf:
|
||||
self._extract_binary_from_archive(tf)
|
||||
else:
|
||||
logger.error(f"Unknown archive format: {url}")
|
||||
return False
|
||||
|
||||
# Clean up temp file
|
||||
temp_file.unlink()
|
||||
|
||||
# Make binaries executable on Unix
|
||||
if platform.system() != "Windows":
|
||||
self.quantize_binary_path.chmod(0o755)
|
||||
self.imatrix_binary_path.chmod(0o755)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to download and extract: {e}")
|
||||
return False
|
||||
else:
|
||||
return True
|
||||
|
||||
def _extract_binary_from_archive(self, archive: Any) -> None:
|
||||
"""Extract llama binaries and their dependencies from archive."""
|
||||
target_binaries = {
|
||||
"llama-quantize": ["llama-quantize", "llama-quantize.exe", "quantize", "quantize.exe"],
|
||||
"llama-imatrix": ["llama-imatrix", "llama-imatrix.exe", "imatrix", "imatrix.exe"],
|
||||
}
|
||||
|
||||
# Also extract shared libraries
|
||||
shared_libs = [
|
||||
"libllama.so",
|
||||
"libggml-base.so",
|
||||
"libggml.so",
|
||||
"libllama.dll",
|
||||
"libggml.dll",
|
||||
]
|
||||
|
||||
members = self._get_archive_members(archive)
|
||||
extracted = self._extract_matching_binaries(archive, members, target_binaries)
|
||||
self._extract_shared_libraries(archive, members, shared_libs)
|
||||
self._cleanup_extracted_directories()
|
||||
self._report_missing_binaries(extracted)
|
||||
|
||||
def _get_archive_members(self, archive: Any) -> list[str]:
|
||||
"""Get list of members from archive.
|
||||
|
||||
Returns:
|
||||
List of member names in the archive.
|
||||
"""
|
||||
if isinstance(archive, zipfile.ZipFile):
|
||||
return archive.namelist()
|
||||
return [m.name for m in archive.getmembers()]
|
||||
|
||||
def _extract_matching_binaries(
|
||||
self,
|
||||
archive: Any,
|
||||
members: list[str],
|
||||
target_binaries: dict[str, list[str]],
|
||||
) -> set[str]:
|
||||
"""Extract binaries that match target patterns.
|
||||
|
||||
Returns:
|
||||
Set of successfully extracted binary types.
|
||||
"""
|
||||
extracted = set()
|
||||
for member in members:
|
||||
base_name = Path(member).name
|
||||
|
||||
for binary_type, possible_names in target_binaries.items():
|
||||
if base_name in possible_names:
|
||||
self._extract_single_binary(archive, member, binary_type)
|
||||
extracted.add(binary_type)
|
||||
break
|
||||
return extracted
|
||||
|
||||
def _extract_single_binary(self, archive: Any, member: str, binary_type: str) -> None:
|
||||
"""Extract a single binary from archive."""
|
||||
logger.info(f"📦 Extracting {Path(member).name} as {binary_type}...")
|
||||
target_path = self._get_binary_path(binary_type)
|
||||
|
||||
if isinstance(archive, zipfile.ZipFile):
|
||||
self._extract_from_zip(archive, member, target_path)
|
||||
else: # tarfile
|
||||
self._extract_from_tar(archive, member, target_path)
|
||||
|
||||
def _extract_from_zip(self, archive: zipfile.ZipFile, member: str, target_path: Path) -> None:
|
||||
"""Extract binary from zip archive."""
|
||||
temp_path = self.BINARY_DIR / "temp_binary"
|
||||
with archive.open(member) as source, temp_path.open("wb") as target:
|
||||
shutil.copyfileobj(source, target)
|
||||
shutil.move(str(temp_path), str(target_path))
|
||||
|
||||
def _extract_from_tar(self, archive: tarfile.TarFile, member: str, target_path: Path) -> None:
|
||||
"""Extract binary from tar archive."""
|
||||
archive.extract(member, self.BINARY_DIR)
|
||||
extracted_path = self.BINARY_DIR / member
|
||||
if extracted_path != target_path:
|
||||
shutil.move(str(extracted_path), str(target_path))
|
||||
|
||||
def _cleanup_extracted_directories(self) -> None:
|
||||
"""Clean up any extracted directories."""
|
||||
for item in self.BINARY_DIR.iterdir():
|
||||
if item.is_dir() and item.name != "binaries":
|
||||
shutil.rmtree(item)
|
||||
|
||||
def _extract_shared_libraries(
|
||||
self, archive: Any, members: list[str], lib_patterns: list[str]
|
||||
) -> None:
|
||||
"""Extract shared libraries needed by the binaries.
|
||||
|
||||
Searches through archive members to find shared libraries matching
|
||||
the specified patterns and extracts them to ensure proper binary
|
||||
functionality. Sets appropriate permissions on Unix systems.
|
||||
"""
|
||||
for member in members:
|
||||
base_name = Path(member).name
|
||||
if any(lib in base_name for lib in lib_patterns):
|
||||
logger.info(f"📚 Extracting library: {base_name}")
|
||||
target_path = self.BINARY_DIR / base_name
|
||||
|
||||
if isinstance(archive, zipfile.ZipFile):
|
||||
temp_path = self.BINARY_DIR / "temp_lib"
|
||||
with archive.open(member) as source, temp_path.open("wb") as target:
|
||||
shutil.copyfileobj(source, target)
|
||||
shutil.move(str(temp_path), str(target_path))
|
||||
else: # tarfile
|
||||
archive.extract(member, self.BINARY_DIR)
|
||||
extracted_path = self.BINARY_DIR / member
|
||||
if extracted_path != target_path:
|
||||
shutil.move(str(extracted_path), str(target_path))
|
||||
|
||||
# Make libraries executable on Unix
|
||||
if platform.system() != "Windows":
|
||||
target_path.chmod(0o755)
|
||||
|
||||
def _report_missing_binaries(self, extracted: set[str]) -> None:
|
||||
"""Report any missing binaries."""
|
||||
if "llama-quantize" not in extracted:
|
||||
logger.warning("llama-quantize binary not found in archive")
|
||||
if "llama-imatrix" not in extracted:
|
||||
logger.warning("llama-imatrix binary not found in archive")
|
||||
|
||||
def _save_version_info(self, release_info: dict[str, Any]) -> None:
|
||||
"""Save version information to cache.
|
||||
|
||||
Stores release version, timestamp, and URL information to the local
|
||||
cache to enable version checking and update determination for
|
||||
future binary manager operations.
|
||||
"""
|
||||
version_data = {
|
||||
"version": release_info.get("tag_name", "unknown"),
|
||||
"timestamp": time.time(),
|
||||
"url": release_info.get("html_url", ""),
|
||||
}
|
||||
|
||||
with Path(self.version_file).open("w", encoding="utf-8") as f:
|
||||
json.dump(version_data, f, indent=2)
|
||||
|
||||
logger.info(f"📌 Cached version: {version_data['version']}")
|
||||
|
||||
def check_binary_works(self, binary_path: Path | None = None) -> bool:
|
||||
"""Check if the binary actually works.
|
||||
|
||||
Validates that the specified binary can execute properly by running
|
||||
a help command with appropriate environment variables set for shared
|
||||
library loading. Defaults to checking the quantise binary if no path provided.
|
||||
|
||||
Returns:
|
||||
True if binary executes successfully, False otherwise.
|
||||
"""
|
||||
if binary_path is None:
|
||||
binary_path = self.quantize_binary_path
|
||||
|
||||
if not binary_path.exists():
|
||||
return False
|
||||
|
||||
try:
|
||||
# Set LD_LIBRARY_PATH to include binary directory for shared libraries
|
||||
env = os.environ.copy()
|
||||
if platform.system() != "Windows":
|
||||
lib_path = str(self.BINARY_DIR)
|
||||
if "LD_LIBRARY_PATH" in env:
|
||||
env["LD_LIBRARY_PATH"] = f"{lib_path}:{env['LD_LIBRARY_PATH']}"
|
||||
else:
|
||||
env["LD_LIBRARY_PATH"] = lib_path
|
||||
|
||||
result = subprocess.run(
|
||||
[str(binary_path), "--help"],
|
||||
check=False,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=5,
|
||||
env=env,
|
||||
)
|
||||
except Exception:
|
||||
return False
|
||||
else:
|
||||
# llama-quantize returns 1 for --help but shows usage, which means it works
|
||||
return result.returncode in {0, 1} and "usage:" in result.stdout.lower()
|
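For orientation, a minimal sketch of validating a downloaded binary with this manager, following the same pattern the new quantiser and imatrix helpers use further down; the import path is taken from this changeset, and the call order is an assumption rather than a prescribed workflow.

from helpers.llama_cpp.binary_manager import BinaryManager

manager = BinaryManager()

# Fetch (or reuse a cached) llama-quantize binary from the GitHub releases,
# then confirm it actually runs with the bundled shared libraries.
binary = manager.get_quantise_binary()
if binary is not None and manager.check_binary_works(binary):
    print(f"llama-quantize ready at {binary}")
else:
    print("llama-quantize unavailable - place the binary manually or retry")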
322
helpers/llama_cpp/imatrix.py
Normal file
|
@ -0,0 +1,322 @@
|
|||
"""Importance matrix operations for llama.cpp.
|
||||
|
||||
Handles importance matrix generation and management for improved
|
||||
quantisation quality.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import platform
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from helpers.filesystem import FilesystemService
|
||||
from helpers.llama_cpp.binary_manager import BinaryManager
|
||||
from helpers.logger import logger
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from helpers.models.quantisation import ModelSource
|
||||
|
||||
|
||||
class IMatrixHandler:
|
||||
"""Handles importance matrix file management.
|
||||
|
||||
Manages detection and use of existing importance matrix files for
|
||||
quantisation guidance.
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
"""Initialise IMatrixHandler."""
|
||||
self.fs = FilesystemService()
|
||||
|
||||
def find_imatrix(self, model_dir: Path) -> Path | None:
|
||||
"""Find existing imatrix file in model directory.
|
||||
|
||||
Returns:
|
||||
Path to imatrix file if found, None otherwise.
|
||||
"""
|
||||
imatrix_path = model_dir / "imatrix.dat"
|
||||
|
||||
if imatrix_path.exists():
|
||||
file_size = self.fs.get_file_size(imatrix_path)
|
||||
logger.info(f"Found existing imatrix: {imatrix_path.name} ({file_size})")
|
||||
return imatrix_path
|
||||
|
||||
return None
|
||||
|
||||
def prompt_for_user_imatrix(self, model_dir: Path) -> Path | None:
|
||||
"""Prompt user for existing imatrix file.
|
||||
|
||||
Returns:
|
||||
Path to user-provided imatrix, or None if not available.
|
||||
"""
|
||||
imatrix_path = model_dir / "imatrix.dat"
|
||||
|
||||
logger.info(f"Model directory: {model_dir}")
|
||||
logger.info(f"Looking for imatrix file at: {imatrix_path}")
|
||||
logger.info(
|
||||
"Tip: You can download pre-computed imatrix files from Bartowski's repositories!"
|
||||
)
|
||||
logger.info(
|
||||
" Example: https://huggingface.co/bartowski/MODEL-NAME-GGUF/resolve/main/MODEL-NAME.imatrix"
|
||||
)
|
||||
|
||||
response = (
|
||||
input("\n❓ Do you have an imatrix file to place in the model directory? (y/N): ")
|
||||
.strip()
|
||||
.lower()
|
||||
)
|
||||
|
||||
if response != "y":
|
||||
return None
|
||||
|
||||
logger.info(f"Please place your imatrix.dat file in: {model_dir}")
|
||||
input("⏳ Press Enter when you've placed the imatrix.dat file (or Ctrl+C to cancel)...")
|
||||
|
||||
if imatrix_path.exists():
|
||||
file_size = self.fs.get_file_size(imatrix_path)
|
||||
logger.info(f"Found imatrix file! ({file_size})")
|
||||
return imatrix_path
|
||||
|
||||
logger.warning("No imatrix.dat file found - continuing without imatrix")
|
||||
return None
|
||||
|
||||
|
||||
class IMatrixGenerator:
|
||||
"""Generates importance matrices for quantisation guidance.
|
||||
|
||||
Uses llama-imatrix binary to compute importance matrices from
|
||||
calibration data, which helps preserve model quality during
|
||||
quantisation by identifying critical weights.
|
||||
"""
|
||||
|
||||
# Default calibration data location
|
||||
CALIBRATION_DATA = Path("resources") / "imatrix_data.txt"
|
||||
|
||||
def __init__(self) -> None:
|
||||
"""Initialise imatrix generator."""
|
||||
self.binary_manager = BinaryManager()
|
||||
self.imatrix_binary = self._get_imatrix_binary()
|
||||
|
||||
def _get_imatrix_binary(self) -> Path | None:
|
||||
"""Get llama-imatrix binary, downloading if necessary.
|
||||
|
||||
Returns:
|
||||
Path to binary if found, None otherwise.
|
||||
"""
|
||||
# First check local directory for manual placement
|
||||
local_binary = Path("./llama-imatrix")
|
||||
if local_binary.exists():
|
||||
logger.info(f"Using local llama-imatrix binary: {local_binary}")
|
||||
return local_binary
|
||||
|
||||
# Download from GitHub releases
|
||||
binary_path = self.binary_manager.get_imatrix_binary()
|
||||
if binary_path and self.binary_manager.check_binary_works(binary_path):
|
||||
logger.info(f"Using llama-imatrix binary: {binary_path}")
|
||||
return binary_path
|
||||
|
||||
logger.warning("llama-imatrix binary not available")
|
||||
return None
|
||||
|
||||
def can_generate(self) -> bool:
|
||||
"""Check if imatrix generation is available.
|
||||
|
||||
Returns:
|
||||
True if binary and calibration data are available.
|
||||
"""
|
||||
return self.imatrix_binary is not None and self.CALIBRATION_DATA.exists()
|
||||
|
||||
def generate_imatrix(
|
||||
self,
|
||||
f16_model_path: Path,
|
||||
output_path: Path,
|
||||
calibration_data: Path | None = None,
|
||||
) -> bool:
|
||||
"""Generate importance matrix for a model.
|
||||
|
||||
Returns:
|
||||
True if generation successful, False otherwise.
|
||||
"""
|
||||
validation_error = self._validate_generation_inputs(f16_model_path, calibration_data)
|
||||
if validation_error:
|
||||
logger.error(validation_error)
|
||||
return False
|
||||
|
||||
cal_data = calibration_data or self.CALIBRATION_DATA
|
||||
cmd = self._build_imatrix_command(f16_model_path, cal_data, output_path)
|
||||
|
||||
self._log_generation_start(f16_model_path, cal_data, output_path)
|
||||
|
||||
return self._execute_imatrix_generation(cmd, output_path)
|
||||
|
||||
def _validate_generation_inputs(
|
||||
self,
|
||||
f16_model_path: Path,
|
||||
calibration_data: Path | None,
|
||||
) -> str | None:
|
||||
"""Validate inputs for imatrix generation.
|
||||
|
||||
Returns:
|
||||
Error message if validation fails, None if valid.
|
||||
"""
|
||||
if not self.imatrix_binary:
|
||||
return "llama-imatrix binary not available"
|
||||
|
||||
if not f16_model_path.exists():
|
||||
return f"Model file not found: {f16_model_path}"
|
||||
|
||||
cal_data = calibration_data or self.CALIBRATION_DATA
|
||||
if not cal_data.exists():
|
||||
return f"Calibration data not found: {cal_data}"
|
||||
|
||||
return None
|
||||
|
||||
def _build_imatrix_command(
|
||||
self,
|
||||
f16_model_path: Path,
|
||||
cal_data: Path,
|
||||
output_path: Path,
|
||||
) -> list[str]:
|
||||
"""Build command for imatrix generation.
|
||||
|
||||
Returns:
|
||||
Command list ready for subprocess execution.
|
||||
"""
|
||||
return [
|
||||
str(self.imatrix_binary),
|
||||
"-m",
|
||||
str(f16_model_path),
|
||||
"-f",
|
||||
str(cal_data),
|
||||
"-o",
|
||||
str(output_path),
|
||||
"--chunks",
|
||||
"128", # Process in chunks for stability
|
||||
]
|
||||
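As a concrete illustration (the paths are placeholders, not part of the changeset), the command assembled above for a hypothetical model would look like this:

cmd = [
    "./llama-imatrix",
    "-m", "models/example/example-f16.gguf",  # hypothetical F16 model
    "-f", "resources/imatrix_data.txt",       # default calibration data
    "-o", "models/example/imatrix.dat",       # output importance matrix
    "--chunks", "128",                        # process in chunks for stability
]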
|
||||
def _log_generation_start(
|
||||
self,
|
||||
f16_model_path: Path,
|
||||
cal_data: Path,
|
||||
output_path: Path,
|
||||
) -> None:
|
||||
"""Log the start of imatrix generation."""
|
||||
logger.info("🧮 Generating importance matrix...")
|
||||
logger.info(f"📊 Model: {f16_model_path.name}")
|
||||
logger.info(f"📝 Calibration data: {cal_data.name}")
|
||||
logger.info(f"💾 Output: {output_path.name}")
|
||||
|
||||
def _execute_imatrix_generation(self, cmd: list[str], output_path: Path) -> bool:
|
||||
"""Execute the imatrix generation process.
|
||||
|
||||
Returns:
|
||||
True if generation completed successfully, False otherwise.
|
||||
"""
|
||||
# Set LD_LIBRARY_PATH for shared libraries
|
||||
env = os.environ.copy()
|
||||
if platform.system() != "Windows":
|
||||
lib_path = str(self.binary_manager.BINARY_DIR)
|
||||
if "LD_LIBRARY_PATH" in env:
|
||||
env["LD_LIBRARY_PATH"] = f"{lib_path}:{env['LD_LIBRARY_PATH']}"
|
||||
else:
|
||||
env["LD_LIBRARY_PATH"] = lib_path
|
||||
|
||||
try:
|
||||
process = subprocess.Popen(
|
||||
cmd,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.STDOUT,
|
||||
universal_newlines=True,
|
||||
bufsize=1,
|
||||
env=env,
|
||||
)
|
||||
|
||||
self._stream_process_output(process)
|
||||
return self._handle_process_completion(process, output_path)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"❌ Imatrix generation failed: {e}")
|
||||
return False
|
||||
|
||||
def _stream_process_output(self, process: subprocess.Popen[str]) -> None:
|
||||
"""Stream output from the running process."""
|
||||
while True:
|
||||
if process.stdout is not None:
|
||||
output = process.stdout.readline()
|
||||
else:
|
||||
break
|
||||
if not output and process.poll() is not None:
|
||||
break
|
||||
if output:
|
||||
# Filter progress updates for cleaner output
|
||||
line = output.strip()
|
||||
if line and not line.startswith("["):
|
||||
logger.info(f" {line}")
|
||||
|
||||
def _handle_process_completion(self, process: subprocess.Popen[str], output_path: Path) -> bool:
|
||||
"""Handle completion of the imatrix generation process.
|
||||
|
||||
Returns:
|
||||
True if process completed successfully and output exists, False otherwise.
|
||||
"""
|
||||
return_code = process.poll()
|
||||
if return_code != 0:
|
||||
logger.error(f"❌ Imatrix generation failed with return code {return_code}")
|
||||
return False
|
||||
|
||||
if not output_path.exists():
|
||||
logger.error("Generation completed but output file not found")
|
||||
return False
|
||||
|
||||
size_mb = output_path.stat().st_size / (1024 * 1024)
|
||||
logger.info(f"✅ Generated imatrix: {output_path.name} ({size_mb:.1f} MB)")
|
||||
return True
|
||||
|
||||
def prompt_for_generation(
|
||||
self,
|
||||
model_source: ModelSource,
|
||||
model_dir: Path,
|
||||
f16_model_path: Path,
|
||||
) -> Path | None:
|
||||
"""Prompt user to generate imatrix.
|
||||
|
||||
Interactively prompts the user to generate an importance matrix
|
||||
for enhanced quantisation quality using the model source information,
|
||||
directory, and F16 model path. Checks binary availability before prompting.
|
||||
|
||||
Returns:
|
||||
Path to generated imatrix or None if skipped.
|
||||
"""
|
||||
if not self.can_generate():
|
||||
logger.info("⚠️ Imatrix generation not available (missing binary or calibration data)")
|
||||
return None
|
||||
|
||||
logger.info("\n" + "=" * 70)
|
||||
logger.info("📊 Importance Matrix Generation")
|
||||
logger.info("=" * 70)
|
||||
logger.info(
|
||||
"\nImportance matrices improve quantisation quality by identifying"
|
||||
"\ncritical weights in the model. This process takes 5-10 minutes"
|
||||
"\nbut significantly improves the quality of smaller quantisations."
|
||||
)
|
||||
logger.info(f"\nModel: {model_source.model_name}")
|
||||
logger.info(f"Calibration data: {self.CALIBRATION_DATA.name}")
|
||||
|
||||
response = input("\n❓ Generate importance matrix? (Y/n): ").strip().lower()
|
||||
|
||||
if response == "n":
|
||||
logger.info("Skipping imatrix generation")
|
||||
return None
|
||||
|
||||
# Generate imatrix
|
||||
output_path = model_dir / "imatrix.dat"
|
||||
logger.info("\n⏳ Generating importance matrix (this may take 5-10 minutes)...")
|
||||
|
||||
if self.generate_imatrix(f16_model_path, output_path):
|
||||
return output_path
|
||||
|
||||
logger.warning("Failed to generate imatrix, continuing without it")
|
||||
return None
|
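Taken together, a minimal sketch of the imatrix workflow these two classes provide; the model paths are placeholders and the non-interactive flow shown here is an assumption about typical use, not part of the diff.

from pathlib import Path

from helpers.llama_cpp.imatrix import IMatrixGenerator, IMatrixHandler

model_dir = Path("models/example")           # hypothetical model directory
f16_model = model_dir / "example-f16.gguf"   # hypothetical F16 GGUF

# Reuse an existing importance matrix if one is already in place...
imatrix = IMatrixHandler().find_imatrix(model_dir)

# ...otherwise generate one from the bundled calibration data.
if imatrix is None:
    generator = IMatrixGenerator()
    if generator.can_generate():
        candidate = model_dir / "imatrix.dat"
        if generator.generate_imatrix(f16_model, candidate):
            imatrix = candidate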
219
helpers/llama_cpp/quantiser.py
Normal file
|
@ -0,0 +1,219 @@
|
|||
"""Direct llama.cpp quantisation execution.
|
||||
|
||||
Provides direct execution of llama.cpp quantisation binary with proper
|
||||
tensor-specific override support for L and XL variants.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import platform
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from helpers.filesystem import FilesystemService
|
||||
from helpers.llama_cpp.binary_manager import BinaryManager
|
||||
from helpers.logger import logger
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from helpers.models.quantisation import QuantisationConfig
|
||||
|
||||
|
||||
class QuantisationExecutor:
|
||||
"""Executes llama.cpp quantisation with tensor overrides.
|
||||
|
||||
Provides direct binary execution with proper command-line flags for
|
||||
tensor-specific overrides, supporting Bartowski-style L and XL variants.
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
"""Initialise quantisation executor."""
|
||||
self.fs = FilesystemService()
|
||||
self.binary_manager = BinaryManager()
|
||||
self.quantise_binary = self._get_quantise_binary()
|
||||
self.last_error: str | None = None # Track last error type
|
||||
|
||||
def _get_quantise_binary(self) -> Path | None:
|
||||
"""Get llama-quantize binary, downloading if necessary.
|
||||
|
||||
Returns:
|
||||
Path to binary if found, None otherwise.
|
||||
"""
|
||||
# First check local directory for manual placement
|
||||
local_binary = Path("./llama-quantize")
|
||||
if local_binary.exists():
|
||||
logger.info(f"Using local llama-quantize binary: {local_binary}")
|
||||
return local_binary
|
||||
|
||||
# Download from GitHub releases
|
||||
binary_path = self.binary_manager.get_quantise_binary()
|
||||
if binary_path and self.binary_manager.check_binary_works(binary_path):
|
||||
logger.info(f"Using llama-quantize binary: {binary_path}")
|
||||
return binary_path
|
||||
|
||||
logger.error("Failed to obtain llama-quantize binary")
|
||||
logger.info(
|
||||
"You can manually download it from: https://github.com/ggml-org/llama.cpp/releases"
|
||||
)
|
||||
return None
|
||||
|
||||
def execute_quantisation(
|
||||
self,
|
||||
input_path: Path,
|
||||
output_path: Path,
|
||||
config: QuantisationConfig,
|
||||
imatrix_path: Path | None = None,
|
||||
) -> bool:
|
||||
"""Execute quantisation using llama.cpp binary.
|
||||
|
||||
Builds and executes llama-quantize command with proper tensor override
|
||||
flags for L and XL variants.
|
||||
|
||||
Returns:
|
||||
True if quantisation successful, False otherwise.
|
||||
"""
|
||||
if not self.quantise_binary:
|
||||
logger.error("llama-quantize binary not available")
|
||||
return False
|
||||
|
||||
# Build command
|
||||
cmd = self._build_quantisation_command(input_path, output_path, config, imatrix_path)
|
||||
|
||||
# Execute with real-time output
|
||||
return self._execute_command(cmd)
|
||||
|
||||
def _build_quantisation_command(
|
||||
self,
|
||||
input_path: Path,
|
||||
output_path: Path,
|
||||
config: QuantisationConfig,
|
||||
imatrix_path: Path | None,
|
||||
) -> list[str]:
|
||||
"""Build llama-quantize command with tensor overrides.
|
||||
|
||||
Returns:
|
||||
Command arguments as list.
|
||||
"""
|
||||
cmd = [str(self.quantise_binary)]
|
||||
|
||||
# Add imatrix if available
|
||||
if imatrix_path:
|
||||
cmd.extend(["--imatrix", str(imatrix_path)])
|
||||
|
||||
# Add tensor overrides for L and XL variants
|
||||
if config.output_type:
|
||||
cmd.extend(["--output-tensor-type", config.output_type])
|
||||
if config.embedding_type:
|
||||
cmd.extend(["--token-embedding-type", config.embedding_type])
|
||||
|
||||
# Add input, output, and quantisation type
|
||||
cmd.extend([str(input_path), str(output_path), config.base_type])
|
||||
|
||||
return cmd
|
||||
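For example, an XL-style profile such as Q3_K_XL (Q3_K_M base with Q8_0 embeddings and a Q6_K output tensor, per the configs in this changeset) would yield a command of roughly this shape; the file paths are placeholders.

cmd = [
    "./llama-quantize",
    "--imatrix", "models/example/imatrix.dat",   # optional importance matrix
    "--output-tensor-type", "q6_k",              # XL-style output override
    "--token-embedding-type", "q8_0",            # XL-style embedding override
    "models/example/example-f16.gguf",           # input F16 model
    "models/example/example-Q3_K_XL.gguf",       # quantised output
    "Q3_K_M",                                    # base quantisation type
]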
|
||||
def _setup_environment(self) -> dict[str, str]:
|
||||
"""Set up environment variables for quantisation command.
|
||||
|
||||
Returns:
|
||||
Environment dictionary with necessary library paths.
|
||||
"""
|
||||
env = os.environ.copy()
|
||||
if platform.system() != "Windows":
|
||||
lib_path = str(self.binary_manager.BINARY_DIR)
|
||||
if "LD_LIBRARY_PATH" in env:
|
||||
env["LD_LIBRARY_PATH"] = f"{lib_path}:{env['LD_LIBRARY_PATH']}"
|
||||
else:
|
||||
env["LD_LIBRARY_PATH"] = lib_path
|
||||
return env
|
||||
|
||||
def _process_output_stream(self, process: subprocess.Popen) -> tuple[list[str], bool]:
|
||||
"""Process subprocess output stream and detect errors.
|
||||
|
||||
Returns:
|
||||
Tuple of (output_lines, architecture_error_detected).
|
||||
"""
|
||||
output_lines = []
|
||||
architecture_error = False
|
||||
|
||||
if process.stdout:
|
||||
for line in iter(process.stdout.readline, ""):
|
||||
if line:
|
||||
cleaned_line = line.rstrip()
|
||||
output_lines.append(cleaned_line)
|
||||
logger.info(f" {cleaned_line}")
|
||||
|
||||
# Check for architecture errors
|
||||
if any(
|
||||
error_text in cleaned_line.lower()
|
||||
for error_text in [
|
||||
"unknown model architecture",
|
||||
"unsupported architecture",
|
||||
"unknown architecture",
|
||||
"architecture not supported",
|
||||
"model architecture",
|
||||
"llama_model_load: error loading model",
|
||||
]
|
||||
):
|
||||
architecture_error = True
|
||||
|
||||
return output_lines, architecture_error
|
||||
|
||||
def _handle_architecture_error(self, output_lines: list[str]) -> bool:
|
||||
"""Handle architecture-related errors by checking output.
|
||||
|
||||
Returns:
|
||||
True if architecture error was detected and handled.
|
||||
"""
|
||||
# Look for architecture info in recent output
|
||||
for line in output_lines[-10:]: # Check last 10 lines
|
||||
if "architecture" in line.lower():
|
||||
logger.error("❌ Architecture not supported by llama.cpp")
|
||||
logger.error(" so cannot be quantised with current llama.cpp but")
|
||||
logger.error(" F16 GGUF file can be used for inference if supported")
|
||||
# Store this for the orchestrator to detect
|
||||
self.last_error = "unsupported_architecture"
|
||||
return True
|
||||
return False
|
||||
|
||||
def _execute_command(self, cmd: list[str]) -> bool:
|
||||
"""Execute command with real-time output streaming.
|
||||
|
||||
Returns:
|
||||
True if successful, False otherwise.
|
||||
"""
|
||||
try:
|
||||
logger.info(f"🔧 Executing: {' '.join(cmd)}")
|
||||
|
||||
env = self._setup_environment()
|
||||
|
||||
# Execute with real-time output
|
||||
process = subprocess.Popen(
|
||||
cmd,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.STDOUT,
|
||||
text=True,
|
||||
bufsize=1,
|
||||
universal_newlines=True,
|
||||
env=env,
|
||||
)
|
||||
|
||||
output_lines, architecture_error = self._process_output_stream(process)
|
||||
|
||||
return_code = process.poll()
|
||||
if return_code == 0:
|
||||
logger.info("✅ Quantisation successful!")
|
||||
return True
|
||||
|
||||
# Check if this was an architecture error
|
||||
if (architecture_error or return_code == 1) and self._handle_architecture_error(
|
||||
output_lines
|
||||
):
|
||||
return False
|
||||
|
||||
logger.error(f"❌ Quantisation failed with return code {return_code}")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"❌ Quantisation failed with exception: {e}")
|
||||
|
||||
return False
|
|
@ -25,38 +25,37 @@ class QuantisationType(StrEnum):
|
|||
embeddings, attention layers, and feed-forward networks.
|
||||
"""
|
||||
|
||||
# Q2 variants (smallest, lowest quality)
|
||||
# Q2 variants
|
||||
Q2_0 = "Q2_0" # Basic 2-bit quantisation (flat, no K-quant optimisations)
|
||||
Q2_K = "Q2_K"
|
||||
Q2_K_S = "Q2_K_S"
|
||||
|
||||
# Q3 K-quants
|
||||
# Q3 variants
|
||||
Q3_0 = "Q3_0" # Basic 3-bit quantisation (flat, no K-quant optimisations)
|
||||
Q3_K_S = "Q3_K_S"
|
||||
Q3_K_M = "Q3_K_M" # llama.cpp default: Q6_K embeddings, Q4_K output, Q5_K V/FFN-down
|
||||
Q3_K_L = "Q3_K_L" # Bartowski: Upgrades output to Q5_K (from M baseline)
|
||||
Q3_K_XL = "Q3_K_XL" # Bartowski: Q8_0 embeddings + Q5_K output (from M baseline)
|
||||
|
||||
# Q4 K-quants (most popular)
|
||||
# Q4 variants
|
||||
Q4_0 = "Q4_0" # Basic 4-bit quantisation (flat, no K-quant optimisations)
|
||||
Q4_1 = "Q4_1"
|
||||
Q4_K_S = "Q4_K_S"
|
||||
Q4_K_M = "Q4_K_M" # llama.cpp default: Q6_K embeddings, Q6_K V/FFN-down
|
||||
Q4_K_L = "Q4_K_L" # Bartowski: Upgrades embeddings to Q8_0 (from M baseline)
|
||||
|
||||
# Q5 K-quants
|
||||
# Q5 variants
|
||||
Q5_0 = "Q5_0" # Basic 5-bit quantisation (flat, no K-quant optimisations)
|
||||
Q5_1 = "Q5_1"
|
||||
Q5_K_S = "Q5_K_S"
|
||||
Q5_K_M = "Q5_K_M" # llama.cpp default: Q6_K embeddings, Q6_K V/FFN-down
|
||||
Q5_K_L = "Q5_K_L" # Bartowski: Upgrades embeddings to Q8_0 (from M baseline)
|
||||
|
||||
# Q6_K variants
|
||||
# Q6 variants
|
||||
Q6_0 = "Q6_0" # Basic 6-bit quantisation (flat, no K-quant optimisations)
|
||||
Q6_K = "Q6_K"
|
||||
Q6_K_L = "Q6_K_L" # Bartowski: Upgrades embeddings to Q8_0 (all else stays Q6_K)
|
||||
|
||||
# Q8_0 (highest common quantisation)
|
||||
Q8_0 = "Q8_0"
|
||||
|
||||
# Legacy quantisation formats
|
||||
Q4_0 = "Q4_0"
|
||||
Q4_1 = "Q4_1"
|
||||
Q5_0 = "Q5_0"
|
||||
Q5_1 = "Q5_1"
|
||||
# Q8 variants
|
||||
Q8_0 = "Q8_0" # Basic 8-bit quantisation (flat, no K-quant optimisations)
|
||||
Q8_K = "Q8_K" # K-quant 8-bit (optimised by llama.cpp)
|
||||
# F16 variants
|
||||
F16 = "F16" # F16 quantisation
|
||||
|
||||
|
||||
class URLType(StrEnum):
|
||||
|
@ -102,7 +101,12 @@ class QuantisationConfig(BaseModel):
|
|||
Dictionary mapping layer types to quantisation specifications for display.
|
||||
"""
|
||||
# Build base quantisation string from precision
|
||||
base = f"Q{self.base_precision}_K" if self.base_precision < 8 else "Q8_0"
|
||||
# For basic types (Q2_0, Q3_0, Q4_0, Q5_0, Q6_0, Q8_0), use the actual base_type
|
||||
# For K-quants, build from precision
|
||||
if self.base_type in {"Q2_0", "Q3_0", "Q4_0", "Q5_0", "Q6_0", "Q8_0"}:
|
||||
base = self.base_type
|
||||
else:
|
||||
base = f"Q{self.base_precision}_K" if self.base_precision < 8 else "Q8_0"
|
||||
|
||||
# Get inherent enhancements for display - inherit from base type if this is L/XL variant
|
||||
enhancements = self.inherent_enhancements or {}
|
||||
|
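A small sketch mirroring the base-string selection above (the helper name is illustrative only, not part of the changeset):

def display_base(base_type: str, base_precision: int) -> str:
    """Illustrative mirror of the selection logic shown in the hunk above."""
    if base_type in {"Q2_0", "Q3_0", "Q4_0", "Q5_0", "Q6_0", "Q8_0"}:
        return base_type                      # flat types keep their own name
    return f"Q{base_precision}_K" if base_precision < 8 else "Q8_0"

assert display_base("Q6_0", 6) == "Q6_0"      # basic 6-bit stays Q6_0
assert display_base("Q4_K_M", 4) == "Q4_K"    # K-quants rebuilt from precision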
@ -166,10 +170,9 @@ class QuantisationConfig(BaseModel):
|
|||
== layers["gate_up"]
|
||||
== layers["down"]
|
||||
):
|
||||
if self.name == "Q6_K":
|
||||
return "Q6_K all layers"
|
||||
if self.name == "Q8_0":
|
||||
return "Q8_0 all layers"
|
||||
# For basic types and uniform K-quants, use the actual name
|
||||
if self.name in {"Q4_0", "Q5_0", "Q6_0", "Q8_0", "Q6_K", "Q8_K"}:
|
||||
return f"{self.name} all layers"
|
||||
return f"{layers['embed']} all layers"
|
||||
|
||||
# Build component groups
|
||||
|
|
23
helpers/quantisation/__init__.py
Normal file
|
@ -0,0 +1,23 @@
|
|||
"""Quantisation orchestration and workflow management.
|
||||
|
||||
Provides high-level orchestration of the quantisation workflow,
|
||||
including execution, progress tracking, and profile management.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from helpers.quantisation.engine import QuantisationEngine
|
||||
from helpers.quantisation.executor import QuantisationExecutor
|
||||
from helpers.quantisation.model_manager import ModelManager
|
||||
from helpers.quantisation.orchestrator import QuantisationOrchestrator
|
||||
from helpers.quantisation.profile_manager import ProfileManager
|
||||
from helpers.quantisation.progress import ProgressReporter
|
||||
|
||||
__all__ = [
|
||||
"ModelManager",
|
||||
"ProfileManager",
|
||||
"ProgressReporter",
|
||||
"QuantisationEngine",
|
||||
"QuantisationExecutor",
|
||||
"QuantisationOrchestrator",
|
||||
]
|
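For orientation, downstream code would normally pull the orchestrator from this package rather than wiring the helpers individually:

# The orchestrator is the usual entry point; the other exports are its collaborators.
from helpers.quantisation import QuantisationOrchestrator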
141
helpers/quantisation/engine.py
Normal file
|
@ -0,0 +1,141 @@
|
|||
"""Quantisation engine for model processing.
|
||||
|
||||
Handles the actual quantisation process with configurable methods,
|
||||
supporting multiple quantisation backends and fallback strategies.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import traceback
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from helpers.filesystem import FilesystemService
|
||||
from helpers.ggml import GGMLQuantiser
|
||||
from helpers.llama_cpp import QuantisationExecutor
|
||||
from helpers.logger import logger
|
||||
from helpers.models.quantisation import QuantisationResult, QuantisationType
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pathlib import Path
|
||||
|
||||
from helpers.models.quantisation import (
|
||||
QuantisationContext,
|
||||
)
|
||||
|
||||
|
||||
class QuantisationEngine:
|
||||
"""Handles the actual quantisation process with configurable methods.
|
||||
|
||||
Provides flexible quantisation execution supporting multiple tensor
|
||||
precision configurations, importance matrices, and fallback strategies.
|
||||
Uses direct llama.cpp binary execution with proper tensor overrides.
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
"""Initialise quantisation engine."""
|
||||
self.fs = FilesystemService()
|
||||
self.executor = QuantisationExecutor()
|
||||
self.ggml_quantiser = GGMLQuantiser()
|
||||
|
||||
def quantise(self, context: QuantisationContext) -> QuantisationResult:
|
||||
"""Perform quantisation using the specified configuration.
|
||||
|
||||
Executes quantisation using direct llama.cpp binary with proper
|
||||
tensor override flags for L and XL variants. Falls back to GGML
|
||||
for basic types when architecture is unsupported. Processes the
|
||||
quantisation context containing all required parameters and settings.
|
||||
|
||||
Returns:
|
||||
QuantisationResult with success status and file information.
|
||||
"""
|
||||
logger.info(
|
||||
f"⚙️ Creating {context.config.name} quantisation ({context.config.description})..."
|
||||
)
|
||||
|
||||
output_path = context.get_output_path()
|
||||
|
||||
# Check input file exists and is readable
|
||||
if not context.f16_model_path.exists():
|
||||
error_msg = f"Input model file does not exist: {context.f16_model_path}"
|
||||
logger.error(f"❌ {error_msg}")
|
||||
return QuantisationResult(
|
||||
quantisation_type=QuantisationType(context.config.name),
|
||||
success=False,
|
||||
error_message=error_msg,
|
||||
)
|
||||
|
||||
logger.info(f"🎯 Attempting {context.config.name} quantisation...")
|
||||
logger.info(f"📝 Source: {context.f16_model_path}")
|
||||
logger.info(f"📝 Target: {output_path}")
|
||||
|
||||
# Determine if this is a basic type that can use GGML
|
||||
basic_types = ["Q4_0", "Q5_0", "Q6_0", "Q8_0"]
|
||||
is_basic_type = context.config.name in basic_types
|
||||
|
||||
try:
|
||||
# Try llama.cpp first for all types
|
||||
logger.info("🔧 Using llama.cpp binary for quantisation...")
|
||||
|
||||
success = self.executor.execute_quantisation(
|
||||
context.f16_model_path, output_path, context.config, context.imatrix_path
|
||||
)
|
||||
|
||||
if success:
|
||||
return self._create_success_result(context.config.name, output_path, "llama.cpp")
|
||||
|
||||
# Check if this was an architecture error and we can use GGML fallback
|
||||
if (
|
||||
hasattr(self.executor, "last_error")
|
||||
and self.executor.last_error == "unsupported_architecture"
|
||||
and is_basic_type
|
||||
):
|
||||
logger.info("🔄 Architecture unsupported - using GGML implementation...")
|
||||
|
||||
success = self.ggml_quantiser.try_alternative_quantisation(
|
||||
context.f16_model_path, output_path, context.config.name
|
||||
)
|
||||
|
||||
if success:
|
||||
return self._create_success_result(
|
||||
context.config.name, output_path, "GGML numpy"
|
||||
)
|
||||
|
||||
logger.error(f"❌ {context.config.name} quantisation failed")
|
||||
return QuantisationResult(
|
||||
quantisation_type=QuantisationType(context.config.name),
|
||||
success=False,
|
||||
error_message="Quantisation failed via Python API",
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"❌ Exception during {context.config.name} quantisation: {e}")
|
||||
logger.error("Exception traceback:")
|
||||
for line in traceback.format_exc().splitlines():
|
||||
logger.error(f" {line}")
|
||||
|
||||
return QuantisationResult(
|
||||
quantisation_type=QuantisationType(context.config.name),
|
||||
success=False,
|
||||
error_message=f"Exception during quantisation: {e!s}",
|
||||
)
|
||||
|
||||
def _create_success_result(
|
||||
self, quant_type: str, output_path: Path, method_used: str
|
||||
) -> QuantisationResult:
|
||||
"""Create successful quantisation result with file metadata.
|
||||
|
||||
Constructs a successful quantisation result containing file size
|
||||
information and method details. Uses the quantisation type, output
|
||||
path, and method information to generate comprehensive result metadata.
|
||||
|
||||
Returns:
|
||||
QuantisationResult with file path and size information.
|
||||
"""
|
||||
file_size = self.fs.get_file_size(output_path)
|
||||
return QuantisationResult(
|
||||
quantisation_type=QuantisationType(quant_type),
|
||||
success=True,
|
||||
file_path=output_path,
|
||||
file_size=file_size,
|
||||
method_used=method_used,
|
||||
)
|
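A minimal sketch of driving the engine directly (the executor below is what normally does this); the paths are placeholders, model_source is assumed to have been resolved earlier, and the QuantisationContext fields mirror its use in the executor rather than a documented constructor.

from pathlib import Path

from helpers.config.quantisation_configs import QUANTISATION_CONFIGS
from helpers.models.quantisation import QuantisationContext, QuantisationType
from helpers.quantisation.engine import QuantisationEngine

engine = QuantisationEngine()
context = QuantisationContext(
    f16_model_path=Path("models/example/example-f16.gguf"),  # hypothetical path
    model_source=model_source,        # assumed: ModelSource resolved earlier
    config=QUANTISATION_CONFIGS[QuantisationType.Q4_K_M],
    models_dir=Path("models"),
    imatrix_path=None,                # or a generated imatrix.dat
)
result = engine.quantise(context)
print(result.success, result.file_size, result.method_used)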
457
helpers/quantisation/executor.py
Normal file
|
@ -0,0 +1,457 @@
|
|||
"""Quantisation execution management.
|
||||
|
||||
Handles the execution of quantisation operations including parallel
|
||||
uploads, status tracking, and error handling.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import gc
|
||||
import traceback
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
from helpers.config.quantisation_configs import QUANTISATION_CONFIGS
|
||||
from helpers.logger import logger
|
||||
from helpers.models.quantisation import (
|
||||
QuantisationContext,
|
||||
QuantisationResult,
|
||||
QuantisationType,
|
||||
)
|
||||
from helpers.quantisation.progress import ProgressReporter
|
||||
from helpers.utils.rate_limiter import ReadmeRateLimiter
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pathlib import Path
|
||||
|
||||
from helpers.filesystem import FileCleanup
|
||||
from helpers.huggingface import HuggingFaceUploader
|
||||
from helpers.models.quantisation import ModelSource
|
||||
from helpers.quantisation.engine import QuantisationEngine
|
||||
from helpers.readme import ReadmeGenerator
|
||||
|
||||
|
||||
class QuantisationExecutor:
|
||||
"""Executes quantisation operations with parallel upload support.
|
||||
|
||||
Manages the execution of multiple quantisations with background
|
||||
uploads, status tracking, and proper error handling.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
quantisation_engine: QuantisationEngine,
|
||||
uploader: HuggingFaceUploader,
|
||||
readme_generator: ReadmeGenerator,
|
||||
file_cleanup: FileCleanup,
|
||||
no_upload: bool = False,
|
||||
) -> None:
|
||||
"""Initialise quantisation executor.
|
||||
|
||||
Sets up the quantisation executor with all required service dependencies
|
||||
for performing quantisations, uploading results, generating documentation,
|
||||
and cleaning up temporary files. Configures upload behaviour based on settings.
|
||||
"""
|
||||
self.quantisation_engine = quantisation_engine
|
||||
self.uploader = uploader
|
||||
self.readme_generator = readme_generator
|
||||
self.file_cleanup = file_cleanup
|
||||
self.no_upload = no_upload
|
||||
self.readme_limiter = ReadmeRateLimiter(cooldown_seconds=30.0)
|
||||
self.progress_reporter = ProgressReporter()
|
||||
|
||||
def execute_quantisations(
|
||||
self,
|
||||
model_source: ModelSource,
|
||||
f16_model_path: Path,
|
||||
imatrix_path: Path | None,
|
||||
output_repo: str,
|
||||
quantisation_types: list[QuantisationType],
|
||||
models_dir: Path,
|
||||
) -> dict[QuantisationType, QuantisationResult]:
|
||||
"""Execute all quantisation types with parallel uploads.
|
||||
|
||||
Orchestrates the complete quantisation workflow including F16 processing,
|
||||
multiple quantisation type execution, parallel upload management, and
|
||||
README generation. Handles all aspects of the quantisation pipeline
|
||||
from initial setup through final documentation.
|
||||
|
||||
Returns:
|
||||
Dictionary of quantisation results by type.
|
||||
"""
|
||||
results: dict[QuantisationType, QuantisationResult] = {}
|
||||
|
||||
# Track F16 in results if we converted from SafeTensors
|
||||
if not model_source.is_gguf_repo:
|
||||
results[QuantisationType.F16] = self._create_f16_result(f16_model_path)
|
||||
|
||||
# Process with parallel uploads
|
||||
upload_futures: list[Any] = []
|
||||
|
||||
with ThreadPoolExecutor(max_workers=2, thread_name_prefix="upload") as upload_executor:
|
||||
# Start F16 upload if applicable
|
||||
if (
|
||||
not model_source.is_gguf_repo
|
||||
and not self.no_upload
|
||||
and QuantisationType.F16 in results
|
||||
):
|
||||
self._start_f16_upload(
|
||||
results,
|
||||
model_source,
|
||||
output_repo,
|
||||
f16_model_path,
|
||||
upload_executor,
|
||||
upload_futures,
|
||||
)
|
||||
|
||||
# Process each quantisation
|
||||
for i, quant_type in enumerate(quantisation_types, 1):
|
||||
# Skip if already marked as failed
|
||||
if quant_type in results and results[quant_type].status == "failed":
|
||||
logger.info(
|
||||
f"Skipping {quant_type.value} - {results[quant_type].error_message}"
|
||||
)
|
||||
continue
|
||||
|
||||
self.progress_reporter.print_quantisation_start(
|
||||
i, len(quantisation_types), quant_type.value
|
||||
)
|
||||
|
||||
try:
|
||||
result = self._process_single_quantisation(
|
||||
quant_type,
|
||||
model_source,
|
||||
f16_model_path,
|
||||
imatrix_path,
|
||||
output_repo,
|
||||
results,
|
||||
models_dir,
|
||||
upload_executor,
|
||||
upload_futures,
|
||||
)
|
||||
results[quant_type] = result
|
||||
|
||||
# Force cleanup between quantisations
|
||||
gc.collect()
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"❌ Critical error processing {quant_type.value}: {e}")
|
||||
logger.error("Exception traceback:")
|
||||
for line in traceback.format_exc().splitlines():
|
||||
logger.error(f" {line}")
|
||||
|
||||
results[quant_type] = QuantisationResult(
|
||||
quantisation_type=quant_type,
|
||||
success=False,
|
||||
status="failed",
|
||||
error_message=str(e),
|
||||
)
|
||||
|
||||
# Force cleanup after error
|
||||
gc.collect()
|
||||
|
||||
# Wait for all uploads to complete
|
||||
self._wait_for_uploads(upload_futures)
|
||||
|
||||
# Final README update
|
||||
if not self.no_upload and upload_futures:
|
||||
self._final_readme_update(model_source, results, models_dir, output_repo)
|
||||
|
||||
return results
|
||||
|
||||
def _process_single_quantisation(
|
||||
self,
|
||||
quant_type: QuantisationType,
|
||||
model_source: ModelSource,
|
||||
f16_model_path: Path,
|
||||
imatrix_path: Path | None,
|
||||
output_repo: str,
|
||||
results: dict[QuantisationType, QuantisationResult],
|
||||
models_dir: Path,
|
||||
upload_executor: ThreadPoolExecutor,
|
||||
upload_futures: list,
|
||||
) -> QuantisationResult:
|
||||
"""Process a single quantisation type.
|
||||
|
||||
Returns:
|
||||
QuantisationResult for the processed type.
|
||||
"""
|
||||
try:
|
||||
logger.info(f"Starting {quant_type.value} quantisation...")
|
||||
config = QUANTISATION_CONFIGS[quant_type]
|
||||
|
||||
# Create initial result and update status
|
||||
result = QuantisationResult(quantisation_type=quant_type, success=False)
|
||||
result.status = "processing"
|
||||
results[quant_type] = result
|
||||
|
||||
self._update_readme_status(model_source, results, models_dir, output_repo)
|
||||
|
||||
# Perform quantisation
|
||||
context = QuantisationContext(
|
||||
f16_model_path=f16_model_path,
|
||||
model_source=model_source,
|
||||
config=config,
|
||||
models_dir=models_dir,
|
||||
imatrix_path=imatrix_path,
|
||||
)
|
||||
result = self.quantisation_engine.quantise(context)
|
||||
|
||||
# Handle result
|
||||
if result.success and result.file_path:
|
||||
self._start_parallel_upload(
|
||||
result,
|
||||
quant_type,
|
||||
output_repo,
|
||||
model_source,
|
||||
results,
|
||||
models_dir,
|
||||
upload_executor,
|
||||
upload_futures,
|
||||
)
|
||||
else:
|
||||
result.status = "failed"
|
||||
self._update_readme_status(model_source, results, models_dir, output_repo)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing {quant_type.value}: {e}")
|
||||
result = QuantisationResult(quantisation_type=quant_type, success=False)
|
||||
result.status = "failed"
|
||||
result.error_message = str(e)
|
||||
|
||||
try:
|
||||
self._update_readme_status(model_source, results, models_dir, output_repo)
|
||||
except Exception as readme_error:
|
||||
logger.error(f"Failed to update README after error: {readme_error}")
|
||||
|
||||
return result
|
||||
|
||||
def _start_parallel_upload(
|
||||
self,
|
||||
result: QuantisationResult,
|
||||
quant_type: QuantisationType,
|
||||
output_repo: str,
|
||||
model_source: ModelSource,
|
||||
results: dict[QuantisationType, QuantisationResult],
|
||||
models_dir: Path,
|
||||
upload_executor: ThreadPoolExecutor,
|
||||
upload_futures: list,
|
||||
) -> None:
|
||||
"""Start parallel upload of quantisation result."""
|
||||
if self.no_upload or not result.file_path:
|
||||
return
|
||||
|
||||
quant_str = getattr(result.quantisation_type, "value", result.quantisation_type)
|
||||
logger.info(f"Starting parallel upload of {quant_str}...")
|
||||
|
||||
upload_future = upload_executor.submit(
|
||||
self._upload_and_cleanup,
|
||||
output_repo,
|
||||
result.file_path,
|
||||
quant_type,
|
||||
model_source,
|
||||
results,
|
||||
models_dir,
|
||||
)
|
||||
upload_futures.append(upload_future)
|
||||
|
||||
result.file_path = None # Mark as being uploaded
|
||||
result.status = "uploading"
|
||||
self._update_readme_status(model_source, results, models_dir, output_repo)
|
||||
|
||||
def _upload_and_cleanup(
|
||||
self,
|
||||
output_repo: str,
|
||||
file_path: Path,
|
||||
quant_type: QuantisationType,
|
||||
model_source: ModelSource,
|
||||
results: dict[QuantisationType, QuantisationResult],
|
||||
models_dir: Path,
|
||||
) -> None:
|
||||
"""Upload file and clean up (runs in background thread)."""
|
||||
try:
|
||||
logger.info(f"[PARALLEL] Starting upload of {quant_type.value} ({file_path.name})")
|
||||
self.uploader.upload_model_file(output_repo, file_path)
|
||||
logger.info(f"[PARALLEL] Upload of {quant_type.value} completed successfully")
|
||||
|
||||
self.file_cleanup.cleanup_quantisation_file(file_path)
|
||||
|
||||
results[quant_type].status = "completed"
|
||||
updated_readme_path = self.readme_generator.generate(
|
||||
model_source, results, models_dir, output_repo
|
||||
)
|
||||
self.uploader.upload_readme(output_repo, updated_readme_path)
|
||||
|
||||
logger.info(f"[PARALLEL] {quant_type} upload and cleanup complete")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"[PARALLEL] Failed to upload {quant_type}: {e}")
|
||||
results[quant_type].status = "failed"
|
||||
results[quant_type].error_message = str(e)
|
||||
|
||||
try:
|
||||
updated_readme_path = self.readme_generator.generate(
|
||||
model_source, results, models_dir, output_repo
|
||||
)
|
||||
self.uploader.upload_readme(output_repo, updated_readme_path)
|
||||
except Exception as readme_error:
|
||||
logger.error(
|
||||
f"[PARALLEL] Failed to update README after upload error: {readme_error}"
|
||||
)
|
||||
|
||||
def _start_f16_upload(
|
||||
self,
|
||||
results: dict[QuantisationType, QuantisationResult],
|
||||
model_source: ModelSource,
|
||||
output_repo: str,
|
||||
f16_model_path: Path,
|
||||
upload_executor: ThreadPoolExecutor,
|
||||
upload_futures: list,
|
||||
) -> None:
|
||||
"""Start F16 upload in background."""
|
||||
f16_result = results[QuantisationType.F16]
|
||||
if f16_result.file_path and f16_result.file_path.exists():
|
||||
logger.info("Starting parallel upload of F16 GGUF...")
|
||||
f16_result.status = "uploading"
|
||||
self._update_readme_status(
|
||||
model_source, results, f16_model_path.parent.parent, output_repo
|
||||
)
|
||||
|
||||
upload_future = upload_executor.submit(
|
||||
self._upload_f16_and_cleanup,
|
||||
output_repo,
|
||||
f16_result.file_path,
|
||||
model_source,
|
||||
results,
|
||||
f16_model_path.parent.parent,
|
||||
)
|
||||
upload_futures.append(upload_future)
|
||||
|
||||
def _upload_f16_and_cleanup(
|
||||
self,
|
||||
output_repo: str,
|
||||
file_path: Path,
|
||||
model_source: ModelSource,
|
||||
results: dict[QuantisationType, QuantisationResult],
|
||||
models_dir: Path,
|
||||
) -> None:
|
||||
"""Upload F16 file and update status (runs in background thread)."""
|
||||
try:
|
||||
logger.info(f"[PARALLEL] Starting upload of F16 GGUF ({file_path.name})")
|
||||
self.uploader.upload_model_file(output_repo, file_path)
|
||||
logger.info("[PARALLEL] Upload of F16 GGUF completed successfully")
|
||||
|
||||
# Don't delete F16 yet - still needed for quantisations
|
||||
|
||||
results[QuantisationType.F16].status = "completed"
|
||||
updated_readme_path = self.readme_generator.generate(
|
||||
model_source, results, models_dir, output_repo
|
||||
)
|
||||
self.uploader.upload_readme(output_repo, updated_readme_path)
|
||||
|
||||
logger.info("[PARALLEL] F16 upload complete")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"[PARALLEL] Failed to upload F16: {e}")
|
||||
results[QuantisationType.F16].status = "failed"
|
||||
results[QuantisationType.F16].error_message = str(e)
|
||||
|
||||
try:
|
||||
updated_readme_path = self.readme_generator.generate(
|
||||
model_source, results, models_dir, output_repo
|
||||
)
|
||||
self.uploader.upload_readme(output_repo, updated_readme_path)
|
||||
except Exception as readme_error:
|
||||
logger.error(
|
||||
f"[PARALLEL] Failed to update README after F16 upload error: {readme_error}"
|
||||
)
|
||||
|
||||
def _create_f16_result(self, f16_model_path: Path) -> QuantisationResult:
|
||||
"""Create a result object for F16 tracking.
|
||||
|
||||
Returns:
|
||||
QuantisationResult object for F16 tracking.
|
||||
"""
|
||||
f16_size = "-"
|
||||
if f16_model_path.exists():
|
||||
size_bytes = f16_model_path.stat().st_size
|
||||
size_gb = size_bytes / (1024**3)
|
||||
f16_size = f"{size_gb:.1f}GB"
|
||||
|
||||
# Create a simple result object for F16 tracking
|
||||
return type(
|
||||
"F16Result",
|
||||
(),
|
||||
{
|
||||
"quantisation_type": "F16",
|
||||
"success": True,
|
||||
"status": "planned",
|
||||
"file_path": f16_model_path,
|
||||
"file_size": f16_size,
|
||||
},
|
||||
)()
|
||||
|
||||
def _wait_for_uploads(self, upload_futures: list) -> None:
|
||||
"""Wait for all parallel uploads to complete."""
|
||||
if not upload_futures:
|
||||
return
|
||||
|
||||
logger.info(f"Waiting for {len(upload_futures)} uploads to complete...")
|
||||
completed = 0
|
||||
failed = 0
|
||||
|
||||
for future in upload_futures:
|
||||
try:
|
||||
future.result(timeout=300) # 5 minute timeout per upload
|
||||
completed += 1
|
||||
logger.info(f"Upload progress: {completed}/{len(upload_futures)} completed")
|
||||
except Exception as e:
|
||||
failed += 1
|
||||
logger.warning(f"Upload error ({completed + failed}/{len(upload_futures)}): {e}")
|
||||
|
||||
self.progress_reporter.print_upload_summary(completed, failed)
|
||||
|
||||
def _update_readme_status(
|
||||
self,
|
||||
model_source: ModelSource,
|
||||
results: dict[QuantisationType, QuantisationResult],
|
||||
models_dir: Path,
|
||||
output_repo: str,
|
||||
) -> None:
|
||||
"""Update README with current quantisation status using rate limiting."""
|
||||
if not self.no_upload:
|
||||
# Use rate limiter to batch updates
|
||||
self.readme_limiter.request_update(
|
||||
self._do_readme_update,
|
||||
model_source,
|
||||
results,
|
||||
models_dir,
|
||||
output_repo,
|
||||
)
|
||||
|
||||
def _do_readme_update(
|
||||
self,
|
||||
model_source: ModelSource,
|
||||
results: dict[QuantisationType, QuantisationResult],
|
||||
models_dir: Path,
|
||||
output_repo: str,
|
||||
) -> None:
|
||||
"""Actually perform the README update (called by rate limiter)."""
|
||||
updated_readme_path = self.readme_generator.generate(
|
||||
model_source, results, models_dir, output_repo
|
||||
)
|
||||
self.uploader.upload_readme(output_repo, updated_readme_path)
|
||||
|
||||
def _final_readme_update(
|
||||
self,
|
||||
model_source: ModelSource,
|
||||
results: dict[QuantisationType, QuantisationResult],
|
||||
models_dir: Path,
|
||||
output_repo: str,
|
||||
) -> None:
|
||||
"""Perform final README update after all operations."""
|
||||
logger.info("Updating README with final status...")
|
||||
final_readme = self.readme_generator.generate(
|
||||
model_source, results, models_dir, output_repo
|
||||
)
|
||||
self.uploader.upload_readme(output_repo, final_readme)
|
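For orientation, a sketch of wiring the executor; the collaborator constructors are assumptions (their signatures are not part of this diff), and the repository name and paths are placeholders.

from pathlib import Path

from helpers.filesystem import FileCleanup
from helpers.huggingface import HuggingFaceUploader
from helpers.models.quantisation import QuantisationType
from helpers.quantisation.engine import QuantisationEngine
from helpers.quantisation.executor import QuantisationExecutor
from helpers.readme import ReadmeGenerator

executor = QuantisationExecutor(
    quantisation_engine=QuantisationEngine(),
    uploader=HuggingFaceUploader(),      # assumed no-arg constructor
    readme_generator=ReadmeGenerator(),  # assumed no-arg constructor
    file_cleanup=FileCleanup(),          # assumed no-arg constructor
    no_upload=True,                      # dry run: skip HuggingFace uploads
)
results = executor.execute_quantisations(
    model_source=model_source,           # assumed: ModelSource resolved earlier
    f16_model_path=Path("models/example/example-f16.gguf"),
    imatrix_path=None,
    output_repo="user/example-GGUF",     # hypothetical target repository
    quantisation_types=[QuantisationType.Q4_K_M, QuantisationType.Q8_0],
    models_dir=Path("models"),
)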
422
helpers/quantisation/model_manager.py
Normal file
|
@ -0,0 +1,422 @@
|
|||
"""Model acquisition and preparation management.
|
||||
|
||||
Handles model downloading from HuggingFace and preparation for quantisation,
|
||||
including format detection and conversion.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import shutil
|
||||
import subprocess
|
||||
import traceback
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from helpers.filesystem import FilesystemService
|
||||
from helpers.gguf import GGUFConverter
|
||||
from helpers.logger import logger
|
||||
from helpers.models.quantisation import ModelSource
|
||||
from helpers.utils.config_parser import ConfigParser
|
||||
from helpers.utils.tensor_mapping import TensorMapper
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
class ModelManager:
|
||||
"""Handles model downloading and preparation for quantisation.
|
||||
|
||||
Manages both GGUF repository downloads and HuggingFace model conversions,
|
||||
providing unified interface for model acquisition and preparation.
|
||||
"""
|
||||
|
||||
def __init__(self, models_dir: Path) -> None:
|
||||
"""Initialise model manager with storage configuration.
|
||||
|
||||
Creates a new model manager instance that will handle model downloading,
|
||||
format detection, and preparation for quantisation workflows using the
|
||||
specified directory as the base storage location.
|
||||
"""
|
||||
self.models_dir = models_dir
|
||||
self.fs = FilesystemService()
|
||||
|
||||
def prepare_model(self, model_source: ModelSource) -> Path:
|
||||
"""Prepare model for quantisation and return F16 model path.
|
||||
|
||||
Handles both GGUF repository downloads and regular HuggingFace model
|
||||
conversion workflows with automatic format detection. Processes the
|
||||
provided model source information to determine the optimal acquisition
|
||||
strategy and ensures the model is in F16 GGUF format.
|
||||
|
||||
Returns:
|
||||
Path to F16 GGUF model ready for quantisation.
|
||||
"""
|
||||
model_dir = self.models_dir / model_source.model_name
|
||||
|
||||
if model_source.is_gguf_repo:
|
||||
return self._handle_gguf_repo(model_source, model_dir)
|
||||
return self._handle_regular_repo(model_source, model_dir)
|
||||
|
||||
def _handle_gguf_repo(self, model_source: ModelSource, model_dir: Path) -> Path:
|
||||
"""Handle GGUF repository download with pattern matching.
|
||||
|
||||
Downloads GGUF files matching specified patterns, prioritising
|
||||
multi-part files and F16 variants. Uses the model source information
|
||||
and target directory to efficiently locate and download appropriate
|
||||
GGUF files from HuggingFace repositories.
|
||||
|
||||
Returns:
|
||||
Path to downloaded or existing GGUF file.
|
||||
"""
|
||||
logger.info(f"⬇️ Downloading GGUF file from repository: {model_source.source_model}")
|
||||
logger.info(f"🔍 Looking for file pattern: *{model_source.gguf_file_pattern}*")
|
||||
|
||||
f16_model = model_dir / f"{model_source.original_author}-{model_source.model_name}-f16.gguf"
|
||||
|
||||
if f16_model.exists():
|
||||
logger.info(f"✅ Found existing F16 file: {f16_model.name}")
|
||||
return f16_model
|
||||
|
||||
# Check for existing GGUF files
|
||||
model_dir.mkdir(parents=True, exist_ok=True)
|
||||
existing_gguf = self.fs.find_gguf_files(model_dir)
|
||||
|
||||
if existing_gguf:
|
||||
logger.info(f"✅ Found existing GGUF file: {existing_gguf[0].name}")
|
||||
return existing_gguf[0]
|
||||
|
||||
# Download with patterns
|
||||
downloaded_file = self._download_gguf_with_patterns(
|
||||
model_source.source_model, model_source.gguf_file_pattern, model_dir
|
||||
)
|
||||
|
||||
if downloaded_file:
|
||||
# Handle multi-part files
|
||||
if "00001-of-" in downloaded_file.name:
|
||||
return downloaded_file
|
||||
if "-00002-of-" in downloaded_file.name or "-00003-of-" in downloaded_file.name:
|
||||
base_name = downloaded_file.name.replace("-00002-of-", "-00001-of-").replace(
|
||||
"-00003-of-", "-00001-of-"
|
||||
)
|
||||
first_part = downloaded_file.parent / base_name
|
||||
if first_part.exists():
|
||||
logger.info(f"🔄 Using first part: {first_part.name}")
|
||||
return first_part
|
||||
|
||||
# Rename single file to standard name
|
||||
downloaded_file.rename(f16_model)
|
||||
return f16_model
|
||||
|
||||
# Fallback to regular conversion
|
||||
logger.info("💡 Falling back to downloading full repository and converting...")
|
||||
return self._handle_regular_repo(
|
||||
ModelSource(**{**model_source.dict(), "is_gguf_repo": False}),
|
||||
model_dir,
|
||||
)
|
||||
|
||||
def _download_gguf_with_patterns(
|
||||
self, source_model: str, pattern: str | None, model_dir: Path
|
||||
) -> Path | None:
|
||||
"""Download GGUF file using various pattern strategies.
|
||||
|
||||
Tries multiple pattern variations to find and download appropriate
|
||||
GGUF files, handling timeouts and temporary directories. Uses the
|
||||
HuggingFace model identifier with an optional pattern to search for
|
||||
specific files and downloads them to the target directory.
|
||||
|
||||
Returns:
|
||||
Path to downloaded file, or None if all patterns fail.
|
||||
"""
|
||||
if pattern:
|
||||
patterns = [
|
||||
f"*{pattern}*",
|
||||
f"*{pattern.lower()}*",
|
||||
f"*{pattern.upper()}*",
|
||||
"*f16*",
|
||||
"*F16*",
|
||||
"*fp16*",
|
||||
]
|
||||
else:
|
||||
patterns = ["*f16*", "*F16*", "*fp16*"]
|
||||
|
||||
temp_dir = model_dir / "gguf_temp"
|
||||
|
||||
for search_pattern in patterns:
|
||||
logger.info(f"🔍 Trying pattern: {search_pattern}")
|
||||
temp_dir.mkdir(exist_ok=True)
|
||||
|
||||
try:
|
||||
logger.debug(
|
||||
f"DEBUG: Running huggingface-cli download for pattern {search_pattern}"
|
||||
)
|
||||
result = subprocess.run(
|
||||
[
|
||||
"timeout",
|
||||
"300",
|
||||
"huggingface-cli",
|
||||
"download",
|
||||
source_model,
|
||||
"--include",
|
||||
search_pattern,
|
||||
"--local-dir",
|
||||
str(temp_dir),
|
||||
],
|
||||
check=True,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
logger.debug(
|
||||
f"DEBUG: Download command completed with return code {result.returncode}"
|
||||
)
|
||||
|
||||
# Find downloaded GGUF files
|
||||
gguf_files = self.fs.find_gguf_files(temp_dir, pattern)
|
||||
if gguf_files:
|
||||
found_file = gguf_files[0]
|
||||
logger.info(f"✅ Found GGUF file: {found_file.name}")
|
||||
|
||||
# Move to parent directory
|
||||
final_path = model_dir / found_file.name
|
||||
shutil.move(str(found_file), str(final_path))
|
||||
shutil.rmtree(temp_dir)
|
||||
return final_path
|
||||
|
||||
except subprocess.CalledProcessError as e:
|
||||
logger.debug(
|
||||
f"DEBUG: Pattern {search_pattern} failed with return code {e.returncode}"
|
||||
)
|
||||
if e.stderr:
|
||||
logger.debug(f"DEBUG: stderr: {e.stderr}")
|
||||
if e.stdout:
|
||||
logger.debug(f"DEBUG: stdout: {e.stdout}")
|
||||
logger.info(f"⚠️ Pattern {search_pattern} failed or timed out")
|
||||
continue
|
||||
except Exception as e:
|
||||
logger.error(f"❌ Unexpected error during download: {e}")
|
||||
logger.error("Exception traceback:")
|
||||
for line in traceback.format_exc().splitlines():
|
||||
logger.error(f" {line}")
|
||||
continue
|
||||
finally:
|
||||
if temp_dir.exists():
|
||||
shutil.rmtree(temp_dir, ignore_errors=True)
|
||||
|
||||
return None
|
||||
|
||||
def _handle_regular_repo(
|
||||
self,
|
||||
model_source: ModelSource,
|
||||
model_dir: Path,
|
||||
) -> Path:
|
||||
"""Handle regular HuggingFace repository conversion.
|
||||
|
||||
Downloads full model repository and converts to F16 GGUF format
|
||||
using our native Python-based GGUFConverter for SafeTensors models.
|
||||
Processes the model source information and uses the local directory
|
||||
for storage during the download and conversion workflow.
|
||||
|
||||
Returns:
|
||||
Path to converted F16 GGUF model.
|
||||
"""
|
||||
logger.info(f"⬇️ Downloading source model: {model_source.source_model}")
|
||||
|
||||
# Download model if needed
|
||||
if not model_dir.exists():
|
||||
self._download_repository(model_source.source_model, model_dir)
|
||||
else:
|
||||
logger.info("✅ Model already downloaded")
|
||||
|
||||
# Convert to GGUF
|
||||
return self._convert_to_gguf(model_source, model_dir)
|
||||
|
||||
def _setup_download_directories(self, model_dir: Path) -> None:
|
||||
"""Set up directories for model download.
|
||||
|
||||
Creates the necessary directory structure for model downloads,
|
||||
including the base model directory and HuggingFace metadata
|
||||
directory to ensure proper organisation of downloaded assets.
|
||||
"""
|
||||
model_dir.mkdir(parents=True, exist_ok=True)
|
||||
huggingface_dir = model_dir / ".huggingface"
|
||||
huggingface_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
def _create_download_process(self, source_model: str, model_dir: Path) -> subprocess.Popen:
|
||||
"""Create subprocess for downloading repository.
|
||||
|
||||
Initiates a HuggingFace CLI download process for the specified model
|
||||
identifier, configuring it to download to the local directory whilst
|
||||
excluding existing GGUF files to avoid conflicts.
|
||||
|
||||
Returns:
|
||||
Subprocess for downloading.
|
||||
"""
|
||||
return subprocess.Popen(
|
||||
[
|
||||
"huggingface-cli",
|
||||
"download",
|
||||
source_model,
|
||||
"--local-dir",
|
||||
str(model_dir),
|
||||
"--exclude",
|
||||
"*.gguf",
|
||||
],
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.STDOUT,
|
||||
text=True,
|
||||
bufsize=1, # Line buffered
|
||||
universal_newlines=True,
|
||||
)
|
||||
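The subprocess above is equivalent to invoking the CLI directly; for a hypothetical repository the call would look like this (the exclude keeps any pre-existing GGUF files from being re-downloaded):

import subprocess

subprocess.run(
    [
        "huggingface-cli", "download", "org/example-model",  # hypothetical repo
        "--local-dir", "models/example-model",
        "--exclude", "*.gguf",
    ],
    check=True,
)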
|
||||
def _stream_download_output(self, process: subprocess.Popen) -> None:
|
||||
"""Stream download process output with appropriate logging levels.
|
||||
|
||||
Monitors the download subprocess output and routes progress information
|
||||
to appropriate log levels, providing real-time feedback on download
|
||||
progress whilst filtering debug information appropriately.
|
||||
"""
|
||||
if process.stdout:
|
||||
for line in process.stdout:
|
||||
# Log download progress lines
|
||||
if line.strip():
|
||||
# Check if it's a progress line (contains %)
|
||||
if "%" in line or "Downloading" in line or "Fetching" in line:
|
||||
# Use info level for progress lines
|
||||
logger.info(f" {line.strip()}")
|
||||
else:
|
||||
# Use debug for other output
|
||||
logger.debug(f" {line.strip()}")
|
||||
|
||||
def _handle_download_errors(self, source_model: str, e: Exception) -> None:
|
||||
"""Handle download errors with detailed logging.
|
||||
|
||||
Processes download exceptions for the specified model, providing
|
||||
comprehensive error logging including return codes, stderr, and
|
||||
stdout information to aid in debugging download failures.
|
||||
|
||||
Raises:
|
||||
TypeError: Always raised with appropriate error message.
|
||||
"""
|
||||
if isinstance(e, subprocess.CalledProcessError):
|
||||
logger.error(f"❌ Failed to download repository {source_model}")
|
||||
logger.error(f"Return code: {e.returncode}")
|
||||
if e.stderr:
|
||||
logger.error(f"stderr: {e.stderr}")
|
||||
if e.stdout:
|
||||
logger.error(f"stdout: {e.stdout}")
|
||||
msg = f"Repository download failed: {e}"
|
||||
raise TypeError(msg) from e
|
||||
logger.error(f"❌ Unexpected error during repository download: {e}")
|
||||
logger.error("Exception traceback:")
|
||||
for line in traceback.format_exc().splitlines():
|
||||
logger.error(f" {line}")
|
||||
msg = f"Repository download failed: {e}"
|
||||
raise TypeError(msg) from e
|
||||
|
||||
def _download_repository(self, source_model: str, model_dir: Path) -> None:
|
||||
"""Download HuggingFace repository.
|
||||
|
||||
Orchestrates the complete repository download workflow for the
|
||||
specified HuggingFace model, managing directory setup, process
|
||||
execution, and error handling to ensure robust model acquisition.
|
||||
|
||||
Raises:
|
||||
RuntimeError: If download fails.
|
||||
"""
|
||||
self._setup_download_directories(model_dir)
|
||||
|
||||
try:
|
||||
logger.info(f"⬇️ Downloading full repository: {source_model}")
|
||||
logger.info("📊 Progress will be shown below...")
|
||||
|
||||
process = self._create_download_process(source_model, model_dir)
|
||||
self._stream_download_output(process)
|
||||
|
||||
# Wait for process to complete
|
||||
return_code = process.wait()
|
||||
|
||||
if return_code != 0:
|
||||
msg = f"Repository download failed with return code {return_code}"
|
||||
raise RuntimeError(msg)
|
||||
|
||||
logger.info("✅ Repository download completed successfully")
|
||||
|
||||
except Exception as e:
|
||||
self._handle_download_errors(source_model, e)
|
||||
|
||||
def _convert_to_gguf(self, model_source: ModelSource, model_dir: Path) -> Path:
|
||||
"""Convert model to GGUF F16 format.
|
||||
|
||||
Converts SafeTensors models to GGUF F16 format using our native
|
||||
Python converter. Processes model source information and the
|
||||
directory containing downloaded model files, handling architecture
|
||||
detection and tensor mapping for optimal compatibility.
|
||||
|
||||
Returns:
|
||||
Path to F16 GGUF model.
|
||||
|
||||
Raises:
|
||||
RuntimeError: If conversion fails.
|
||||
"""
|
||||
logger.info("🔄 Converting to GGUF F16 format...")
|
||||
f16_model = model_dir / f"{model_source.original_author}-{model_source.model_name}-f16.gguf"
|
||||
|
||||
if f16_model.exists():
|
||||
logger.info("✅ F16 model already exists")
|
||||
return f16_model
|
||||
|
||||
# Check for SafeTensors files
|
||||
safetensor_files = list(model_dir.glob("*.safetensors"))
|
||||
if not safetensor_files:
|
||||
logger.error("❌ Model format not supported")
|
||||
logger.info("💡 This tool supports GGUF and SafeTensors formats")
|
||||
msg = "Model must be in GGUF or SafeTensors format"
|
||||
raise RuntimeError(msg)
|
||||
|
||||
logger.info("🐍 Using native Python GGUFConverter...")
|
||||
logger.info(f"✅ Found {len(safetensor_files)} SafeTensors files")
|
||||
|
||||
# Load model configuration
|
||||
config_parser = ConfigParser()
|
||||
model_config = config_parser.load_model_config(model_dir)
|
||||
|
||||
# Get architecture mapping
|
||||
arch_name = model_config.architectures[0] if model_config.architectures else "llama"
|
||||
arch = config_parser.get_architecture_mapping(arch_name)
|
||||
|
||||
if arch != arch_name:
|
||||
logger.info(f"📝 Architecture mapping: {arch_name} → {arch}")
|
||||
|
||||
# Check if architecture is supported by llama.cpp
|
||||
supported_archs = {
|
||||
"llama",
|
||||
"qwen2",
|
||||
"gemma",
|
||||
"phi3",
|
||||
"falcon",
|
||||
"gpt2",
|
||||
"gptj",
|
||||
"gptneox",
|
||||
"mpt",
|
||||
"baichuan",
|
||||
"stablelm",
|
||||
}
|
||||
|
||||
if arch not in supported_archs:
|
||||
logger.warning("=" * 70)
|
||||
logger.warning(f"⚠️ Architecture '{arch_name}' may not be supported by llama.cpp")
|
||||
logger.warning(f"⚠️ The GGUF will be created with architecture: '{arch}'")
|
||||
logger.warning("⚠️ Check if your inference software supports this architecture.")
|
||||
logger.warning("=" * 70)
|
||||
|
||||
# Convert using GGUFConverter
|
||||
tensor_mapper = TensorMapper()
|
||||
success = GGUFConverter.convert_safetensors(
|
||||
model_dir, f16_model, model_config, arch, tensor_mapper
|
||||
)
|
||||
|
||||
if not success:
|
||||
logger.error("❌ Native Python conversion failed")
|
||||
msg = "Failed to convert SafeTensors model to GGUF"
|
||||
raise RuntimeError(msg)
|
||||
|
||||
logger.info("✅ Native Python conversion successful")
|
||||
return f16_model
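For reference, the same conversion can be driven directly rather than via the model manager. A minimal sketch, assuming the helper interfaces used above; the import paths, directory and file names are illustrative:

```python
from pathlib import Path

from helpers.utils.config_parser import ConfigParser
from helpers.utils.tensor_mapping import TensorMapper  # import path assumed
from helpers.gguf import GGUFConverter                  # import path assumed

model_dir = Path("./models/example-model")                   # illustrative
f16_model = model_dir / "ExampleOrg-example-model-f16.gguf"  # illustrative

config_parser = ConfigParser()
model_config = config_parser.load_model_config(model_dir)
arch_name = model_config.architectures[0] if model_config.architectures else "llama"
arch = config_parser.get_architecture_mapping(arch_name)

# Returns True on success, exactly as checked in _convert_to_gguf above
ok = GGUFConverter.convert_safetensors(model_dir, f16_model, model_config, arch, TensorMapper())
```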
|
229
helpers/quantisation/orchestrator.py
Normal file
|
@ -0,0 +1,229 @@
|
|||
"""Main quantisation orchestrator.
|
||||
|
||||
Provides the high-level orchestration of the complete quantisation
|
||||
workflow, coordinating between various services and modules.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import signal
|
||||
import sys
|
||||
import traceback
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from helpers.filesystem import FileCleanup, WorkspaceManager
|
||||
from helpers.huggingface import HuggingFaceUploader
|
||||
from helpers.llama_cpp import IMatrixGenerator, IMatrixHandler
|
||||
from helpers.logger import logger
|
||||
from helpers.models.quantisation import QuantisationResult, QuantisationType
|
||||
from helpers.quantisation.engine import QuantisationEngine
|
||||
from helpers.quantisation.executor import QuantisationExecutor
|
||||
from helpers.quantisation.model_manager import ModelManager
|
||||
from helpers.quantisation.profile_manager import ProfileManager
|
||||
from helpers.quantisation.progress import ProgressReporter
|
||||
from helpers.readme import ReadmeGenerator
|
||||
from helpers.utils.rate_limiter import ReadmeRateLimiter
|
||||
from helpers.utils.tensor_mapping import URLParser
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from types import FrameType
|
||||
|
||||
from helpers.models.quantisation import ModelSource
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class QuantisationOrchestrator:
|
||||
"""Orchestrates the complete quantisation workflow.
|
||||
|
||||
Thin coordinator that delegates to specialised services for
|
||||
each aspect of the quantisation workflow.
|
||||
"""
|
||||
|
||||
work_dir: Path = field(default_factory=lambda: Path.cwd() / "quantisation_work")
|
||||
use_imatrix: bool = True
|
||||
no_upload: bool = False
|
||||
custom_profiles: list[str] | None = None
|
||||
|
||||
# Service dependencies
|
||||
url_parser: URLParser = field(default_factory=URLParser)
|
||||
workspace_manager: WorkspaceManager = field(init=False)
|
||||
model_manager: ModelManager = field(init=False)
|
||||
profile_manager: ProfileManager = field(default_factory=ProfileManager)
|
||||
progress_reporter: ProgressReporter = field(default_factory=ProgressReporter)
|
||||
quantisation_executor: QuantisationExecutor = field(init=False)
|
||||
imatrix_handler: IMatrixHandler = field(default_factory=IMatrixHandler)
|
||||
imatrix_generator: IMatrixGenerator = field(default_factory=IMatrixGenerator)
|
||||
readme_generator: ReadmeGenerator = field(default_factory=ReadmeGenerator)
|
||||
uploader: HuggingFaceUploader = field(default_factory=HuggingFaceUploader)
|
||||
file_cleanup: FileCleanup = field(default_factory=FileCleanup)
|
||||
readme_limiter: ReadmeRateLimiter = field(init=False)
|
||||
|
||||
def __post_init__(self) -> None:
|
||||
"""Initialise computed properties after dataclass construction."""
|
||||
self.workspace_manager = WorkspaceManager(self.work_dir)
|
||||
self.model_manager = ModelManager(self.workspace_manager.models_dir)
|
||||
self.readme_limiter = ReadmeRateLimiter(cooldown_seconds=30.0)
|
||||
|
||||
# Create executor with dependencies
|
||||
self.quantisation_executor = QuantisationExecutor(
|
||||
quantisation_engine=QuantisationEngine(),
|
||||
uploader=self.uploader,
|
||||
readme_generator=self.readme_generator,
|
||||
file_cleanup=self.file_cleanup,
|
||||
no_upload=self.no_upload,
|
||||
)
|
||||
|
||||
# Set up signal handlers
|
||||
self._setup_signal_handlers()
|
||||
|
||||
def _setup_signal_handlers(self) -> None:
|
||||
"""Set up signal handlers to catch unexpected exits."""
|
||||
|
||||
def signal_handler(signum: int, frame: FrameType | None) -> None:
|
||||
logger.error(f"❌ Received signal {signum} ({signal.Signals(signum).name})")
|
||||
logger.error("Stack trace at signal:")
|
||||
if frame:
|
||||
for line in traceback.format_stack(frame):
|
||||
logger.error(f" {line.strip()}")
|
||||
logger.error("Exiting due to signal")
|
||||
sys.exit(1)
|
||||
|
||||
# Handle common termination signals
|
||||
for sig in [signal.SIGINT, signal.SIGTERM]:
|
||||
signal.signal(sig, signal_handler)
|
||||
|
||||
def quantise(self, url: str) -> dict[QuantisationType, QuantisationResult]:
|
||||
"""Main quantisation workflow orchestrating model processing from URL to upload.
|
||||
|
||||
Coordinates the complete quantisation process from URL parsing through
|
||||
model downloading, quantisation execution, and upload to HuggingFace.
|
||||
Handles architecture compatibility and provides comprehensive error handling.
|
||||
|
||||
Returns:
|
||||
Dictionary of quantisation results by type.
|
||||
|
||||
Raises:
|
||||
KeyboardInterrupt: If the user interrupts the quantisation process.
|
||||
"""
|
||||
logger.info("Starting Bartowski quantisation process...")
|
||||
logger.debug(f"DEBUG: Input URL: {url}")
|
||||
logger.debug(f"DEBUG: Working directory: {self.work_dir}")
|
||||
logger.debug(f"DEBUG: Use imatrix: {self.use_imatrix}")
|
||||
logger.debug(f"DEBUG: No upload: {self.no_upload}")
|
||||
logger.debug(f"DEBUG: Custom profiles: {self.custom_profiles}")
|
||||
|
||||
try:
|
||||
# Setup and preparation
|
||||
model_source, f16_model_path, imatrix_path, output_repo = self._setup_environment(url)
|
||||
|
||||
# Create initial repository
|
||||
self._create_initial_repository(model_source, output_repo)
|
||||
|
||||
# Get quantisation types
|
||||
quantisation_types = self.profile_manager.get_quantisation_types(self.custom_profiles)
|
||||
|
||||
# Filter by architecture if needed
|
||||
supported_types, unsupported_types = self.profile_manager.filter_by_architecture(
|
||||
quantisation_types, f16_model_path
|
||||
)
|
||||
|
||||
# Pre-mark unsupported types
|
||||
results: dict[QuantisationType, QuantisationResult] = {}
|
||||
for quant_type in unsupported_types:
|
||||
results[quant_type] = QuantisationResult(
|
||||
quantisation_type=quant_type,
|
||||
success=False,
|
||||
status="failed",
|
||||
error_message="K-quant requires llama.cpp architecture support",
|
||||
)
|
||||
|
||||
# Execute quantisations
|
||||
execution_results = self.quantisation_executor.execute_quantisations(
|
||||
model_source,
|
||||
f16_model_path,
|
||||
imatrix_path,
|
||||
output_repo,
|
||||
supported_types,
|
||||
self.workspace_manager.models_dir,
|
||||
)
|
||||
results.update(execution_results)
|
||||
|
||||
# Cleanup
|
||||
self.file_cleanup.cleanup_files(
|
||||
f16_model_path, model_source, self.workspace_manager.models_dir
|
||||
)
|
||||
|
||||
# Print summary
|
||||
self.progress_reporter.print_completion_summary(model_source, results, output_repo)
|
||||
|
||||
except KeyboardInterrupt:
|
||||
logger.error("❌ Process interrupted by user (Ctrl+C)")
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"❌ Critical error in quantisation workflow: {e}")
|
||||
logger.error("Full traceback:")
|
||||
for line in traceback.format_exc().splitlines():
|
||||
logger.error(f" {line}")
|
||||
raise
|
||||
finally:
|
||||
# Always flush pending README updates before exiting
|
||||
self.readme_limiter.flush()
|
||||
|
||||
return results
|
||||
|
||||
def _setup_environment(self, url: str) -> tuple[ModelSource, Path, Path | None, str]:
|
||||
"""Setup environment and prepare model for quantisation.
|
||||
|
||||
Returns:
|
||||
Tuple of (model_source, f16_model_path, imatrix_path, output_repo).
|
||||
"""
|
||||
model_source = self.url_parser.parse(url)
|
||||
self.progress_reporter.print_model_info(
|
||||
model_source, self.uploader.get_username(), str(self.work_dir)
|
||||
)
|
||||
|
||||
f16_model_path = self.model_manager.prepare_model(model_source)
|
||||
|
||||
output_repo = (
|
||||
f"{self.uploader.get_username()}/"
|
||||
f"{model_source.original_author}-{model_source.model_name}-GGUF"
|
||||
)
|
||||
|
||||
imatrix_path = None
|
||||
if self.use_imatrix:
|
||||
logger.info("Checking for importance matrix (imatrix)...")
|
||||
model_dir = self.workspace_manager.get_model_dir(model_source.model_name)
|
||||
imatrix_path = self.imatrix_handler.find_imatrix(model_dir)
|
||||
|
||||
# If no imatrix found, offer to generate or provide one
|
||||
if not imatrix_path:
|
||||
# First offer to generate
|
||||
imatrix_path = self.imatrix_generator.prompt_for_generation(
|
||||
model_source, model_dir, f16_model_path
|
||||
)
|
||||
|
||||
# If generation was skipped, offer to provide existing one
|
||||
if not imatrix_path:
|
||||
imatrix_path = self.imatrix_handler.prompt_for_user_imatrix(model_dir)
|
||||
|
||||
return model_source, f16_model_path, imatrix_path, output_repo
|
||||
|
||||
def _create_initial_repository(self, model_source: ModelSource, output_repo: str) -> None:
|
||||
"""Create initial repository with planned quantisations."""
|
||||
logger.info("Creating initial README with planned quantisations...")
|
||||
quantisation_types = self.profile_manager.get_quantisation_types(self.custom_profiles)
|
||||
planned_results = {
|
||||
qt: QuantisationResult(quantisation_type=qt, success=False, status="planned")
|
||||
for qt in quantisation_types
|
||||
}
|
||||
readme_path = self.readme_generator.generate(
|
||||
model_source, planned_results, self.workspace_manager.models_dir, output_repo
|
||||
)
|
||||
|
||||
if not self.no_upload:
|
||||
logger.info("Creating repository with planned quantisations...")
|
||||
self.uploader.upload_readme(output_repo, readme_path)
|
||||
else:
|
||||
logger.info("Skipping repository creation (--no-upload specified)")
|
132
helpers/quantisation/profile_manager.py
Normal file
|
@ -0,0 +1,132 @@
|
|||
"""Quantisation profile management.
|
||||
|
||||
Manages selection and validation of quantisation types based on
|
||||
user preferences, architecture support, and configuration.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from helpers.config.quantisation_configs import (
|
||||
DEFAULT_QUANTISATION_TYPES,
|
||||
SUPPORTED_QUANTISATION_TYPES,
|
||||
)
|
||||
from helpers.llama_cpp.architecture import ArchitectureDetector
|
||||
from helpers.logger import logger
|
||||
from helpers.models.quantisation import QuantisationType
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
class ProfileManager:
|
||||
"""Manages quantisation profiles and type selection.
|
||||
|
||||
Handles selection of quantisation types based on custom profiles,
|
||||
architecture support, and fallback to defaults.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def get_quantisation_types(
|
||||
custom_profiles: list[str] | None = None,
|
||||
) -> list[QuantisationType]:
|
||||
"""Get the quantisation types to use for this run.
|
||||
|
||||
Determines which quantisation types should be processed based on
|
||||
custom profiles provided by the user, or falls back to default
|
||||
configurations if no custom profiles are specified.
|
||||
|
||||
Returns:
|
||||
List of QuantisationType enums to process.
|
||||
"""
|
||||
if custom_profiles:
|
||||
return ProfileManager._parse_custom_profiles(custom_profiles)
|
||||
return DEFAULT_QUANTISATION_TYPES
|
||||
|
||||
@staticmethod
|
||||
def _parse_custom_profiles(profile_strings: list[str]) -> list[QuantisationType]:
|
||||
"""Parse custom profile strings to QuantisationType enums.
|
||||
|
||||
Validates and converts user-provided profile strings into proper
|
||||
QuantisationType enumerations, filtering out invalid or unsupported
|
||||
types whilst logging warnings for problematic entries.
|
||||
|
||||
Returns:
|
||||
List of valid QuantisationType enums.
|
||||
"""
|
||||
result = []
|
||||
for profile_str in profile_strings:
|
||||
try:
|
||||
profile = QuantisationType(profile_str.upper())
|
||||
if profile in SUPPORTED_QUANTISATION_TYPES:
|
||||
result.append(profile)
|
||||
else:
|
||||
logger.warning(f"Profile {profile_str} is not supported, skipping")
|
||||
except ValueError:
|
||||
logger.warning(f"Invalid profile {profile_str}, skipping")
|
||||
|
||||
# Fall back to defaults if no valid profiles
|
||||
return result or DEFAULT_QUANTISATION_TYPES
|
||||
|
||||
@staticmethod
|
||||
def filter_by_architecture(
|
||||
quantisation_types: list[QuantisationType],
|
||||
f16_model_path: Path,
|
||||
) -> tuple[list[QuantisationType], list[QuantisationType]]:
|
||||
"""Filter quantisation types based on architecture support.
|
||||
|
||||
Analyses the F16 GGUF model to determine architecture compatibility
|
||||
and filters the requested quantisation types accordingly. Separates
|
||||
supported types from unsupported ones, especially filtering K-quants
|
||||
for architectures not supported by llama.cpp.
|
||||
|
||||
Returns:
|
||||
Tuple of (supported_types, unsupported_types).
|
||||
"""
|
||||
if not ArchitectureDetector.check_architecture_support(f16_model_path):
|
||||
# Architecture not supported - filter out K-quants
|
||||
basic_types = ["Q4_0", "Q5_0", "Q6_0", "Q8_0"]
|
||||
supported = []
|
||||
unsupported = []
|
||||
|
||||
for quant_type in quantisation_types:
|
||||
if quant_type.value in basic_types:
|
||||
supported.append(quant_type)
|
||||
else:
|
||||
unsupported.append(quant_type)
|
||||
|
||||
if unsupported:
|
||||
logger.warning(
|
||||
"⚠️ Architecture not supported by llama.cpp - K-quants will be skipped"
|
||||
)
|
||||
logger.info("💡 Basic types (Q4_0, Q5_0, Q6_0, Q8_0) will still be generated")
|
||||
|
||||
return supported, unsupported
|
||||
|
||||
# All types supported
|
||||
return quantisation_types, []
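To illustrate the split, a sketch (the F16 path is illustrative; the outcome depends on what ArchitectureDetector reports for that file):

```python
from pathlib import Path

from helpers.models.quantisation import QuantisationType
from helpers.quantisation.profile_manager import ProfileManager

requested = [QuantisationType.Q4_K_M, QuantisationType.Q4_0, QuantisationType.Q8_0]
supported, unsupported = ProfileManager.filter_by_architecture(
    requested, Path("example-model-f16.gguf")
)
# Unsupported architecture: supported == [Q4_0, Q8_0], unsupported == [Q4_K_M]
# Supported architecture:   supported == requested,    unsupported == []
```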
|
||||
|
||||
@staticmethod
|
||||
def validate_profiles(profiles: list[str]) -> list[str]:
|
||||
"""Validate a list of profile strings.
|
||||
|
||||
Checks each profile string to ensure it corresponds to a valid
|
||||
and supported quantisation type, logging warnings for invalid
|
||||
entries whilst returning only the valid profile strings.
|
||||
|
||||
Returns:
|
||||
List of valid profile strings.
|
||||
"""
|
||||
valid = []
|
||||
for profile in profiles:
|
||||
try:
|
||||
quant_type = QuantisationType(profile.upper())
|
||||
if quant_type in SUPPORTED_QUANTISATION_TYPES:
|
||||
valid.append(profile)
|
||||
else:
|
||||
logger.warning(f"Profile {profile} exists but is not supported")
|
||||
except ValueError:
|
||||
logger.warning(f"Profile {profile} is not a valid quantisation type")
|
||||
|
||||
return valid
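As a quick illustration of the selection rules above (input is case-insensitive; invalid or unsupported entries are skipped, and an empty result falls back to the defaults):

```python
from helpers.quantisation.profile_manager import ProfileManager

# "q4_k_m" and "q6_k" are upper-cased and kept (assuming both are in
# SUPPORTED_QUANTISATION_TYPES); "not_a_profile" is skipped with a warning
types = ProfileManager.get_quantisation_types(["q4_k_m", "q6_k", "not_a_profile"])

# No custom profiles supplied -> DEFAULT_QUANTISATION_TYPES
defaults = ProfileManager.get_quantisation_types(None)
```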
|
151
helpers/quantisation/progress.py
Normal file
|
@ -0,0 +1,151 @@
|
|||
"""Progress tracking and reporting for quantisation workflow.
|
||||
|
||||
Provides utilities for tracking quantisation progress, generating
|
||||
status reports, and displaying completion summaries.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from helpers.logger import logger
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from helpers.models.quantisation import ModelSource, QuantisationResult, QuantisationType
|
||||
|
||||
|
||||
class ProgressReporter:
|
||||
"""Reports progress and status of quantisation operations.
|
||||
|
||||
Provides methods for displaying model information, progress updates,
|
||||
and completion summaries throughout the quantisation workflow.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def print_model_info(model_source: ModelSource, username: str, work_dir: str) -> None:
|
||||
"""Print model information at start of processing.
|
||||
|
||||
Displays comprehensive information about the model being processed,
|
||||
including source details, author information, and working directory
|
||||
to provide clear context at the beginning of quantisation workflows.
|
||||
"""
|
||||
logger.info(f"Source URL: {model_source.url}")
|
||||
logger.info(f"Source model: {model_source.source_model}")
|
||||
logger.info(f"Original author: {model_source.original_author}")
|
||||
logger.info(f"Model name: {model_source.model_name}")
|
||||
logger.info(f"Your HF username: {username}")
|
||||
logger.info(f"Working directory: {work_dir}")
|
||||
|
||||
@staticmethod
|
||||
def print_quantisation_start(
|
||||
index: int,
|
||||
total: int,
|
||||
quant_type: str,
|
||||
) -> None:
|
||||
"""Print message when starting a quantisation.
|
||||
|
||||
Displays progress information showing which quantisation is currently
|
||||
being processed within the overall batch, providing clear feedback
|
||||
about workflow advancement and the specific type being quantised.
|
||||
"""
|
||||
logger.info(f"Processing quantisation {index}/{total}: {quant_type}")
|
||||
|
||||
@staticmethod
|
||||
def print_completion_summary(
|
||||
model_source: ModelSource,
|
||||
results: dict[QuantisationType, QuantisationResult],
|
||||
output_repo: str,
|
||||
) -> None:
|
||||
"""Print completion summary with results.
|
||||
|
||||
Generates comprehensive completion report showing successful quantisations,
|
||||
file information, and repository links. Provides detailed feedback on
|
||||
the overall quantisation workflow outcome and model availability.
|
||||
"""
|
||||
successful_results = [r for r in results.values() if r.success]
|
||||
|
||||
if successful_results:
|
||||
logger.info("Complete! Your quantised models are available at:")
|
||||
logger.info(f" https://huggingface.co/{output_repo}")
|
||||
logger.info("Model info:")
|
||||
logger.info(f" - Source URL: {model_source.url}")
|
||||
logger.info(f" - Original: {model_source.source_model}")
|
||||
logger.info(
|
||||
" - Method: "
|
||||
f"{'Direct GGUF download' if model_source.is_gguf_repo else 'HF model conversion'}"
|
||||
)
|
||||
logger.info(f" - Quantised: {output_repo}")
|
||||
|
||||
for result in successful_results:
|
||||
if result.file_size:
|
||||
filename = (
|
||||
f"{model_source.original_author}-{model_source.model_name}-"
|
||||
f"{result.quantisation_type}.gguf"
|
||||
)
|
||||
logger.info(f" - {result.quantisation_type}: {filename} ({result.file_size})")
|
||||
else:
|
||||
logger.error(
|
||||
"All quantisations failed - repository created with documentation "
|
||||
"but no model files"
|
||||
)
|
||||
logger.error(f" Repository: https://huggingface.co/{output_repo}")
|
||||
|
||||
@staticmethod
|
||||
def print_upload_summary(completed: int, failed: int) -> None:
|
||||
"""Print upload completion summary.
|
||||
|
||||
Reports the final upload statistics showing successful and failed
|
||||
uploads with appropriate warning or success messaging based on
|
||||
the outcome of the upload batch process.
|
||||
"""
|
||||
if failed > 0:
|
||||
logger.warning(f"Upload summary: {completed} succeeded, {failed} failed")
|
||||
else:
|
||||
logger.info(f"All {completed} uploads completed successfully")
|
||||
|
||||
@staticmethod
|
||||
def print_architecture_warning() -> None:
|
||||
"""Print warning about unsupported architecture."""
|
||||
logger.warning("⚠️ Architecture not supported by llama.cpp - K-quants will be skipped")
|
||||
logger.info("💡 Basic types (Q4_0, Q5_0, Q6_0, Q8_0) will still be generated")
|
||||
|
||||
@staticmethod
|
||||
def get_status_emoji(status: str) -> str:
|
||||
"""Get emoji for a given status.
|
||||
|
||||
Maps status strings to appropriate emoji representations for enhanced
|
||||
visual feedback in progress reporting. Provides a default emoji for
|
||||
unknown status values to maintain consistent display formatting.
|
||||
|
||||
Returns:
|
||||
Appropriate emoji for the status.
|
||||
"""
|
||||
status_emojis = {
|
||||
"planned": "📋",
|
||||
"processing": "⚙️",
|
||||
"uploading": "📤",
|
||||
"completed": "✅",
|
||||
"failed": "❌",
|
||||
}
|
||||
return status_emojis.get(status, "❓")
|
||||
|
||||
@staticmethod
|
||||
def format_progress_bar(current: int, total: int, width: int = 30) -> str:
|
||||
"""Format a text progress bar.
|
||||
|
||||
Creates a visual progress representation using Unicode block characters
|
||||
with percentage display. Handles edge cases like zero totals and
|
||||
calculates appropriate fill ratios for the specified width.
|
||||
|
||||
Returns:
|
||||
Formatted progress bar string.
|
||||
"""
|
||||
if total == 0:
|
||||
return "[" + " " * width + "]"
|
||||
|
||||
progress = int((current / total) * width)
|
||||
filled = "█" * progress
|
||||
empty = "░" * (width - progress)
|
||||
percentage = (current / total) * 100
|
||||
|
||||
return f"[{filled}{empty}] {percentage:.1f}%"
|
23
helpers/readme/__init__.py
Normal file
|
@ -0,0 +1,23 @@
|
|||
"""README generation for quantised models.
|
||||
|
||||
Provides utilities for generating comprehensive documentation including
|
||||
model cards, quantisation tables, and status tracking.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from helpers.readme.formatter import (
|
||||
FileSizeFormatter,
|
||||
StatusFormatter,
|
||||
TableFormatter,
|
||||
TagFormatter,
|
||||
)
|
||||
from helpers.readme.generator import ReadmeGenerator
|
||||
|
||||
__all__ = [
|
||||
"FileSizeFormatter",
|
||||
"ReadmeGenerator",
|
||||
"StatusFormatter",
|
||||
"TableFormatter",
|
||||
"TagFormatter",
|
||||
]
|
265
helpers/readme/formatter.py
Normal file
|
@ -0,0 +1,265 @@
|
|||
"""README formatting utilities.
|
||||
|
||||
Provides formatters for status indicators, tables, and other README elements.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from helpers.config.quantisation_configs import QUANTISATION_CONFIGS
|
||||
from helpers.models.quantisation import QuantisationResult, QuantisationType
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pathlib import Path
|
||||
|
||||
from helpers.models.quantisation import ModelSource
|
||||
|
||||
# File size constant
|
||||
GIBIBYTE = 1024**3
|
||||
|
||||
|
||||
class StatusFormatter:
|
||||
"""Formats status indicators for README tables."""
|
||||
|
||||
@staticmethod
|
||||
def format_status(
|
||||
result: QuantisationResult,
|
||||
model_source: ModelSource,
|
||||
quant_type: QuantisationType,
|
||||
output_repo: str | None,
|
||||
) -> str:
|
||||
"""Format status indicator for README table.
|
||||
|
||||
Creates appropriate status indicator based on quantisation state
|
||||
including progress indicators, file sizes, and download links.
|
||||
|
||||
Returns:
|
||||
Formatted status string for table cell.
|
||||
"""
|
||||
status_map = {
|
||||
"planned": "⏳ Queued",
|
||||
"processing": "🔄 Processing...",
|
||||
"uploading": "⬆️ Uploading...",
|
||||
"failed": "❌ Failed",
|
||||
}
|
||||
|
||||
if hasattr(result, "status") and result.status in status_map:
|
||||
base_status = status_map[result.status]
|
||||
|
||||
# Check for architecture not supported error
|
||||
if (
|
||||
result.status == "failed"
|
||||
and hasattr(result, "error_message")
|
||||
and result.error_message
|
||||
and "architecture not supported" in str(result.error_message).lower()
|
||||
):
|
||||
return "⚠️ Skipped"
|
||||
|
||||
if result.status == "uploading" and hasattr(result, "file_size") and result.file_size:
|
||||
return f"{base_status} ({result.file_size})"
|
||||
|
||||
if result.status == "completed" or (hasattr(result, "success") and result.success):
|
||||
return StatusFormatter.format_success_status(
|
||||
result, model_source, quant_type, output_repo
|
||||
)
|
||||
|
||||
return base_status
|
||||
|
||||
# Legacy support
|
||||
if hasattr(result, "success") and result.success:
|
||||
return StatusFormatter.format_success_status(
|
||||
result, model_source, quant_type, output_repo
|
||||
)
|
||||
|
||||
return "❌ Failed"
|
||||
|
||||
@staticmethod
|
||||
def format_success_status(
|
||||
result: QuantisationResult,
|
||||
model_source: ModelSource,
|
||||
quant_type: QuantisationType,
|
||||
output_repo: str | None,
|
||||
) -> str:
|
||||
"""Format successful quantisation status with download link.
|
||||
|
||||
Creates a download link if repository information is available,
|
||||
otherwise shows file size.
|
||||
|
||||
Returns:
|
||||
Formatted success status string.
|
||||
"""
|
||||
if not output_repo:
|
||||
return (
|
||||
f"✅ {result.file_size}"
|
||||
if hasattr(result, "file_size") and result.file_size
|
||||
else "✅ Available"
|
||||
)
|
||||
|
||||
filename = (
|
||||
f"{model_source.original_author}-{model_source.model_name}-{quant_type.value}.gguf"
|
||||
)
|
||||
url = f"https://huggingface.co/{output_repo}?show_file_info={filename}"
|
||||
|
||||
if hasattr(result, "file_size") and result.file_size:
|
||||
return f"[✅ {result.file_size}]({url})"
|
||||
|
||||
return f"[✅ Available]({url})"
|
||||
|
||||
|
||||
class TableFormatter:
|
||||
"""Formats quantisation tables for README."""
|
||||
|
||||
@staticmethod
|
||||
def get_ordered_quantisation_types() -> list[QuantisationType]:
|
||||
"""Get quantisation types in display order.
|
||||
|
||||
Returns types ordered by precision level and variant.
|
||||
|
||||
Returns:
|
||||
Ordered list of quantisation types.
|
||||
"""
|
||||
return [
|
||||
# Q3 K-quants
|
||||
QuantisationType.Q3_K_M,
|
||||
QuantisationType.Q3_K_L,
|
||||
QuantisationType.Q3_K_XL,
|
||||
# Q4 types
|
||||
QuantisationType.Q4_0, # Basic
|
||||
QuantisationType.Q4_K_M,
|
||||
QuantisationType.Q4_K_L,
|
||||
# Q5 types
|
||||
QuantisationType.Q5_0, # Basic
|
||||
QuantisationType.Q5_K_M,
|
||||
QuantisationType.Q5_K_L,
|
||||
# Q6 types
|
||||
QuantisationType.Q6_0, # Basic
|
||||
QuantisationType.Q6_K,
|
||||
QuantisationType.Q6_K_L,
|
||||
# Q8 types
|
||||
QuantisationType.Q8_0, # Basic
|
||||
QuantisationType.Q8_K,
|
||||
]
|
||||
|
||||
@staticmethod
|
||||
def format_quantisation_row(
|
||||
quant_type: QuantisationType,
|
||||
result: QuantisationResult | None,
|
||||
model_source: ModelSource,
|
||||
output_repo: str | None,
|
||||
) -> str:
|
||||
"""Format a single quantisation table row.
|
||||
|
||||
Creates a formatted table row for the README displaying quantisation
|
||||
type, configuration details, and status information. Handles cases
|
||||
where no result is available by creating a default planned result.
|
||||
|
||||
Returns:
|
||||
Formatted table row string.
|
||||
"""
|
||||
# Create default result if none exists
|
||||
if result is None:
|
||||
result = QuantisationResult(
|
||||
quantisation_type=quant_type, success=False, status="planned"
|
||||
)
|
||||
|
||||
# Get configuration
|
||||
config = QUANTISATION_CONFIGS.get(quant_type)
|
||||
|
||||
# Format status
|
||||
status_formatter = StatusFormatter()
|
||||
status = status_formatter.format_status(result, model_source, quant_type, output_repo)
|
||||
|
||||
# Get configuration description
|
||||
config_desc = (
|
||||
config.get_compact_config(QUANTISATION_CONFIGS)
|
||||
if config
|
||||
else f"{quant_type} all layers"
|
||||
)
|
||||
|
||||
return f"| **{quant_type.value}** | {config_desc} | {status} |\n"
|
||||
|
||||
|
||||
class TagFormatter:
|
||||
"""Formats tags for README frontmatter."""
|
||||
|
||||
@staticmethod
|
||||
def build_tags(
|
||||
results: dict[QuantisationType, QuantisationResult],
|
||||
original_tags: list[str] | None = None,
|
||||
) -> list[str]:
|
||||
"""Build tags based on quantisation results.
|
||||
|
||||
Generates appropriate tags for the model repository based on
|
||||
successful quantisations and combines them with any original
|
||||
tags from the source model to create a comprehensive tag list.
|
||||
|
||||
Returns:
|
||||
Sorted list of unique tags.
|
||||
"""
|
||||
our_tags = ["gguf"]
|
||||
|
||||
# Add tags for successful quantisations
|
||||
for quant_type, result in results.items():
|
||||
if hasattr(result, "status") and result.status == "completed":
|
||||
if quant_type == QuantisationType.F16:
|
||||
our_tags.append("f16")
|
||||
elif hasattr(result, "quantisation_type"):
|
||||
# Convert to lowercase tag format
|
||||
our_tags.append(result.quantisation_type.value.lower())
|
||||
|
||||
# Check for F16 availability
|
||||
if (
|
||||
len(our_tags) == 1
|
||||
and QuantisationType.F16 in results
|
||||
and hasattr(results[QuantisationType.F16], "status")
|
||||
and results[QuantisationType.F16].status in {"completed", "uploading"}
|
||||
):
|
||||
our_tags.append("f16")
|
||||
|
||||
# Combine with original tags
|
||||
all_tags = our_tags
|
||||
if original_tags:
|
||||
all_tags = sorted(set(our_tags + original_tags))
|
||||
|
||||
return all_tags
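For instance, with one completed quantisation and the source model's original tags, the merged list comes out as follows (a sketch, constructing QuantisationResult exactly as elsewhere in this diff):

```python
from helpers.models.quantisation import QuantisationResult, QuantisationType
from helpers.readme import TagFormatter

results = {
    QuantisationType.Q4_K_M: QuantisationResult(
        quantisation_type=QuantisationType.Q4_K_M, success=True, status="completed"
    ),
}
TagFormatter.build_tags(results, original_tags=["text-generation"])
# -> ["gguf", "q4_k_m", "text-generation"]  (sorted, duplicates removed)
```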
|
||||
|
||||
|
||||
class FileSizeFormatter:
|
||||
"""Formats file sizes for display."""
|
||||
|
||||
@staticmethod
|
||||
def format_size_bytes(size_bytes: int) -> str:
|
||||
"""Format bytes to human-readable size.
|
||||
|
||||
Converts raw byte values into human-readable format using appropriate
|
||||
units (B, KB, MB, GB) with decimal precision for larger values to
|
||||
provide clear file size information in documentation.
|
||||
|
||||
Returns:
|
||||
Formatted size string (e.g., "4.5GB").
|
||||
"""
|
||||
if size_bytes < 1024:
|
||||
return f"{size_bytes}B"
|
||||
if size_bytes < 1024**2:
|
||||
return f"{size_bytes / 1024:.1f}KB"
|
||||
if size_bytes < GIBIBYTE:
|
||||
return f"{size_bytes / (1024**2):.1f}MB"
|
||||
return f"{size_bytes / GIBIBYTE:.1f}GB"
|
||||
|
||||
@staticmethod
|
||||
def get_file_size(file_path: Path) -> str:
|
||||
"""Get formatted file size from path.
|
||||
|
||||
Retrieves file size information from the filesystem and formats
|
||||
it into human-readable format. Handles non-existent files gracefully
|
||||
by returning a placeholder string for missing files.
|
||||
|
||||
Returns:
|
||||
Formatted size string or "-" if file doesn't exist.
|
||||
"""
|
||||
if not file_path.exists():
|
||||
return "-"
|
||||
|
||||
size_bytes = file_path.stat().st_size
|
||||
return FileSizeFormatter.format_size_bytes(size_bytes)
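A few worked values for the thresholds above (GIBIBYTE = 1024³):

```python
from helpers.readme import FileSizeFormatter

FileSizeFormatter.format_size_bytes(512)            # "512B"
FileSizeFormatter.format_size_bytes(2_048)          # "2.0KB"
FileSizeFormatter.format_size_bytes(5 * 1024**2)    # "5.0MB"
FileSizeFormatter.format_size_bytes(4_831_838_208)  # "4.5GB" (4.5 × 1024³)
```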
|
311
helpers/readme/generator.py
Normal file
|
@ -0,0 +1,311 @@
|
|||
"""README generation for quantised models.
|
||||
|
||||
Coordinates README creation by combining templates, formatting, and
|
||||
original model information.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from helpers.logger import logger
|
||||
from helpers.models.quantisation import QuantisationType
|
||||
from helpers.readme.formatter import (
|
||||
FileSizeFormatter,
|
||||
TableFormatter,
|
||||
TagFormatter,
|
||||
)
|
||||
from helpers.readme.templates import (
|
||||
get_f16_row_template,
|
||||
get_frontmatter_template,
|
||||
get_header_template,
|
||||
get_original_model_section,
|
||||
get_quantisation_info,
|
||||
)
|
||||
from helpers.utils.config_parser import ConfigParser
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pathlib import Path
|
||||
|
||||
from helpers.models.quantisation import ModelSource, QuantisationResult
|
||||
|
||||
# File size constant
|
||||
GIBIBYTE = 1024**3
|
||||
|
||||
|
||||
class ReadmeGenerator:
|
||||
"""Generates README files for quantised models.
|
||||
|
||||
Creates comprehensive README documentation including model cards,
|
||||
quantisation details, and status tracking. Supports both initial
|
||||
planning documentation and final result summaries.
|
||||
"""
|
||||
|
||||
def generate(
|
||||
self,
|
||||
model_source: ModelSource,
|
||||
results: dict[QuantisationType, QuantisationResult],
|
||||
models_dir: Path,
|
||||
output_repo: str | None = None,
|
||||
) -> Path:
|
||||
"""Generate README file for quantised model repository.
|
||||
|
||||
Creates a comprehensive README with frontmatter, quantisation table,
|
||||
and original model information. Handles status tracking for planned,
|
||||
processing, and completed quantisations.
|
||||
|
||||
Returns:
|
||||
Path to generated README file.
|
||||
"""
|
||||
logger.info("Creating model card...")
|
||||
model_dir = models_dir / model_source.model_name
|
||||
readme_path = model_dir / "README.md"
|
||||
|
||||
# Get original README content
|
||||
original_content = self._get_original_readme(model_source, model_dir)
|
||||
|
||||
# Generate new README
|
||||
readme_content = self._generate_readme_content(
|
||||
model_source, results, original_content, output_repo, models_dir
|
||||
)
|
||||
|
||||
readme_path.write_text(readme_content)
|
||||
return readme_path
|
||||
|
||||
def _get_architecture(self, model_dir: Path) -> str | None:
|
||||
"""Get the architecture from the model's config.json.
|
||||
|
||||
Returns:
|
||||
Architecture name or None if not found.
|
||||
"""
|
||||
config_path = model_dir / "config.json"
|
||||
if not config_path.exists():
|
||||
return None
|
||||
|
||||
try:
|
||||
with config_path.open(encoding="utf-8") as f:
|
||||
config = json.load(f)
|
||||
|
||||
# Get the architectures field - it's a list
|
||||
architectures = config.get("architectures", [])
|
||||
if architectures:
|
||||
arch_name = architectures[0]
|
||||
# Get the mapped architecture (what it will be converted to)
|
||||
parser = ConfigParser()
|
||||
mapped_arch = parser.get_architecture_mapping(arch_name)
|
||||
logger.info(f"Architecture: {arch_name} -> {mapped_arch}")
|
||||
return mapped_arch
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Could not determine architecture: {e}")
|
||||
|
||||
return None
|
||||
|
||||
def _get_original_readme(self, model_source: ModelSource, model_dir: Path) -> dict[str, str]:
|
||||
"""Extract original README and metadata.
|
||||
|
||||
Downloads or reads the original model's README for inclusion in the
|
||||
quantised model documentation. Parses YAML frontmatter if present.
|
||||
|
||||
Returns:
|
||||
Dictionary with readme content, licence, tags, and frontmatter.
|
||||
"""
|
||||
content = {"readme": "", "licence": "apache-2.0", "tags": "", "frontmatter": ""}
|
||||
|
||||
# Check for preserved original README first
|
||||
original_readme_path = model_dir / "README.original.md"
|
||||
readme_path = model_dir / "README.md"
|
||||
|
||||
if original_readme_path.exists():
|
||||
# Use the preserved original
|
||||
content["readme"] = original_readme_path.read_text(encoding="utf-8")
|
||||
logger.info(f"Found preserved original README ({len(content['readme'])} characters)")
|
||||
elif readme_path.exists():
|
||||
# First time - preserve the original and use it
|
||||
readme_content = readme_path.read_text(encoding="utf-8")
|
||||
|
||||
# Check if this is already our generated README
|
||||
if (
|
||||
f"{model_source.original_author}-{model_source.model_name}-GGUF"
|
||||
not in readme_content
|
||||
):
|
||||
# This is the original - preserve it
|
||||
original_readme_path.write_text(readme_content)
|
||||
content["readme"] = readme_content
|
||||
logger.info(f"Preserved original README ({len(readme_content)} characters)")
|
||||
else:
|
||||
# This is our README, try to extract original content
|
||||
logger.info("Found existing generated README, extracting original content")
|
||||
# Try to find the separator
|
||||
separator_idx = readme_content.find("\n---\n\n## Original Model Information\n")
|
||||
if separator_idx > 0:
|
||||
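# 37 = the 36-character separator match plus the blank line that follows it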
content["readme"] = readme_content[separator_idx + 37 :]
|
||||
else:
|
||||
logger.info("No README found to preserve")
|
||||
|
||||
# Parse frontmatter if we have content
|
||||
if content["readme"]:
|
||||
parsed = self._parse_frontmatter(content["readme"])
|
||||
content.update(parsed)
|
||||
|
||||
return content
|
||||
|
||||
def _parse_frontmatter(self, readme_text: str) -> dict[str, str]:
|
||||
"""Parse YAML frontmatter from README.
|
||||
|
||||
Extracts metadata from YAML frontmatter including licence, tags,
|
||||
and other model card fields.
|
||||
|
||||
Returns:
|
||||
Dictionary with separated content and metadata.
|
||||
"""
|
||||
lines = readme_text.split("\n")
|
||||
if lines[0] != "---":
|
||||
return {
|
||||
"readme": readme_text,
|
||||
"licence": "apache-2.0",
|
||||
"tags": "",
|
||||
"frontmatter": "",
|
||||
}
|
||||
|
||||
frontmatter_end = -1
|
||||
for i, line in enumerate(lines[1:], 1):
|
||||
if line == "---":
|
||||
frontmatter_end = i
|
||||
break
|
||||
|
||||
if frontmatter_end == -1:
|
||||
return {
|
||||
"readme": readme_text,
|
||||
"licence": "apache-2.0",
|
||||
"tags": "",
|
||||
"frontmatter": "",
|
||||
}
|
||||
|
||||
frontmatter = "\n".join(lines[1:frontmatter_end])
|
||||
content = "\n".join(lines[frontmatter_end + 1 :])
|
||||
|
||||
# Extract licence
|
||||
licence_match = re.search(r"^license:\s*(.+)$", frontmatter, re.MULTILINE)
|
||||
licence_val = licence_match.group(1).strip().strip('"') if licence_match else "apache-2.0"
|
||||
|
||||
# Extract tags
|
||||
tags = []
|
||||
in_tags = False
|
||||
for line in frontmatter.split("\n"):
|
||||
if line.startswith("tags:"):
|
||||
in_tags = True
|
||||
continue
|
||||
if in_tags:
|
||||
if line.startswith("- "):
|
||||
tags.append(line[2:].strip())
|
||||
elif line and not line.startswith(" "):
|
||||
break
|
||||
|
||||
return {
|
||||
"readme": content,
|
||||
"licence": licence_val,
|
||||
"tags": ",".join(tags),
|
||||
"frontmatter": frontmatter,
|
||||
}
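To make the parsing concrete (a sketch; the method is private, so this is purely illustrative):

```python
from helpers.readme import ReadmeGenerator

sample = """---
license: mit
tags:
- chat
- example
---
# Original model card
Body text here.
"""

parsed = ReadmeGenerator()._parse_frontmatter(sample)
# parsed["licence"]     -> "mit"
# parsed["tags"]        -> "chat,example"
# parsed["frontmatter"] -> "license: mit\ntags:\n- chat\n- example"
# parsed["readme"]      -> "# Original model card\nBody text here.\n"
```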
|
||||
|
||||
def _generate_readme_content(
|
||||
self,
|
||||
model_source: ModelSource,
|
||||
results: dict[QuantisationType, QuantisationResult],
|
||||
original_content: dict[str, str],
|
||||
output_repo: str | None = None,
|
||||
models_dir: Path | None = None,
|
||||
) -> str:
|
||||
"""Generate complete README content with quantisation details.
|
||||
|
||||
Creates the full README including YAML frontmatter, quantisation status
|
||||
table, and original model information.
|
||||
|
||||
Returns:
|
||||
Complete README markdown content.
|
||||
"""
|
||||
# Build tags
|
||||
tag_formatter = TagFormatter()
|
||||
original_tags = original_content["tags"].split(",") if original_content["tags"] else []
|
||||
all_tags = tag_formatter.build_tags(results, original_tags)
|
||||
|
||||
# Build frontmatter
|
||||
content = get_frontmatter_template(
|
||||
original_content["licence"],
|
||||
model_source.source_model,
|
||||
all_tags,
|
||||
)
|
||||
|
||||
# Add header
|
||||
content += get_header_template(
|
||||
model_source.original_author,
|
||||
model_source.model_name,
|
||||
model_source.source_model,
|
||||
)
|
||||
|
||||
# Add quantisation table
|
||||
table_formatter = TableFormatter()
|
||||
for quant_type in table_formatter.get_ordered_quantisation_types():
|
||||
result = results.get(quant_type)
|
||||
content += table_formatter.format_quantisation_row(
|
||||
quant_type, result, model_source, output_repo
|
||||
)
|
||||
|
||||
# Add F16 row if applicable
|
||||
if not model_source.is_gguf_repo and output_repo:
|
||||
content += self._format_f16_row(model_source, results, output_repo, models_dir)
|
||||
|
||||
# Add quantisation information
|
||||
content += get_quantisation_info()
|
||||
|
||||
# Add original model section if available
|
||||
if original_content.get("readme"):
|
||||
content += get_original_model_section(original_content["readme"])
|
||||
|
||||
return content
|
||||
|
||||
def _format_f16_row(
|
||||
self,
|
||||
model_source: ModelSource,
|
||||
results: dict[QuantisationType, QuantisationResult],
|
||||
output_repo: str,
|
||||
models_dir: Path | None = None,
|
||||
) -> str:
|
||||
"""Format F16 GGUF row for the table.
|
||||
|
||||
Creates a properly formatted F16 reference row for the quantisation
|
||||
table using source model information, results data, and repository
|
||||
details with optional models directory for file size calculation.
|
||||
|
||||
Returns:
|
||||
Formatted F16 table row.
|
||||
"""
|
||||
# Get F16 result from results dict
|
||||
f16_result = results.get(QuantisationType.F16)
|
||||
|
||||
# Get file size
|
||||
f16_size = "-"
|
||||
if f16_result and hasattr(f16_result, "file_size"):
|
||||
f16_size = f16_result.file_size or "-"
|
||||
elif models_dir:
|
||||
# Try to get from actual file
|
||||
f16_filename = f"{model_source.original_author}-{model_source.model_name}-f16.gguf"
|
||||
f16_path = models_dir / model_source.model_name / f16_filename
|
||||
if f16_path.exists():
|
||||
f16_size = FileSizeFormatter.get_file_size(f16_path)
|
||||
|
||||
# Get status
|
||||
status = "planned"
|
||||
if f16_result and hasattr(f16_result, "status"):
|
||||
status = f16_result.status
|
||||
|
||||
return get_f16_row_template(
|
||||
model_source.original_author,
|
||||
model_source.model_name,
|
||||
output_repo,
|
||||
f16_size,
|
||||
status,
|
||||
)
|
228
helpers/readme/templates.py
Normal file
|
@ -0,0 +1,228 @@
|
|||
"""README templates for quantised models.
|
||||
|
||||
Provides template strings and builders for generating README documentation.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
|
||||
def get_frontmatter_template(
|
||||
licence: str,
|
||||
base_model: str,
|
||||
tags: list[str],
|
||||
) -> str:
|
||||
"""Generate YAML frontmatter for README.
|
||||
|
||||
Creates the YAML metadata header for HuggingFace model cards including
|
||||
licence information, library specification, base model reference, and
|
||||
tag listings formatted according to HuggingFace conventions.
|
||||
|
||||
Returns:
|
||||
Formatted YAML frontmatter string.
|
||||
"""
|
||||
frontmatter = f"""---
|
||||
license: {licence}
|
||||
library_name: gguf
|
||||
base_model: {base_model}
|
||||
tags:
|
||||
"""
|
||||
for tag in tags:
|
||||
if tag.strip():
|
||||
frontmatter += f"- {tag.strip()}\n"
|
||||
|
||||
frontmatter += "---\n\n"
|
||||
return frontmatter
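The resulting frontmatter for a typical repository looks like this (values illustrative):

```python
print(get_frontmatter_template("apache-2.0", "example-org/example-model", ["gguf", "q4_k_m"]))
# ---
# license: apache-2.0
# library_name: gguf
# base_model: example-org/example-model
# tags:
# - gguf
# - q4_k_m
# ---
```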
|
||||
|
||||
|
||||
def get_header_template(
|
||||
original_author: str,
|
||||
model_name: str,
|
||||
source_model: str,
|
||||
) -> str:
|
||||
"""Generate README header section.
|
||||
|
||||
Creates the main header section with model title, description of the
|
||||
quantisation process, and initial table structure for displaying
|
||||
quantisation variants and their status information.
|
||||
|
||||
Returns:
|
||||
Formatted header markdown.
|
||||
"""
|
||||
hf_url = f"https://huggingface.co/{source_model}"
|
||||
return f"""# {original_author}-{model_name}-GGUF
|
||||
|
||||
GGUF quantisations of [{source_model}]({hf_url}) using
|
||||
[Bartowski](https://huggingface.co/bartowski)'s method. Created with [llm-gguf-tools](https://git.tomfos.tr/tom/llm-gguf-tools)
|
||||
which replicates Bartowski's quantisation profiles.
|
||||
|
||||
| Variant | Configuration | Status |
|
||||
|---|---|---|
|
||||
"""
|
||||
|
||||
|
||||
def get_downloads_section(download_instruction: str | None = None) -> str:
|
||||
"""Generate downloads and usage section.
|
||||
|
||||
Creates comprehensive usage documentation including download instructions,
|
||||
quick start examples for various runtimes (llama.cpp, Ollama, LM Studio),
|
||||
and integration guidance with optional custom instructions.
|
||||
|
||||
Returns:
|
||||
Formatted downloads section markdown.
|
||||
"""
|
||||
base_section = """
|
||||
## 📥 Download Links
|
||||
|
||||
Direct download links are available for each quantisation in the table above. Click the ✅ status to
|
||||
go to the file page.
|
||||
|
||||
## 🚀 Quick Start
|
||||
|
||||
### Using llama.cpp
|
||||
|
||||
```bash
|
||||
# Download the model (replace Q4_K_M with your chosen quantisation)
|
||||
wget https://huggingface.co/YOUR_REPO/resolve/main/model-Q4_K_M.gguf
|
||||
|
||||
# Run with llama.cpp
|
||||
./llama-cli -m model-Q4_K_M.gguf -p "Your prompt here"
|
||||
```
|
||||
|
||||
### Using Ollama
|
||||
|
||||
```bash
|
||||
# Create Modelfile
|
||||
echo "FROM ./model-Q4_K_M.gguf" > Modelfile
|
||||
|
||||
# Create and run the model
|
||||
ollama create mymodel -f Modelfile
|
||||
ollama run mymodel
|
||||
```
|
||||
|
||||
### Using LM Studio
|
||||
|
||||
1. Open LM Studio
|
||||
2. Click "Download Model"
|
||||
3. Paste the HuggingFace repository URL
|
||||
4. Select your preferred quantisation
|
||||
5. Click Download
|
||||
|
||||
"""
|
||||
if download_instruction:
|
||||
base_section = f"{download_instruction}\n\n{base_section}"
|
||||
|
||||
return base_section
|
||||
|
||||
|
||||
def get_quantisation_info() -> str:
|
||||
"""Get information about quantisation types.
|
||||
|
||||
Returns:
|
||||
Formatted quantisation information markdown.
|
||||
"""
|
||||
return """
|
||||
## 📊 Quantisation Information
|
||||
|
||||
### Bartowski Naming Convention
|
||||
|
||||
- **L variants** (Q3_K_L, Q4_K_L, Q5_K_L): Use Q8_0 for embeddings/output weights
|
||||
- **M variants** (Q3_K_M, Q4_K_M, Q5_K_M): Standard K-quant configuration
|
||||
- **XL variant** (Q3_K_XL): Q8_0 embeddings + Q6_K output weights
|
||||
- **_L suffix** (Q6_K_L): Q8_0 for output.weight tensor
|
||||
|
||||
### Recommended Quantisations
|
||||
|
||||
- **Q4_K_M**: Best balance of quality and size (4.58GB for 7B model)
|
||||
- **Q5_K_M**: Higher quality, larger size (5.33GB for 7B model)
|
||||
- **Q3_K_L**: Smallest with good quality (3.35GB for 7B model)
|
||||
- **Q6_K_L**: Near original quality (5.65GB for 7B model)
|
||||
- **Q8_0**: Highest quality quantisation (7.17GB for 7B model)
|
||||
|
||||
### Basic vs K-quants
|
||||
|
||||
- **Basic types** (Q4_0, Q5_0, Q6_0, Q8_0): Simple quantisation, universally compatible
|
||||
- **K-quants** (Q#_K_*): Advanced quantisation with better quality/size ratios
|
||||
|
||||
Choose K-quants when available for better performance. Basic types are fallbacks for unsupported
|
||||
architectures.
|
||||
"""
|
||||
|
||||
|
||||
def get_original_model_section(
|
||||
original_readme: str,
|
||||
separator: str = "---",
|
||||
) -> str:
|
||||
"""Format original model documentation section.
|
||||
|
||||
Formats the original model's documentation for inclusion in the
|
||||
quantised model's README, preserving important context whilst
|
||||
clearly separating it from the quantisation-specific information.
|
||||
|
||||
Returns:
|
||||
Formatted original model section.
|
||||
"""
|
||||
if not original_readme:
|
||||
return ""
|
||||
|
||||
return f"""
|
||||
{separator}
|
||||
|
||||
## Original Model Information
|
||||
|
||||
{original_readme}
|
||||
"""
|
||||
|
||||
|
||||
def get_f16_row_template(
|
||||
original_author: str,
|
||||
model_name: str,
|
||||
output_repo: str,
|
||||
file_size: str = "-",
|
||||
status: str = "completed",
|
||||
) -> str:
|
||||
"""Generate F16 GGUF row for the table.
|
||||
|
||||
Creates a formatted table row for the F16 reference model with
|
||||
appropriate status indicators, download links, and file size
|
||||
information based on upload status and availability.
|
||||
|
||||
Returns:
|
||||
Formatted table row for F16.
|
||||
"""
|
||||
filename = f"{original_author}-{model_name}-f16.gguf"
|
||||
url = f"https://huggingface.co/{output_repo}/blob/main/{filename}"
|
||||
|
||||
if status == "uploading":
|
||||
status_text = f"⬆️ Uploading... ({file_size})"
|
||||
elif status == "completed":
|
||||
status_text = f"[✅ {file_size}]({url})"
|
||||
else:
|
||||
status_text = "⏳ Queued"
|
||||
|
||||
return f"| **F16** | Full precision reference | {status_text} |\n"
|
||||
|
||||
|
||||
def get_troubleshooting_section() -> str:
|
||||
"""Get troubleshooting section for README.
|
||||
|
||||
Returns:
|
||||
Formatted troubleshooting markdown.
|
||||
"""
|
||||
return """
|
||||
## 🔧 Troubleshooting
|
||||
|
||||
### File Not Found
|
||||
- Ensure you're using the correct repository URL
|
||||
- Check that the quantisation has completed (✅ status)
|
||||
- Try refreshing the page if recently uploaded
|
||||
|
||||
### Performance Issues
|
||||
- Use smaller quantisations for limited RAM/VRAM
|
||||
- Q4_K_M offers the best balance for most users
|
||||
- Enable GPU acceleration if available
|
||||
|
||||
### Compatibility
|
||||
- K-quants require llama.cpp or compatible runtime
|
||||
- Basic types (Q4_0, Q5_0, etc.) work with all runtimes
|
||||
- Check your runtime's documentation for supported types
|
||||
"""
|
|
@ -1,6 +0,0 @@
|
|||
"""Service layer for llm-gguf-tools.
|
||||
|
||||
Provides high-level service interfaces for interacting with external systems
|
||||
including HuggingFace, llama.cpp, and filesystem operations. Uses UK English
|
||||
spelling conventions throughout.
|
||||
"""
|
|
@ -1,236 +0,0 @@
|
|||
"""GGUF file operations service.
|
||||
|
||||
Provides unified interface for creating, writing, and manipulating GGUF files.
|
||||
Consolidates GGUF-specific operations from conversion and quantisation workflows.
|
||||
Uses UK English spelling conventions throughout.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import gc
|
||||
from typing import TYPE_CHECKING, Any, Protocol
|
||||
|
||||
import gguf
|
||||
import torch
|
||||
from safetensors import safe_open
|
||||
|
||||
from helpers.logger import logger
|
||||
from helpers.services.filesystem import FilesystemService
|
||||
from helpers.utils.config_parser import ConfigParser
|
||||
|
||||
|
||||
class VisionConfig(Protocol):
|
||||
"""Protocol for vision model configuration."""
|
||||
|
||||
hidden_size: int
|
||||
num_hidden_layers: int
|
||||
num_attention_heads: int
|
||||
intermediate_size: int
|
||||
patch_size: int
|
||||
spatial_merge_size: int
|
||||
|
||||
|
||||
class TensorMapper(Protocol):
|
||||
"""Protocol for tensor name mapping."""
|
||||
|
||||
def map_tensor_name(self, name: str) -> str | None:
|
||||
"""Map a tensor name to its GGUF equivalent."""
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
|
||||
from helpers.models.conversion import ModelConfig
|
||||
|
||||
|
||||
class GGUFWriter:
|
||||
"""Manages GGUF file creation and metadata writing.
|
||||
|
||||
Provides high-level interface for GGUF file operations including metadata
|
||||
configuration, tensor addition, and tokeniser integration. Encapsulates
|
||||
low-level GGUF library interactions for consistent error handling.
|
||||
"""
|
||||
|
||||
def __init__(self, output_path: Path, architecture: str) -> None:
|
||||
"""Initialise GGUF writer with output path and architecture.
|
||||
|
||||
Creates the underlying GGUF writer instance and prepares for metadata
|
||||
and tensor addition. Sets up the file structure for the specified
|
||||
model architecture.
|
||||
"""
|
||||
self.output_path = output_path
|
||||
self.architecture = architecture
|
||||
self.writer = gguf.GGUFWriter(str(output_path), architecture)
|
||||
logger.info(f"Created GGUF writer for {architecture} architecture")
|
||||
|
||||
def add_metadata(self, model_config: ModelConfig, model_name: str) -> None:
|
||||
"""Add comprehensive metadata from model configuration.
|
||||
|
||||
Writes general model information, architectural parameters, and
|
||||
quantisation settings to the GGUF file header. Handles both standard
|
||||
and vision model configurations with appropriate parameter mapping.
|
||||
"""
|
||||
# General metadata
|
||||
self.writer.add_name(model_name)
|
||||
self.writer.add_description(f"Converted from {model_config.architectures[0]}")
|
||||
self.writer.add_file_type(gguf.LlamaFileType.ALL_F32)
|
||||
|
||||
# Model parameters from config
|
||||
params = model_config.to_gguf_params()
|
||||
self.writer.add_context_length(params.context_length)
|
||||
self.writer.add_embedding_length(params.embedding_length)
|
||||
self.writer.add_block_count(params.block_count)
|
||||
self.writer.add_feed_forward_length(params.feed_forward_length)
|
||||
self.writer.add_head_count(params.attention_head_count)
|
||||
self.writer.add_head_count_kv(params.attention_head_count_kv)
|
||||
self.writer.add_layer_norm_rms_eps(params.attention_layer_norm_rms_epsilon)
|
||||
self.writer.add_rope_freq_base(params.rope_freq_base)
|
||||
self.writer.add_rope_dimension_count(params.rope_dimension_count)
|
||||
|
||||
logger.info(f"Added metadata: {params.block_count} layers, {params.context_length} context")
|
||||
|
||||
def add_vision_metadata(self, vision_config: VisionConfig | None) -> None:
|
||||
"""Add vision model parameters to GGUF metadata.
|
||||
|
||||
Configures vision-specific parameters for multimodal models including
|
||||
embedding dimensions, attention heads, and spatial processing settings.
|
||||
"""
|
||||
if not vision_config:
|
||||
return
|
||||
|
||||
logger.info("Adding vision model parameters...")
|
||||
self.writer.add_vision_embedding_length(vision_config.hidden_size)
|
||||
self.writer.add_vision_block_count(vision_config.num_hidden_layers)
|
||||
self.writer.add_vision_head_count(vision_config.num_attention_heads)
|
||||
self.writer.add_vision_feed_forward_length(vision_config.intermediate_size)
|
||||
self.writer.add_vision_patch_size(vision_config.patch_size)
|
||||
self.writer.add_vision_spatial_merge_size(vision_config.spatial_merge_size)
|
||||
|
||||
if hasattr(vision_config, "rms_norm_eps") and vision_config.rms_norm_eps:
|
||||
self.writer.add_vision_attention_layernorm_eps(vision_config.rms_norm_eps)
|
||||
|
||||
def add_tokeniser(self, tokeniser_config: dict[str, Any]) -> None:
|
||||
"""Add tokeniser metadata to GGUF file.
|
||||
|
||||
Writes special token IDs and tokeniser model type to enable proper
|
||||
text processing during inference. Uses sensible defaults for missing
|
||||
configuration values.
|
||||
"""
|
||||
self.writer.add_bos_token_id(tokeniser_config.get("bos_token_id", 1))
|
||||
self.writer.add_eos_token_id(tokeniser_config.get("eos_token_id", 2))
|
||||
self.writer.add_unk_token_id(tokeniser_config.get("unk_token_id", 0))
|
||||
self.writer.add_pad_token_id(tokeniser_config.get("pad_token_id", 0))
|
||||
self.writer.add_tokenizer_model(tokeniser_config.get("model_type", "llama"))
|
||||
|
||||
logger.info("Added tokeniser configuration")
|
||||
|
||||
def add_tensor(self, name: str, data: np.ndarray) -> None:
|
||||
"""Add a tensor to the GGUF file.
|
||||
|
||||
Writes tensor data with the specified name to the file. Handles
|
||||
data type conversions and validates tensor shapes.
|
||||
"""
|
||||
self.writer.add_tensor(name, data)
|
||||
|
||||
def finalise(self) -> None:
|
||||
"""Write all data to file and close writer.
|
||||
|
||||
Completes the GGUF file creation by writing headers, key-value data,
|
||||
and tensor data in the correct order. Ensures proper file closure.
|
||||
"""
|
||||
logger.info(f"Writing GGUF file to {self.output_path}")
|
||||
self.writer.write_header_to_file()
|
||||
self.writer.write_kv_data_to_file()
|
||||
self.writer.write_tensors_to_file()
|
||||
self.writer.close()
|
||||
logger.info("GGUF file written successfully")
|
||||
|
||||
|
||||
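For reference, the writer is meant to be driven in a create → metadata → tensors → finalise order. A minimal usage sketch follows; it is illustrative only: `model_config` is assumed to be a `ModelConfig` loaded elsewhere, and the tensor shown is a placeholder.

```python
from pathlib import Path

import numpy as np

writer = GGUFWriter(Path("work/example-f32.gguf"), architecture="llama")
writer.add_metadata(model_config, model_name="example")  # model_config: ModelConfig (assumed)
writer.add_tokeniser({"bos_token_id": 1, "eos_token_id": 2, "model_type": "llama"})
writer.add_tensor("token_embd.weight", np.zeros((32000, 4096), dtype=np.float32))
writer.finalise()  # writes header, KV data and tensor data, then closes the file
```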
class GGUFConverter:
|
||||
"""High-level GGUF conversion orchestrator.
|
||||
|
||||
Coordinates the complete conversion workflow from source models to GGUF
|
||||
format, managing metadata extraction, tensor mapping, and file writing.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def convert_safetensors(
|
||||
model_path: Path,
|
||||
output_path: Path,
|
||||
model_config: ModelConfig,
|
||||
architecture: str,
|
||||
tensor_mapper: TensorMapper,
|
||||
) -> bool:
|
||||
"""Convert SafeTensors model to GGUF format.
|
||||
|
||||
Orchestrates the conversion process including metadata setup, tensor
|
||||
loading with BFloat16 support, name mapping, and tokeniser integration.
|
||||
|
||||
Returns:
|
||||
True if conversion successful, False otherwise.
|
||||
"""
|
||||
logger.info(f"Converting {model_path.name} to GGUF...")
|
||||
|
||||
# Create writer
|
||||
writer_wrapper = GGUFWriter(output_path, architecture)
|
||||
|
||||
# Add metadata
|
||||
writer_wrapper.add_metadata(model_config, model_path.name)
|
||||
|
||||
# Add vision metadata if present
|
||||
if model_config.vision_config:
|
||||
writer_wrapper.add_vision_metadata(model_config.vision_config)
|
||||
|
||||
# Load and add tensors
|
||||
fs = FilesystemService()
|
||||
tensor_files = fs.find_safetensor_files(model_path)
|
||||
logger.info(f"Found {len(tensor_files)} tensor file(s)")
|
||||
|
||||
tensor_count = 0
|
||||
for tensor_file in tensor_files:
|
||||
logger.info(f"Loading {tensor_file.name}...")
|
||||
with safe_open(tensor_file, framework="pt") as f:
|
||||
for tensor_name in f.keys(): # noqa: SIM118
|
||||
tensor_data = f.get_tensor(tensor_name)
|
||||
|
||||
# Convert BFloat16 to Float32
|
||||
if hasattr(tensor_data, "numpy"):
|
||||
if torch and tensor_data.dtype == torch.bfloat16:
|
||||
tensor_data = tensor_data.float()
|
||||
tensor_data = tensor_data.numpy()
|
||||
|
||||
# Map tensor name
|
||||
gguf_name = tensor_mapper.map_tensor_name(tensor_name)
|
||||
|
||||
if gguf_name:
|
||||
writer_wrapper.add_tensor(gguf_name, tensor_data)
|
||||
tensor_count += 1
|
||||
|
||||
if tensor_count % 100 == 0:
|
||||
logger.info(f" Processed {tensor_count} tensors...")
|
||||
|
||||
# Free memory after processing each tensor
|
||||
del tensor_data
|
||||
|
||||
# Force garbage collection after processing each file
|
||||
gc.collect()
|
||||
|
||||
logger.info(f"Total tensors processed: {tensor_count}")
|
||||
|
||||
# Add tokeniser
|
||||
try:
|
||||
tok_config = ConfigParser.load_tokeniser_config(model_path)
|
||||
writer_wrapper.add_tokeniser(tok_config)
|
||||
logger.info("Tokeniser added")
|
||||
except Exception as e:
|
||||
logger.warning(f"Could not add tokeniser: {e}")
|
||||
|
||||
# Finalise file
|
||||
writer_wrapper.finalise()
|
||||
|
||||
file_size = fs.get_file_size(output_path)
|
||||
logger.info(f"Conversion complete! Output: {output_path} ({file_size})")
|
||||
|
||||
return True
|
|
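A hedged sketch of how the converter above is driven end to end; the directory layout and model name are placeholders, and `ConfigParser`/`TensorMapper` are the helpers imported at the top of this module.

```python
from pathlib import Path

from helpers.utils.config_parser import ConfigParser
from helpers.utils.tensor_mapping import TensorMapper

model_dir = Path("work/models/example-model")            # contains *.safetensors shards
output_path = Path("work/models/example-model-f32.gguf")

parser = ConfigParser()
model_config = parser.load_model_config(model_dir)
arch = parser.get_architecture_mapping(model_config.architectures[0])

ok = GGUFConverter.convert_safetensors(model_dir, output_path, model_config, arch, TensorMapper())
print("converted" if ok else "conversion failed")
```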
@ -1,613 +0,0 @@
|
|||
"""HuggingFace operations service.
|
||||
|
||||
Handles all interactions with HuggingFace including model downloads,
|
||||
uploads, README generation, and repository management. Uses UK English
|
||||
spelling conventions throughout.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from helpers.config.quantisation_configs import QUANTISATION_CONFIGS
|
||||
from helpers.logger import logger
|
||||
from helpers.models.quantisation import QuantisationType
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from helpers.models.quantisation import ModelSource, QuantisationResult
|
||||
|
||||
# Constants for file size formatting
|
||||
GIBIBYTE = 1024**3
|
||||
|
||||
|
||||
class HuggingFaceService:
|
||||
"""Manages HuggingFace repository operations.
|
||||
|
||||
Provides methods for downloading models, uploading files, and managing
|
||||
repositories. Handles authentication, error recovery, and progress tracking
|
||||
for robust interaction with HuggingFace services.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def get_username() -> str:
|
||||
"""Get authenticated HuggingFace username.
|
||||
|
||||
Retrieves the current user's HuggingFace username using the CLI.
|
||||
Requires prior authentication via `huggingface-cli login`.
|
||||
|
||||
Returns:
|
||||
HuggingFace username.
|
||||
|
||||
Raises:
|
||||
RuntimeError: If not authenticated or CLI not available.
|
||||
"""
|
||||
try:
|
||||
result = subprocess.run(
|
||||
["huggingface-cli", "whoami"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
check=True,
|
||||
)
|
||||
return result.stdout.strip()
|
||||
except (subprocess.CalledProcessError, FileNotFoundError) as err:
|
||||
msg = "Please log in to HuggingFace first: huggingface-cli login"
|
||||
raise RuntimeError(msg) from err
|
||||
|
||||
@staticmethod
|
||||
def download_model(
|
||||
model_name: str, output_dir: Path, include_pattern: str | None = None
|
||||
) -> None:
|
||||
"""Download model from HuggingFace.
|
||||
|
||||
Downloads a complete model or specific files matching a pattern.
|
||||
Creates the output directory if it doesn't exist. Supports filtered
|
||||
downloads for efficient bandwidth usage when only certain files are needed.
|
||||
"""
|
||||
logger.info(f"Downloading {model_name} to {output_dir}")
|
||||
|
||||
cmd = [
|
||||
"huggingface-cli",
|
||||
"download",
|
||||
model_name,
|
||||
"--local-dir",
|
||||
str(output_dir),
|
||||
]
|
||||
|
||||
if include_pattern:
|
||||
cmd.extend(["--include", include_pattern])
|
||||
|
||||
subprocess.run(cmd, check=True, capture_output=True, text=True)
|
||||
logger.info("Download complete")
|
||||
|
||||
@staticmethod
|
||||
def upload_file(
|
||||
repo_id: str,
|
||||
local_path: Path,
|
||||
repo_path: str | None = None,
|
||||
create_repo: bool = False,
|
||||
) -> None:
|
||||
"""Upload a file to HuggingFace repository.
|
||||
|
||||
Uploads a single file to the specified repository path. Can create
|
||||
the repository if it doesn't exist. Uses git directly when possible
|
||||
to avoid automatic PR creation.
|
||||
|
||||
Raises:
|
||||
CalledProcessError: If upload fails.
|
||||
"""
|
||||
repo_path = repo_path or local_path.name
|
||||
logger.info(f"Uploading {local_path.name} to {repo_id}/{repo_path}")
|
||||
|
||||
# Try git-based upload first to avoid PR creation
|
||||
if HuggingFaceService._try_git_upload(
|
||||
repo_id, local_path, repo_path, create_repo=create_repo
|
||||
):
|
||||
logger.info(f"Uploaded {repo_path} via git")
|
||||
return
|
||||
|
||||
# Fallback to huggingface-cli
|
||||
logger.info("Git upload failed, trying huggingface-cli...")
|
||||
cmd = [
|
||||
"huggingface-cli",
|
||||
"upload",
|
||||
repo_id,
|
||||
str(local_path),
|
||||
repo_path,
|
||||
"--revision",
|
||||
"main", # Explicitly push to main branch
|
||||
"--commit-message",
|
||||
f"Add {repo_path}",
|
||||
]
|
||||
|
||||
if create_repo:
|
||||
cmd.append("--create")
|
||||
|
||||
try:
|
||||
subprocess.run(cmd, check=True, capture_output=True)
|
||||
logger.info(f"Uploaded {repo_path}")
|
||||
except subprocess.CalledProcessError:
|
||||
if create_repo:
|
||||
# Repository might already exist, retry without --create
|
||||
cmd = cmd[:-1] # Remove --create flag
|
||||
subprocess.run(cmd, check=True, capture_output=True, text=True)
|
||||
logger.info(f"Updated {repo_path}")
|
||||
else:
|
||||
raise
|
||||
|
||||
@staticmethod
|
||||
def _try_git_upload(
|
||||
repo_id: str,
|
||||
local_path: Path,
|
||||
repo_path: str,
|
||||
*,
|
||||
create_repo: bool = False,
|
||||
) -> bool:
|
||||
"""Try to upload file using git directly to avoid PR creation.
|
||||
|
||||
Returns:
|
||||
bool: True if upload successful, False if should fallback to CLI.
|
||||
"""
|
||||
try:
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
temp_path = Path(temp_dir)
|
||||
repo_url = f"https://huggingface.co/{repo_id}"
|
||||
|
||||
# Clone repository
|
||||
logger.info(f"Cloning {repo_url}...")
|
||||
result = subprocess.run(
|
||||
["git", "clone", repo_url, str(temp_path / "repo")],
|
||||
check=False,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
|
||||
if result.returncode != 0:
|
||||
if create_repo:
|
||||
# Repository doesn't exist, let huggingface-cli handle creation
|
||||
return False
|
||||
logger.warning(f"Clone failed: {result.stderr}")
|
||||
return False
|
||||
|
||||
repo_dir = temp_path / "repo"
|
||||
target_file = repo_dir / repo_path
|
||||
|
||||
# Ensure target directory exists
|
||||
target_file.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Copy file
|
||||
shutil.copy2(local_path, target_file)
|
||||
|
||||
# Check if there are any changes
|
||||
status_result = subprocess.run(
|
||||
["git", "status", "--porcelain"],
|
||||
cwd=repo_dir,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
check=True,
|
||||
)
|
||||
|
||||
if not status_result.stdout.strip():
|
||||
logger.info(f"No changes detected for {repo_path}, file already up-to-date")
|
||||
return True # File is already up-to-date, no need to push
|
||||
|
||||
# Git add, commit, push
|
||||
subprocess.run(
|
||||
["git", "add", repo_path],
|
||||
cwd=repo_dir,
|
||||
check=True,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
subprocess.run(
|
||||
["git", "commit", "-m", f"Update {repo_path}"],
|
||||
cwd=repo_dir,
|
||||
check=True,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
subprocess.run(
|
||||
["git", "push"],
|
||||
cwd=repo_dir,
|
||||
check=True,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
|
||||
return True
|
||||
|
||||
except subprocess.CalledProcessError as e:
|
||||
logger.warning(f"Git upload failed: {e}")
|
||||
return False
|
||||
except Exception as e:
|
||||
logger.warning(f"Git upload error: {e}")
|
||||
return False
|
||||
|
||||
|
||||
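The static methods above are designed to be called directly; a rough usage sketch, assuming `huggingface-cli login` has already been run (repository and file names are placeholders).

```python
from pathlib import Path

username = HuggingFaceService.get_username()
work_dir = Path("work/models/example")

# Fetch only the SafeTensors shards rather than the whole repository
HuggingFaceService.download_model("some-org/some-model", work_dir, include_pattern="*.safetensors")

# Push a quantised file, creating the target repository on first upload
HuggingFaceService.upload_file(
    f"{username}/some-model-GGUF",
    work_dir / "some-model-Q4_K_M.gguf",
    create_repo=True,
)
```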
class ReadmeGenerator:
|
||||
"""Generates README files for quantised models.
|
||||
|
||||
Creates comprehensive README documentation including model cards,
|
||||
quantisation details, and status tracking. Supports both initial
|
||||
planning documentation and final result summaries.
|
||||
"""
|
||||
|
||||
def generate(
|
||||
self,
|
||||
model_source: ModelSource,
|
||||
results: dict[QuantisationType, QuantisationResult],
|
||||
models_dir: Path,
|
||||
output_repo: str | None = None,
|
||||
) -> Path:
|
||||
"""Generate README file for quantised model repository.
|
||||
|
||||
Creates a comprehensive README with frontmatter, quantisation table,
|
||||
and original model information. Handles status tracking for planned,
|
||||
processing, and completed quantisations.
|
||||
|
||||
Returns:
|
||||
Path to generated README file.
|
||||
"""
|
||||
logger.info("Creating model card...")
|
||||
|
||||
model_dir = models_dir / model_source.model_name
|
||||
readme_path = model_dir / "README.md"
|
||||
|
||||
# Get original README content
|
||||
original_content = self._get_original_readme(model_source, model_dir)
|
||||
|
||||
# Generate new README
|
||||
readme_content = self._generate_readme_content(
|
||||
model_source, results, original_content, output_repo
|
||||
)
|
||||
|
||||
readme_path.write_text(readme_content)
|
||||
return readme_path
|
||||
|
||||
def _get_original_readme(self, model_source: ModelSource, model_dir: Path) -> dict[str, str]:
|
||||
"""Extract original README and metadata.
|
||||
|
||||
Downloads or reads the original model's README for inclusion in the
|
||||
quantised model documentation. Parses YAML frontmatter if present.
|
||||
|
||||
Returns:
|
||||
Dictionary with readme content, licence, tags, and frontmatter.
|
||||
"""
|
||||
content = {"readme": "", "licence": "apache-2.0", "tags": "", "frontmatter": ""}
|
||||
|
||||
# Check for preserved original README first
|
||||
original_readme_path = model_dir / "README.original.md"
|
||||
readme_path = model_dir / "README.md"
|
||||
|
||||
if original_readme_path.exists():
|
||||
# Use the preserved original
|
||||
content["readme"] = original_readme_path.read_text(encoding="utf-8")
|
||||
logger.info(f"Found preserved original README ({len(content['readme'])} characters)")
|
||||
elif readme_path.exists():
|
||||
# First time - preserve the original and use it
|
||||
readme_content = readme_path.read_text(encoding="utf-8")
|
||||
|
||||
# Check if this is already our generated README
|
||||
if (
|
||||
f"{model_source.original_author}-{model_source.model_name}-GGUF"
|
||||
not in readme_content
|
||||
):
|
||||
# This is the original - preserve it
|
||||
original_readme_path.write_text(readme_content, encoding="utf-8")
|
||||
content["readme"] = readme_content
|
||||
readme_len = len(content["readme"])
|
||||
logger.info(
|
||||
f"Preserved original README as README.original.md ({readme_len} characters)"
|
||||
)
|
||||
else:
|
||||
# This is our generated README, need to download the original
|
||||
logger.info("Found generated README, downloading original from source")
|
||||
content = self._download_readme(model_source)
|
||||
# Save the downloaded original for future use
|
||||
if content["readme"]:
|
||||
original_readme_path.write_text(content["readme"], encoding="utf-8")
|
||||
logger.info("Preserved downloaded original README as README.original.md")
|
||||
else:
|
||||
# No local README - download from source
|
||||
content = self._download_readme(model_source)
|
||||
# Save the downloaded original for future use
|
||||
if content["readme"]:
|
||||
original_readme_path.write_text(content["readme"], encoding="utf-8")
|
||||
logger.info("Preserved downloaded original README as README.original.md")
|
||||
|
||||
# Parse frontmatter if present
|
||||
if content["readme"].startswith("---\n"):
|
||||
content = self._parse_frontmatter(content["readme"])
|
||||
|
||||
return content
|
||||
|
||||
def _download_readme(self, model_source: ModelSource) -> dict[str, str]:
|
||||
"""Download README from HuggingFace repository.
|
||||
|
||||
Attempts to download just the README.md file from the source repository
|
||||
for efficient documentation extraction.
|
||||
|
||||
Returns:
|
||||
Dictionary with readme content and default metadata.
|
||||
"""
|
||||
content = {"readme": "", "licence": "apache-2.0", "tags": "", "frontmatter": ""}
|
||||
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
try:
|
||||
logger.info(f"Downloading README from {model_source.source_model}...")
|
||||
subprocess.run(
|
||||
[
|
||||
"huggingface-cli",
|
||||
"download",
|
||||
model_source.source_model,
|
||||
"--include",
|
||||
"README.md",
|
||||
"--local-dir",
|
||||
temp_dir,
|
||||
],
|
||||
check=True,
|
||||
capture_output=True,
|
||||
)
|
||||
|
||||
readme_path = Path(temp_dir) / "README.md"
|
||||
if readme_path.exists():
|
||||
content["readme"] = readme_path.read_text(encoding="utf-8")
|
||||
logger.info(f"Downloaded README ({len(content['readme'])} characters)")
|
||||
except subprocess.CalledProcessError as e:
|
||||
logger.warning(f"Failed to download README: {e}")
|
||||
|
||||
return content
|
||||
|
||||
def _parse_frontmatter(self, readme_text: str) -> dict[str, str]:
|
||||
"""Parse YAML frontmatter from README.
|
||||
|
||||
Extracts metadata from YAML frontmatter including licence, tags,
|
||||
and other model card fields.
|
||||
|
||||
Returns:
|
||||
Dictionary with separated content and metadata.
|
||||
"""
|
||||
lines = readme_text.split("\n")
|
||||
if lines[0] != "---":
|
||||
return {
|
||||
"readme": readme_text,
|
||||
"licence": "apache-2.0",
|
||||
"tags": "",
|
||||
"frontmatter": "",
|
||||
}
|
||||
|
||||
frontmatter_end = -1
|
||||
for i, line in enumerate(lines[1:], 1):
|
||||
if line == "---":
|
||||
frontmatter_end = i
|
||||
break
|
||||
|
||||
if frontmatter_end == -1:
|
||||
return {
|
||||
"readme": readme_text,
|
||||
"licence": "apache-2.0",
|
||||
"tags": "",
|
||||
"frontmatter": "",
|
||||
}
|
||||
|
||||
frontmatter = "\n".join(lines[1:frontmatter_end])
|
||||
content = "\n".join(lines[frontmatter_end + 1 :])
|
||||
|
||||
# Extract licence
|
||||
licence_match = re.search(r"^license:\s*(.+)$", frontmatter, re.MULTILINE)
|
||||
licence_val = licence_match.group(1).strip().strip('"') if licence_match else "apache-2.0"
|
||||
|
||||
# Extract tags
|
||||
tags = []
|
||||
in_tags = False
|
||||
for line in frontmatter.split("\n"):
|
||||
if line.startswith("tags:"):
|
||||
in_tags = True
|
||||
continue
|
||||
if in_tags:
|
||||
if line.startswith("- "):
|
||||
tags.append(line[2:].strip())
|
||||
elif line and not line.startswith(" "):
|
||||
break
|
||||
|
||||
return {
|
||||
"readme": content,
|
||||
"licence": licence_val,
|
||||
"tags": ",".join(tags),
|
||||
"frontmatter": frontmatter,
|
||||
}
|
||||
|
||||
def _generate_readme_content(
|
||||
self,
|
||||
model_source: ModelSource,
|
||||
results: dict[QuantisationType, QuantisationResult],
|
||||
original_content: dict[str, str],
|
||||
output_repo: str | None = None,
|
||||
) -> str:
|
||||
"""Generate complete README content with quantisation details.
|
||||
|
||||
Creates the full README including YAML frontmatter, quantisation status
|
||||
table, and original model information.
|
||||
|
||||
Returns:
|
||||
Complete README markdown content.
|
||||
"""
|
||||
# Build tags
|
||||
our_tags = [
|
||||
"quantised",
|
||||
"gguf",
|
||||
"q3_k_m",
|
||||
"q3_k_l",
|
||||
"q3_k_xl",
|
||||
"q4_k_m",
|
||||
"q4_k_l",
|
||||
"q5_k_m",
|
||||
"q5_k_l",
|
||||
"q6_k",
|
||||
"q6_k_l",
|
||||
"q8_0",
|
||||
"bartowski-method",
|
||||
]
|
||||
original_tags = original_content["tags"].split(",") if original_content["tags"] else []
|
||||
all_tags = sorted(set(our_tags + original_tags))
|
||||
|
||||
# Build frontmatter
|
||||
frontmatter = f"""---
|
||||
license: {original_content["licence"]}
|
||||
library_name: gguf
|
||||
base_model: {model_source.source_model}
|
||||
tags:
|
||||
"""
|
||||
for tag in all_tags:
|
||||
if tag.strip():
|
||||
frontmatter += f"- {tag.strip()}\n"
|
||||
|
||||
frontmatter += "---\n\n"
|
||||
|
||||
# Build main content
|
||||
hf_url = f"https://huggingface.co/{model_source.source_model}"
|
||||
content = f"""# {model_source.original_author}-{model_source.model_name}-GGUF
|
||||
|
||||
GGUF quantisations of [{model_source.source_model}]({hf_url}) using
|
||||
[Bartowski](https://huggingface.co/bartowski)'s method. Created with [llm-gguf-tools](https://git.tomfos.tr/tom/llm-gguf-tools)
|
||||
which replicates Bartowski's quantisation profiles.
|
||||
|
||||
| Variant | Configuration | File Size | Status |
|
||||
|---|---|---|---|
|
||||
"""
|
||||
|
||||
# Add results table - group by layer config patterns
|
||||
supported_types = [
|
||||
QuantisationType.Q3_K_M,
|
||||
QuantisationType.Q3_K_L,
|
||||
QuantisationType.Q3_K_XL,
|
||||
QuantisationType.Q4_K_M,
|
||||
QuantisationType.Q4_K_L,
|
||||
QuantisationType.Q5_K_M,
|
||||
QuantisationType.Q5_K_L,
|
||||
QuantisationType.Q6_K,
|
||||
QuantisationType.Q6_K_L,
|
||||
QuantisationType.Q8_0,
|
||||
]
|
||||
|
||||
for quant_type in supported_types:
|
||||
result = results.get(quant_type)
|
||||
if not result:
|
||||
result = type("Result", (), {"status": "planned", "success": False})()
|
||||
|
||||
config = QUANTISATION_CONFIGS.get(quant_type)
|
||||
file_size = self._format_file_size(result)
|
||||
status = self._format_status(result, model_source, quant_type, output_repo)
|
||||
|
||||
# Get configuration description from the config itself
|
||||
config_desc = config.get_compact_config(QUANTISATION_CONFIGS) if config else f"{quant_type.value} all layers"
|
||||
|
||||
content += f"| **{quant_type.value}** | {config_desc} | {file_size} | {status} |\n"
|
||||
|
||||
content += """
|
||||
|
||||
**Key:** `E` = Embeddings, `O` = Output, `A` = Attention, `F` = FFN
|
||||
|
||||
See [Bartowski Analysis](https://git.tomfos.tr/tom/llm-gguf-tools/src/branch/main/docs/bartowski_analysis.md)
|
||||
for detailed quantisation strategies and [Documentation](https://git.tomfos.tr/tom/llm-gguf-tools/src/branch/main/docs/)
|
||||
for more on the tools and methods I use.
|
||||
|
||||
"""
|
||||
|
||||
# Add original content
|
||||
if original_content["readme"]:
|
||||
content += "## Original Model Card\n\n---\n\n" + original_content["readme"]
|
||||
else:
|
||||
content += f"## Original Model\n\nQuantisation of [{model_source.source_model}](https://huggingface.co/{model_source.source_model})."
|
||||
|
||||
return frontmatter + content
|
||||
|
||||
def _format_file_size(self, result: QuantisationResult) -> str:
|
||||
"""Format file size for README table.
|
||||
|
||||
Returns:
|
||||
Formatted file size string or dash if not available.
|
||||
"""
|
||||
if hasattr(result, "file_size") and result.file_size:
|
||||
return result.file_size
|
||||
if hasattr(result, "success") and result.success and hasattr(result, "file_path"):
|
||||
# Try to get file size from path if available
|
||||
try:
|
||||
if result.file_path and Path(result.file_path).exists():
|
||||
size_bytes = Path(result.file_path).stat().st_size
|
||||
size_gb = size_bytes / GIBIBYTE
|
||||
return f"{size_gb:.1f}GB"
|
||||
except Exception:
|
||||
pass
|
||||
return "-"
|
||||
|
||||
def _format_status(
|
||||
self,
|
||||
result: QuantisationResult,
|
||||
model_source: ModelSource,
|
||||
quant_type: QuantisationType,
|
||||
output_repo: str | None,
|
||||
) -> str:
|
||||
"""Format status indicator for README table.
|
||||
|
||||
Creates appropriate status indicator based on quantisation state
|
||||
including progress indicators, file sizes, and download links.
|
||||
|
||||
Returns:
|
||||
Formatted status string for table cell.
|
||||
"""
|
||||
status_map = {
|
||||
"planned": "⏳ Queued",
|
||||
"processing": "🔄 Processing...",
|
||||
"uploading": "⬆️ Uploading...",
|
||||
"failed": "❌ Failed",
|
||||
}
|
||||
|
||||
if hasattr(result, "status") and result.status in status_map:
|
||||
base_status = status_map[result.status]
|
||||
|
||||
if result.status == "uploading" and hasattr(result, "file_size") and result.file_size:
|
||||
return f"{base_status} ({result.file_size})"
|
||||
if result.status == "completed" or (hasattr(result, "success") and result.success):
|
||||
return self._format_success_status(result, model_source, quant_type, output_repo)
|
||||
return base_status
|
||||
|
||||
# Legacy support
|
||||
if hasattr(result, "success") and result.success:
|
||||
return self._format_success_status(result, model_source, quant_type, output_repo)
|
||||
return "❌ Failed"
|
||||
|
||||
def _format_success_status(
|
||||
self,
|
||||
result: QuantisationResult,
|
||||
model_source: ModelSource,
|
||||
quant_type: QuantisationType,
|
||||
output_repo: str | None,
|
||||
) -> str:
|
||||
"""Format successful quantisation status with download link.
|
||||
|
||||
Creates a download link if repository information is available,
|
||||
otherwise shows file size.
|
||||
|
||||
Returns:
|
||||
Formatted success status string.
|
||||
"""
|
||||
if not output_repo:
|
||||
return (
|
||||
f"✅ {result.file_size}"
|
||||
if hasattr(result, "file_size") and result.file_size
|
||||
else "✅ Available"
|
||||
)
|
||||
|
||||
filename = (
|
||||
f"{model_source.original_author}-{model_source.model_name}-{quant_type.value}.gguf"
|
||||
)
|
||||
url = f"https://huggingface.co/{output_repo}?show_file_info={filename}"
|
||||
|
||||
if hasattr(result, "file_size") and result.file_size:
|
||||
return f"[✅ {result.file_size}]({url})"
|
||||
return f"[✅ Available]({url})"
|
|
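Pulling the pieces together, the generator is invoked once per model with the accumulated results; a hedged sketch (the `model_source` and `results` objects come from the wider workflow and are only indicated here).

```python
from pathlib import Path

generator = ReadmeGenerator()
readme_path = generator.generate(
    model_source,                   # ModelSource parsed from the source URL (assumed)
    results,                        # dict[QuantisationType, QuantisationResult] (assumed)
    Path("work/models"),
    output_repo="someuser/example-model-GGUF",
)
print(f"Model card written to {readme_path}")
```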
@ -1,83 +0,0 @@
"""Importance matrix (imatrix) management service.

Manages detection and use of existing importance matrix files for
quantisation guidance. Provides user prompts for supplying pre-computed
imatrix files from external sources.
"""

from __future__ import annotations

from typing import TYPE_CHECKING

from helpers.logger import logger
from helpers.services.filesystem import FilesystemService

if TYPE_CHECKING:
    from pathlib import Path


class IMatrixManager:
    """Handles importance matrix file management for quantisation.

    Locates existing importance matrix files or prompts users to provide
    pre-computed matrices from external sources. These matrices guide
    quantisation decisions to preserve model quality.
    """

    def __init__(self) -> None:
        """Initialise IMatrixManager."""
        self.fs = FilesystemService()

    def find_imatrix(self, model_dir: Path) -> Path | None:
        """Find or prompt for importance matrix file.

        Searches for existing imatrix files first, then provides interactive
        prompts for user-supplied matrices. See docs/imatrix_data.md for
        instructions on generating imatrix files.

        Returns:
            Path to imatrix file, or None if not available.
        """
        imatrix_path = model_dir / "imatrix.dat"

        # Check for existing imatrix
        if imatrix_path.exists():
            logger.info(f"Found existing imatrix: {imatrix_path.name}")
            return imatrix_path

        # Try user-provided imatrix
        return self._prompt_for_user_imatrix(model_dir, imatrix_path)

    def _prompt_for_user_imatrix(self, model_dir: Path, imatrix_path: Path) -> Path | None:
        """Prompt user for existing imatrix file.

        Returns:
            Path to user-provided imatrix, or None if not available.
        """
        logger.info(f"Model directory: {model_dir}")
        logger.info(f"Looking for imatrix file at: {imatrix_path}")
        logger.info("\n" + "=" * 70)
        logger.info("📊 No existing imatrix file found")
        logger.info("\nYou have two options:")
        logger.info("  1. Provide a pre-computed imatrix file")
        logger.info("     (💡 see docs/imatrix_data.md to generate your own)")
        logger.info("  2. Skip imatrix usage (lower quality quantisation)")
        logger.info("=" * 70)

        response = input("\n❓ Do you have an imatrix file to provide? (y/N): ").strip().lower()

        if response != "y":
            logger.info("Continuing without imatrix (quantisation quality may be lower)")
            logger.info("ℹ️ See docs/imatrix_data.md for instructions on generating imatrix files")  # noqa: RUF001
            return None

        logger.info(f"\nPlease place your imatrix.dat file in: {model_dir}")
        input("⏳ Press Enter when you've placed the file (or Ctrl+C to cancel)...")

        if imatrix_path.exists():
            file_size = self.fs.get_file_size(imatrix_path)
            logger.info(f"✅ Found imatrix file! ({file_size})")
            return imatrix_path

        logger.warning("No imatrix.dat file found - continuing without imatrix")
        return None
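In the quantisation flow the manager is consulted once per model directory before any quantisation starts; a minimal sketch with a placeholder path:

```python
from pathlib import Path

manager = IMatrixManager()
imatrix = manager.find_imatrix(Path("work/models/example"))  # may prompt interactively
if imatrix is None:
    print("Proceeding without an importance matrix (lower quality)")
```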
@ -1,756 +0,0 @@
|
|||
"""Python API wrapper for llama-cpp-python quantisation operations.
|
||||
|
||||
Provides high-level Python interfaces for model quantisation using llama-cpp-python
|
||||
bindings. Implements partial tensor-specific quantisation support through embedding
|
||||
and output tensor type configuration.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import ctypes
|
||||
import gc
|
||||
import logging
|
||||
import os
|
||||
import signal
|
||||
import sys
|
||||
import traceback
|
||||
from typing import TYPE_CHECKING, Any, ClassVar, Never
|
||||
|
||||
import psutil
|
||||
|
||||
from helpers.logger import logger
|
||||
from helpers.services.gguf import GGUFConverter
|
||||
from helpers.utils.config_parser import ConfigParser
|
||||
from helpers.utils.tensor_mapping import TensorMapper
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pathlib import Path
|
||||
|
||||
from helpers.models.quantisation import QuantisationConfig
|
||||
|
||||
# Import llama_cpp when needed
|
||||
try:
|
||||
import llama_cpp
|
||||
from llama_cpp import llama_model_quantize_params
|
||||
|
||||
LLAMA_CPP_AVAILABLE = True
|
||||
except ImportError:
|
||||
LLAMA_CPP_AVAILABLE = False
|
||||
logger.warning("llama-cpp-python not available - falling back to binary mode")
|
||||
|
||||
|
||||
class LlamaCppPythonAPI:
|
||||
"""Python API wrapper for llama.cpp quantisation operations.
|
||||
|
||||
Provides direct Python access to quantisation functionality using llama-cpp-python
|
||||
bindings. Implements partial tensor-specific quantisation through token embedding
|
||||
and output tensor type configuration, which provides differentiation between
|
||||
Q4_K variants even without full per-layer tensor control.
|
||||
"""
|
||||
|
||||
# Mapping of custom variant prefixes to their base types
|
||||
VARIANT_BASE_MAPPING: ClassVar[dict[str, str]] = {
|
||||
"Q3_K_": "Q3_K_M",
|
||||
"Q4_K_": "Q4_K_M",
|
||||
"Q5_K_": "Q5_K_M",
|
||||
"Q6_K_": "Q6_K",
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def is_available() -> bool:
|
||||
"""Check if llama-cpp-python is available for use.
|
||||
|
||||
Returns:
|
||||
True if llama-cpp-python bindings are installed and functional.
|
||||
"""
|
||||
return LLAMA_CPP_AVAILABLE
|
||||
|
||||
@staticmethod
|
||||
def get_quantisation_type(config_name: str) -> int:
|
||||
"""Map configuration name to llama_cpp quantisation type constant.
|
||||
|
||||
Supports a wide range of quantisation types from Q2 to Q8, including
|
||||
K-quants and legacy formats. Handles both simple formats (Q4_K_M, Q6_K)
|
||||
and custom suffixed variants (Q4_K_M_L, Q5_K_M_XL) by mapping them to
|
||||
their base types for llama-cpp-python compatibility.
|
||||
|
||||
Returns:
|
||||
llama_cpp quantisation type constant for base quantisation.
|
||||
|
||||
Raises:
|
||||
RuntimeError: If llama-cpp-python is not available.
|
||||
ValueError: If the quantisation type is not supported.
|
||||
"""
|
||||
if not LLAMA_CPP_AVAILABLE:
|
||||
msg = "llama-cpp-python not available"
|
||||
raise RuntimeError(msg)
|
||||
|
||||
# Normalise the config name to extract base type
|
||||
# E.g., "Q4_K_L" or "Q4_K_XL" -> "Q4_K_M" (default for Q4_K)
|
||||
# E.g., "Q4_K_M_XXL" -> "Q4_K_M"
|
||||
config_upper = config_name.upper()
|
||||
|
||||
# Direct mapping for exact matches
|
||||
type_mapping = {
|
||||
# Q2 variants (not recommended but supported)
|
||||
"Q2_K": llama_cpp.LLAMA_FTYPE_MOSTLY_Q2_K,
|
||||
"Q2_K_S": llama_cpp.LLAMA_FTYPE_MOSTLY_Q2_K_S,
|
||||
# Q3 K-quants
|
||||
"Q3_K_S": llama_cpp.LLAMA_FTYPE_MOSTLY_Q3_K_S,
|
||||
"Q3_K_M": llama_cpp.LLAMA_FTYPE_MOSTLY_Q3_K_M,
|
||||
# Q4 K-quants (most common)
|
||||
"Q4_K_S": llama_cpp.LLAMA_FTYPE_MOSTLY_Q4_K_S,
|
||||
"Q4_K_M": llama_cpp.LLAMA_FTYPE_MOSTLY_Q4_K_M,
|
||||
# Q5 K-quants
|
||||
"Q5_K_S": llama_cpp.LLAMA_FTYPE_MOSTLY_Q5_K_S,
|
||||
"Q5_K_M": llama_cpp.LLAMA_FTYPE_MOSTLY_Q5_K_M,
|
||||
# Q6_K (single variant)
|
||||
"Q6_K": llama_cpp.LLAMA_FTYPE_MOSTLY_Q6_K,
|
||||
# Q8_0 (highest common quantisation)
|
||||
"Q8_0": llama_cpp.LLAMA_FTYPE_MOSTLY_Q8_0,
|
||||
# Legacy quantisation formats
|
||||
"Q4_0": llama_cpp.LLAMA_FTYPE_MOSTLY_Q4_0,
|
||||
"Q4_1": llama_cpp.LLAMA_FTYPE_MOSTLY_Q4_1,
|
||||
"Q5_0": llama_cpp.LLAMA_FTYPE_MOSTLY_Q5_0,
|
||||
"Q5_1": llama_cpp.LLAMA_FTYPE_MOSTLY_Q5_1,
|
||||
# IQ (Integer Quantisation) variants - experimental
|
||||
"IQ2_XXS": llama_cpp.LLAMA_FTYPE_MOSTLY_IQ2_XXS,
|
||||
"IQ2_XS": llama_cpp.LLAMA_FTYPE_MOSTLY_IQ2_XS,
|
||||
"IQ2_S": llama_cpp.LLAMA_FTYPE_MOSTLY_IQ2_S,
|
||||
"IQ2_M": llama_cpp.LLAMA_FTYPE_MOSTLY_IQ2_M,
|
||||
"IQ3_XXS": llama_cpp.LLAMA_FTYPE_MOSTLY_IQ3_XXS,
|
||||
"IQ3_XS": llama_cpp.LLAMA_FTYPE_MOSTLY_IQ3_XS,
|
||||
"IQ3_S": llama_cpp.LLAMA_FTYPE_MOSTLY_IQ3_S,
|
||||
"IQ3_M": llama_cpp.LLAMA_FTYPE_MOSTLY_IQ3_M,
|
||||
"IQ4_NL": llama_cpp.LLAMA_FTYPE_MOSTLY_IQ4_NL,
|
||||
"IQ4_XS": llama_cpp.LLAMA_FTYPE_MOSTLY_IQ4_XS,
|
||||
# Higher precision formats
|
||||
"F16": llama_cpp.LLAMA_FTYPE_MOSTLY_F16,
|
||||
"BF16": llama_cpp.LLAMA_FTYPE_MOSTLY_BF16,
|
||||
}
|
||||
|
||||
# Try direct lookup first
|
||||
if config_upper in type_mapping:
|
||||
return type_mapping[config_upper]
|
||||
|
||||
# Handle custom variants using base mapping
|
||||
for prefix, base_type in LlamaCppPythonAPI.VARIANT_BASE_MAPPING.items():
|
||||
if config_upper.startswith(prefix) and config_upper not in type_mapping:
|
||||
return type_mapping[base_type]
|
||||
|
||||
# If not found, raise an informative error
|
||||
supported = sorted(type_mapping.keys())
|
||||
msg = (
|
||||
f"Unsupported quantisation type: {config_name}\n"
|
||||
f"Supported types: {', '.join(supported)}\n"
|
||||
f"Custom variants like Q4_K_L, Q4_K_XL are also supported."
|
||||
)
|
||||
raise ValueError(msg)
|
||||
|
||||
@staticmethod
|
||||
def get_tensor_type_value(type_name: str) -> int:
|
||||
"""Convert tensor type name to llama_cpp constant.
|
||||
|
||||
Maps string tensor type names to their corresponding llama_cpp integer
|
||||
constants for tensor-specific overrides. Provides the foundation for
|
||||
differentiated quantisation strategies across embedding and output layers.
|
||||
|
||||
Returns:
|
||||
Integer value for the tensor type, or 0 if not found.
|
||||
"""
|
||||
if not LLAMA_CPP_AVAILABLE:
|
||||
return 0
|
||||
|
||||
# Build mapping with variant consolidation
|
||||
# All Q3_K variants map to base Q3_K type, same for Q4_K and Q5_K
|
||||
type_mapping = LlamaCppPythonAPI._build_tensor_type_mapping()
|
||||
return type_mapping.get(type_name.upper(), 0)
|
||||
|
||||
@staticmethod
|
||||
def _build_tensor_type_mapping() -> dict[str, int]:
|
||||
"""Build tensor type mapping with variant consolidation.
|
||||
|
||||
Returns:
|
||||
Dictionary mapping type names to GGML constants.
|
||||
"""
|
||||
if not LLAMA_CPP_AVAILABLE:
|
||||
return {}
|
||||
|
||||
# Base mappings
|
||||
return {
|
||||
# Q2 variants
|
||||
"Q2_K": llama_cpp.GGML_TYPE_Q2_K,
|
||||
# Q3 variants - all map to base Q3_K
|
||||
"Q3_K": llama_cpp.GGML_TYPE_Q3_K,
|
||||
"Q3_K_S": llama_cpp.GGML_TYPE_Q3_K,
|
||||
"Q3_K_M": llama_cpp.GGML_TYPE_Q3_K,
|
||||
"Q3_K_L": llama_cpp.GGML_TYPE_Q3_K,
|
||||
# Q4 variants
|
||||
"Q4_0": llama_cpp.GGML_TYPE_Q4_0,
|
||||
"Q4_1": llama_cpp.GGML_TYPE_Q4_1,
|
||||
"Q4_K": llama_cpp.GGML_TYPE_Q4_K,
|
||||
"Q4_K_S": llama_cpp.GGML_TYPE_Q4_K,
|
||||
"Q4_K_M": llama_cpp.GGML_TYPE_Q4_K,
|
||||
# Q5 variants
|
||||
"Q5_0": llama_cpp.GGML_TYPE_Q5_0,
|
||||
"Q5_1": llama_cpp.GGML_TYPE_Q5_1,
|
||||
"Q5_K": llama_cpp.GGML_TYPE_Q5_K,
|
||||
"Q5_K_S": llama_cpp.GGML_TYPE_Q5_K,
|
||||
"Q5_K_M": llama_cpp.GGML_TYPE_Q5_K,
|
||||
# Q6 variant
|
||||
"Q6_K": llama_cpp.GGML_TYPE_Q6_K,
|
||||
# Q8 variant
|
||||
"Q8_0": llama_cpp.GGML_TYPE_Q8_0,
|
||||
# Higher precision
|
||||
"F16": llama_cpp.GGML_TYPE_F16,
|
||||
"F32": llama_cpp.GGML_TYPE_F32,
|
||||
}
|
||||
|
||||
def quantise_model_flexible(
|
||||
self,
|
||||
input_path: Path,
|
||||
output_path: Path,
|
||||
base_type: str,
|
||||
embedding_type: str | None = None,
|
||||
output_type: str | None = None,
|
||||
imatrix_path: Path | None = None,
|
||||
) -> bool:
|
||||
"""Quantise model with flexible tensor type configuration.
|
||||
|
||||
Provides control over base quantisation type with optional overrides for
|
||||
embeddings and output layers, which are the only tensor-specific controls
|
||||
that work reliably with llama-cpp-python.
|
||||
|
||||
Args:
|
||||
input_path: Path to input GGUF model.
|
||||
output_path: Path for output quantised model.
|
||||
base_type: Base quantisation type (e.g., "Q4_K_M", "Q6_K").
|
||||
embedding_type: Override for token embeddings (None = use base).
|
||||
output_type: Override for output/lm_head layers (None = use base).
|
||||
imatrix_path: Optional importance matrix file.
|
||||
|
||||
Returns:
|
||||
True if quantisation successful, False otherwise.
|
||||
|
||||
Examples:
|
||||
# Q4_K_L: Q4_K_M base with Q8_0 embeddings
|
||||
api.quantise_model_flexible(
|
||||
input_path, output_path, "Q4_K_M",
|
||||
embedding_type="Q8_0"
|
||||
)
|
||||
|
||||
# Q3_K_L: Q3_K_M base with Q5_K output
|
||||
api.quantise_model_flexible(
|
||||
input_path, output_path, "Q3_K_M",
|
||||
output_type="Q5_K"
|
||||
)
|
||||
|
||||
# Q3_K_XL: Q3_K_M with both Q8_0 embeddings and Q5_K output
|
||||
api.quantise_model_flexible(
|
||||
input_path, output_path, "Q3_K_M",
|
||||
embedding_type="Q8_0",
|
||||
output_type="Q5_K"
|
||||
)
|
||||
|
||||
Raises:
|
||||
RuntimeError: If llama-cpp-python is not available.
|
||||
"""
|
||||
if not LLAMA_CPP_AVAILABLE:
|
||||
msg = "llama-cpp-python not available for quantisation"
|
||||
raise RuntimeError(msg)
|
||||
|
||||
logger.info(f"🔄 Flexible quantisation: {base_type} base")
|
||||
logger.info(f"📝 Input: {input_path}")
|
||||
logger.info(f"📝 Output: {output_path}")
|
||||
|
||||
# Setup phase - create and configure parameters
|
||||
params = self._create_params(base_type, imatrix_path)
|
||||
self._apply_tensor_overrides(params, embedding_type, output_type)
|
||||
|
||||
# Execution phase - perform quantisation
|
||||
try:
|
||||
logger.debug("DEBUG: Starting flexible quantisation execution")
|
||||
result = self._do_quantisation(input_path, output_path, params)
|
||||
logger.debug(f"DEBUG: Flexible quantisation returned: {result}")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"❌ Flexible quantisation failed with exception: {e}")
|
||||
logger.error("Flexible quantisation traceback:")
|
||||
for line in traceback.format_exc().splitlines():
|
||||
logger.error(f" {line}")
|
||||
return False
|
||||
else:
|
||||
if result == 0:
|
||||
# Verify output file was created and is valid
|
||||
if not output_path.exists():
|
||||
logger.error(
|
||||
f"❌ Quantisation claimed success but output does not exist: {output_path}"
|
||||
)
|
||||
return False
|
||||
|
||||
try:
|
||||
output_size = output_path.stat().st_size
|
||||
logger.debug(f"DEBUG: Output file size: {output_size / (1024**3):.2f} GB")
|
||||
|
||||
if output_size == 0:
|
||||
logger.error("❌ Output file is empty despite success code")
|
||||
return False
|
||||
except Exception as e:
|
||||
logger.warning(f"⚠️ Could not check output file size: {e}")
|
||||
|
||||
logger.info(f"✅ Quantisation successful: {output_path.name}")
|
||||
return True
|
||||
logger.error(f"❌ Quantisation failed with code: {result}")
|
||||
return False
|
||||
|
||||
def _create_params(
|
||||
self, base_type: str, imatrix_path: Path | None
|
||||
) -> llama_model_quantize_params:
|
||||
"""Create quantisation parameters.
|
||||
|
||||
Returns:
|
||||
Configured quantisation parameters.
|
||||
"""
|
||||
params = llama_model_quantize_params()
|
||||
params.ftype = self.get_quantisation_type(base_type)
|
||||
params.nthread = 8
|
||||
params.allow_requantize = True
|
||||
|
||||
if imatrix_path and imatrix_path.exists():
|
||||
# Convert path to bytes and create c_char_p, then cast to c_void_p
|
||||
imatrix_bytes = str(imatrix_path).encode("utf-8")
|
||||
char_p = ctypes.c_char_p(imatrix_bytes)
|
||||
params.imatrix = ctypes.cast(char_p, ctypes.c_void_p)
|
||||
logger.info(f"🧮 Using imatrix: {imatrix_path.name}")
|
||||
|
||||
return params
|
||||
|
||||
def _apply_tensor_overrides(
|
||||
self,
|
||||
params: llama_model_quantize_params,
|
||||
embedding_type: str | None,
|
||||
output_type: str | None,
|
||||
) -> None:
|
||||
"""Apply embedding and output tensor type overrides to params.
|
||||
|
||||
These are the only tensor-specific controls that work reliably
|
||||
with llama-cpp-python.
|
||||
"""
|
||||
# Apply embedding override if specified
|
||||
if embedding_type:
|
||||
params.token_embedding_type = self.get_tensor_type_value(embedding_type)
|
||||
logger.info(f"⚙️ Token embedding type: {embedding_type}")
|
||||
|
||||
# Apply output override if specified
|
||||
if output_type:
|
||||
params.output_tensor_type = self.get_tensor_type_value(output_type)
|
||||
params.quantize_output_tensor = True
|
||||
logger.info(f"⚙️ Output tensor type: {output_type}")
|
||||
|
||||
def _do_quantisation(
|
||||
self,
|
||||
input_path: Path,
|
||||
output_path: Path,
|
||||
params: llama_model_quantize_params,
|
||||
) -> int:
|
||||
"""Perform the quantisation operation.
|
||||
|
||||
Returns:
|
||||
Return code (0 for success).
|
||||
|
||||
Raises:
|
||||
KeyboardInterrupt: If the user interrupts the quantisation process.
|
||||
SystemExit: If the system exits during quantisation.
|
||||
"""
|
||||
logger.debug("DEBUG: Calling llama_cpp.llama_model_quantize")
|
||||
try:
|
||||
# Flush any pending output before calling C library
|
||||
|
||||
sys.stdout.flush()
|
||||
sys.stderr.flush()
|
||||
|
||||
# Temporarily redirect stderr to prevent terminal control issues
|
||||
# Some GGUF models output control sequences that can break the terminal
|
||||
old_stderr_fd = None
|
||||
devnull_fd = None
|
||||
|
||||
try:
|
||||
# Only redirect if not in debug mode to preserve error messages
|
||||
if not logger.isEnabledFor(logging.DEBUG):
|
||||
old_stderr_fd = os.dup(2) # Save current stderr
|
||||
devnull_fd = os.open(os.devnull, os.O_WRONLY)
|
||||
os.dup2(devnull_fd, 2) # Redirect stderr to /dev/null
|
||||
|
||||
# Call the quantization with proper exception handling
|
||||
result = llama_cpp.llama_model_quantize(
|
||||
str(input_path).encode("utf-8"), str(output_path).encode("utf-8"), params
|
||||
)
|
||||
|
||||
finally:
|
||||
# Restore stderr if we redirected it
|
||||
if old_stderr_fd is not None:
|
||||
os.dup2(old_stderr_fd, 2)
|
||||
os.close(old_stderr_fd)
|
||||
if devnull_fd is not None:
|
||||
os.close(devnull_fd)
|
||||
|
||||
# Flush output after the call
|
||||
sys.stdout.flush()
|
||||
sys.stderr.flush()
|
||||
except KeyboardInterrupt:
|
||||
logger.error("❌ Quantisation interrupted by user")
|
||||
raise
|
||||
except SystemExit as e:
|
||||
logger.error(f"❌ System exit during quantisation: {e}")
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"❌ llama_model_quantize call failed: {e}")
|
||||
logger.error("llama_model_quantize call traceback:")
|
||||
for line in traceback.format_exc().splitlines():
|
||||
logger.error(f" {line}")
|
||||
raise
|
||||
else:
|
||||
logger.debug(f"DEBUG: llama_model_quantize completed with code: {result}")
|
||||
return result
|
||||
|
||||
def quantise_model(
|
||||
self,
|
||||
input_path: Path,
|
||||
output_path: Path,
|
||||
config: QuantisationConfig,
|
||||
imatrix_path: Path | None = None,
|
||||
) -> bool:
|
||||
"""Quantise model using Python API.
|
||||
|
||||
Performs quantisation using llama-cpp-python's direct API access with
|
||||
support for embedding and output tensor type overrides. The L and XL
|
||||
variants use a base type with specific overrides.
|
||||
|
||||
Returns:
|
||||
True if quantisation successful, False otherwise.
|
||||
|
||||
Raises:
|
||||
RuntimeError: If llama-cpp-python is not available.
|
||||
"""
|
||||
if not LLAMA_CPP_AVAILABLE:
|
||||
msg = "llama-cpp-python not available for quantisation"
|
||||
raise RuntimeError(msg)
|
||||
|
||||
# Force cleanup before starting
|
||||
gc.collect()
|
||||
|
||||
# Log initial resource state
|
||||
mem_before = self._log_resource_state("before")
|
||||
|
||||
try:
|
||||
# Validate input
|
||||
if not self._validate_input_file(input_path):
|
||||
return False
|
||||
# Setup parameters
|
||||
params = self._setup_quantisation_params(config, imatrix_path)
|
||||
if params is None:
|
||||
return False
|
||||
# Execute quantisation
|
||||
result = self._execute_quantisation(input_path, output_path, params)
|
||||
# Verify and finalize
|
||||
if result == 0:
|
||||
return self._finalize_successful_quantisation(output_path, mem_before)
|
||||
|
||||
logger.error(f"❌ Quantisation failed with code: {result}")
|
||||
except Exception as e:
|
||||
logger.error(f"❌ Quantisation failed with exception: {e}")
|
||||
logger.error("Full quantisation traceback:")
|
||||
for line in traceback.format_exc().splitlines():
|
||||
logger.error(f" {line}")
|
||||
# Garbage collect and return false
|
||||
gc.collect()
|
||||
return False
|
||||
|
||||
def _log_resource_state(self, phase: str) -> float:
|
||||
"""Log current resource usage state.
|
||||
|
||||
Args:
|
||||
phase: Description of current phase (e.g., "before", "after").
|
||||
|
||||
Returns:
|
||||
Current memory usage in GB.
|
||||
"""
|
||||
process = psutil.Process()
|
||||
memory_gb = process.memory_info().rss / (1024**3)
|
||||
logger.debug(f"DEBUG: Memory {phase} quantisation: {memory_gb:.2f} GB")
|
||||
logger.debug(f"DEBUG: Open file descriptors: {len(process.open_files())}")
|
||||
if phase == "before":
|
||||
logger.debug(f"DEBUG: Process PID: {process.pid}")
|
||||
return memory_gb
|
||||
|
||||
def _validate_input_file(self, input_path: Path) -> bool:
|
||||
"""Validate input file exists and is readable.
|
||||
|
||||
Args:
|
||||
input_path: Path to input file.
|
||||
|
||||
Returns:
|
||||
True if file is valid, False otherwise.
|
||||
"""
|
||||
logger.debug(f"DEBUG: Starting quantisation of {input_path.name}")
|
||||
logger.info(f"🔄 Quantising {input_path.name}...")
|
||||
logger.debug(f"DEBUG: Input: {input_path}")
|
||||
|
||||
if not input_path.exists():
|
||||
logger.error(f"❌ Input file does not exist: {input_path}")
|
||||
return False
|
||||
|
||||
if not input_path.is_file():
|
||||
logger.error(f"❌ Input path is not a file: {input_path}")
|
||||
return False
|
||||
|
||||
try:
|
||||
input_size = input_path.stat().st_size
|
||||
logger.debug(f"DEBUG: Input file size: {input_size / (1024**3):.2f} GB")
|
||||
if input_size == 0:
|
||||
logger.error("❌ Input file is empty")
|
||||
return False
|
||||
except Exception as e:
|
||||
logger.warning(f"⚠️ Could not check input file size: {e}")
|
||||
|
||||
return True
|
||||
|
||||
def _setup_quantisation_params(
|
||||
self,
|
||||
config: QuantisationConfig,
|
||||
imatrix_path: Path | None,
|
||||
) -> llama_model_quantize_params | None:
|
||||
"""Setup quantisation parameters.
|
||||
|
||||
Args:
|
||||
config: Quantisation configuration.
|
||||
imatrix_path: Optional path to importance matrix.
|
||||
|
||||
Returns:
|
||||
Configured parameters or None if setup failed.
|
||||
"""
|
||||
logger.debug("DEBUG: Setting up quantisation parameters")
|
||||
params = llama_model_quantize_params()
|
||||
|
||||
# Set base quantisation type
|
||||
try:
|
||||
params.ftype = self.get_quantisation_type(config.base_type)
|
||||
logger.debug(
|
||||
f"DEBUG: Set ftype to {params.ftype} for {config.base_type} (config: {config.name})"
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"❌ Failed to get quantisation type for {config.name}: {e}")
|
||||
return None
|
||||
|
||||
# Configure basic parameters
|
||||
params.nthread = 8
|
||||
params.allow_requantize = True
|
||||
logger.debug(
|
||||
f"DEBUG: Set nthread={params.nthread}, allow_requantize={params.allow_requantize}"
|
||||
)
|
||||
|
||||
# Add imatrix if available
|
||||
if imatrix_path and imatrix_path.exists():
|
||||
try:
|
||||
# Convert path to bytes and create c_char_p, then cast to c_void_p
|
||||
imatrix_bytes = str(imatrix_path).encode("utf-8")
|
||||
char_p = ctypes.c_char_p(imatrix_bytes)
|
||||
params.imatrix = ctypes.cast(char_p, ctypes.c_void_p)
|
||||
logger.info(f"🧮 Using imatrix: {imatrix_path.name}")
|
||||
logger.debug(f"DEBUG: imatrix path set: {imatrix_path}")
|
||||
except Exception as e:
|
||||
logger.error(f"❌ Failed to set imatrix: {e}")
|
||||
# Continue without imatrix
|
||||
|
||||
# Configure tensor-specific types
|
||||
logger.debug("DEBUG: Configuring tensor-specific types")
|
||||
try:
|
||||
self._configure_tensor_types(params, config)
|
||||
logger.debug("DEBUG: Tensor types configured successfully")
|
||||
except Exception as e:
|
||||
logger.error(f"❌ Failed to configure tensor types: {e}")
|
||||
logger.error("Tensor type configuration traceback:")
|
||||
for line in traceback.format_exc().splitlines():
|
||||
logger.error(f" {line}")
|
||||
# Continue with default types
|
||||
|
||||
return params
|
||||
|
||||
def _execute_quantisation(
|
||||
self,
|
||||
input_path: Path,
|
||||
output_path: Path,
|
||||
params: llama_model_quantize_params,
|
||||
) -> int:
|
||||
"""Execute the actual quantisation with signal handling.
|
||||
|
||||
Args:
|
||||
input_path: Path to input model.
|
||||
output_path: Path for output model.
|
||||
params: Configured quantisation parameters.
|
||||
|
||||
Returns:
|
||||
Return code from quantisation (0 for success).
|
||||
"""
|
||||
logger.debug("DEBUG: Starting llama_cpp.llama_model_quantize call")
|
||||
logger.debug("DEBUG: About to call llama_model_quantize...")
|
||||
|
||||
# Setup signal handlers
|
||||
old_handlers = self._setup_signal_handlers()
|
||||
|
||||
try:
|
||||
result = llama_cpp.llama_model_quantize(
|
||||
str(input_path).encode("utf-8"), str(output_path).encode("utf-8"), params
|
||||
)
|
||||
logger.debug(f"DEBUG: llama_model_quantize returned: {result}")
|
||||
except Exception as e:
|
||||
logger.error(f"❌ llama_model_quantize raised exception: {e}")
|
||||
logger.error("llama_model_quantize traceback:")
|
||||
for line in traceback.format_exc().splitlines():
|
||||
logger.error(f" {line}")
|
||||
return -1
|
||||
else:
|
||||
return result
|
||||
finally:
|
||||
self._restore_signal_handlers(old_handlers)
|
||||
|
||||
def _setup_signal_handlers(self) -> tuple[Any, Any | None]:
|
||||
"""Setup signal handlers for debugging termination.
|
||||
|
||||
Returns:
|
||||
Tuple of (old_sigterm, old_sigsegv) handlers.
|
||||
"""
|
||||
|
||||
def signal_debug_handler(signum: int, frame: object) -> Never: # noqa: ARG001
|
||||
logger.error(f"DEBUG: Received signal {signum} during quantisation!")
|
||||
logger.error(f"DEBUG: Signal name: {signal.Signals(signum).name}")
|
||||
msg = f"Signal {signum} received"
|
||||
raise KeyboardInterrupt(msg)
|
||||
|
||||
old_sigterm = signal.signal(signal.SIGTERM, signal_debug_handler)
|
||||
old_sigsegv = (
|
||||
signal.signal(signal.SIGSEGV, signal_debug_handler)
|
||||
if hasattr(signal, "SIGSEGV")
|
||||
else None
|
||||
)
|
||||
return old_sigterm, old_sigsegv
|
||||
|
||||
    def _restore_signal_handlers(self, handlers: tuple[Any, Any | None]) -> None:
        """Restore original signal handlers.

        Args:
            handlers: Tuple of (old_sigterm, old_sigsegv) handlers.
        """
        old_sigterm, old_sigsegv = handlers
        signal.signal(signal.SIGTERM, old_sigterm)
        if old_sigsegv is not None:
            signal.signal(signal.SIGSEGV, old_sigsegv)

    def _finalize_successful_quantisation(
        self,
        output_path: Path,
        mem_before: float,
    ) -> bool:
        """Finalize successful quantisation and verify output.

        Args:
            output_path: Path to output file.
            mem_before: Memory usage before quantisation in GB.

        Returns:
            True if output is valid, False otherwise.
        """
        logger.debug("DEBUG: Quantisation returned success code")

        # Verify output exists
        if not output_path.exists():
            logger.error(
                f"❌ Quantisation claimed success but output does not exist: {output_path}"
            )
            return False

        # Verify output size
        output_size = output_path.stat().st_size
        logger.debug(f"DEBUG: Output file size: {output_size / (1024**3):.2f} GB")

        if output_size == 0:
            logger.error("❌ Output file is empty despite success code")
            return False

        logger.info(f"✅ Quantisation successful: {output_path.name}")

        # Force cleanup and log final state
        gc.collect()
        mem_after = self._log_resource_state("after")
        logger.debug(f"DEBUG: Memory delta: {mem_after - mem_before:+.2f} GB")

        return True

    def _configure_tensor_types(
        self, params: llama_model_quantize_params, config: QuantisationConfig
    ) -> None:
        """Configure tensor-specific quantisation types.

        Sets embedding and output tensor type overrides based on config.
        These are the only tensor-specific controls that work reliably
        with llama-cpp-python.
        """
        logger.debug(f"DEBUG: _configure_tensor_types called for {config.name}")

        # Apply embedding override if specified
        if config.embedding_type:
            params.token_embedding_type = self.get_tensor_type_value(config.embedding_type)
            logger.info(f"⚙️ Token embedding type: {config.embedding_type}")

        # Apply output override if specified
        if config.output_type:
            params.output_tensor_type = self.get_tensor_type_value(config.output_type)
            params.quantize_output_tensor = True
            logger.info(f"⚙️ Output tensor type: {config.output_type}")

    def convert_hf_to_gguf(
        self, input_dir: Path, output_path: Path, output_type: str = "f16"
    ) -> bool:
        """Convert HuggingFace model to GGUF format using native Python converter.

        Uses our GGUFConverter for SafeTensors models, providing full Python-based
        conversion without external dependencies.

        Returns:
            True if conversion successful, False otherwise.
        """
        logger.info(f"🔄 Converting {input_dir.name} to GGUF format...")
        logger.info(f"📝 Input: {input_dir}")
        logger.info(f"📝 Output: {output_path}")
        logger.info(f"📝 Type: {output_type}")

        # Check for SafeTensors files
        safetensor_files = list(input_dir.glob("*.safetensors"))
        if not safetensor_files:
            logger.warning("⚠️ No SafeTensors files found in model directory")
            return False

        try:
            # Load model configuration
            config_parser = ConfigParser()
            model_config = config_parser.load_model_config(input_dir)

            # Get architecture mapping
            arch_name = model_config.architectures[0] if model_config.architectures else "llama"
            arch = config_parser.get_architecture_mapping(arch_name)

            if arch != arch_name:
                logger.info(f"📝 Architecture mapping: {arch_name} → {arch}")

            # Convert using GGUFConverter
            tensor_mapper = TensorMapper()
            success = GGUFConverter.convert_safetensors(
                input_dir, output_path, model_config, arch, tensor_mapper
            )
        except Exception as e:
            logger.error(f"❌ Conversion failed with exception: {e}")
            return False
        else:
            if success:
                logger.info("✅ Native Python conversion successful")
            return success
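For context on what _configure_tensor_types drives underneath, here is a minimal sketch of the equivalent low-level llama-cpp-python calls. The file paths and the particular type choices are illustrative assumptions, not taken from this repository; only the parameter fields mirror what the method above sets.

# Rough sketch only - assumes llama-cpp-python's low-level bindings; paths and
# quantisation choices are placeholders.
import ctypes

import llama_cpp

params = llama_cpp.llama_model_quantize_default_params()
params.ftype = llama_cpp.LLAMA_FTYPE_MOSTLY_Q4_K_M       # assumed base type
params.token_embedding_type = llama_cpp.GGML_TYPE_Q8_0   # embedding override
params.output_tensor_type = llama_cpp.GGML_TYPE_Q6_K     # output override
params.quantize_output_tensor = True                     # actually apply the output override

ret = llama_cpp.llama_model_quantize(
    b"model-f16.gguf",      # hypothetical input path
    b"model-Q4_K_L.gguf",   # hypothetical output path
    ctypes.byref(params),
)
print("quantisation ok" if ret == 0 else f"failed with code {ret}")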
@ -1,618 +0,0 @@
|
|||
"""Quantisation orchestration service.
|
||||
|
||||
High-level orchestration of the complete quantisation workflow from model
|
||||
acquisition through processing to upload. Manages parallel processing,
|
||||
status tracking, and cleanup operations for efficient resource utilisation.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import gc
|
||||
import signal
|
||||
import sys
|
||||
import traceback
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import psutil
|
||||
|
||||
from helpers.config.quantisation_configs import (
|
||||
DEFAULT_QUANTISATION_TYPES,
|
||||
QUANTISATION_CONFIGS,
|
||||
SUPPORTED_QUANTISATION_TYPES,
|
||||
)
|
||||
from helpers.logger import logger
|
||||
from helpers.models.quantisation import (
|
||||
ModelSource,
|
||||
QuantisationContext,
|
||||
QuantisationResult,
|
||||
QuantisationType,
|
||||
)
|
||||
from helpers.services.huggingface import ReadmeGenerator
|
||||
from helpers.services.llama_cpp import IMatrixManager
|
||||
from helpers.services.quantisation import HuggingFaceUploader, ModelManager, QuantisationEngine
|
||||
from helpers.utils.tensor_mapping import URLParser
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from types import FrameType
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class QuantisationOrchestrator:
|
||||
"""Orchestrates the complete quantisation workflow.
|
||||
|
||||
Uses dataclass with slots for efficient memory usage and dependency injection
|
||||
for modular service interaction following SOLID principles.
|
||||
"""
|
||||
|
||||
work_dir: Path = field(default_factory=lambda: Path.cwd() / "quantisation_work")
|
||||
use_imatrix: bool = True
|
||||
no_upload: bool = False
|
||||
custom_profiles: list[str] | None = None
|
||||
|
||||
# Service dependencies with factory defaults
|
||||
url_parser: URLParser = field(default_factory=URLParser)
|
||||
quantisation_engine: QuantisationEngine = field(default_factory=QuantisationEngine)
|
||||
imatrix_manager: IMatrixManager = field(default_factory=IMatrixManager)
|
||||
readme_generator: ReadmeGenerator = field(default_factory=ReadmeGenerator)
|
||||
uploader: HuggingFaceUploader = field(default_factory=HuggingFaceUploader)
|
||||
|
||||
# Computed properties
|
||||
models_dir: Path = field(init=False)
|
||||
model_manager: ModelManager = field(init=False)
|
||||
|
||||
def __post_init__(self) -> None:
|
||||
"""Initialise computed properties after dataclass construction."""
|
||||
self.models_dir = self.work_dir / "models"
|
||||
self.model_manager = ModelManager(self.models_dir)
|
||||
|
||||
# Set up signal handlers for graceful exit tracking
|
||||
self._setup_signal_handlers()
|
||||
|
||||
def _setup_signal_handlers(self) -> None:
|
||||
"""Set up signal handlers to catch unexpected exits."""
|
||||
|
||||
def signal_handler(signum: int, frame: FrameType | None) -> None:
|
||||
logger.error(f"❌ Received signal {signum} ({signal.Signals(signum).name})")
|
||||
logger.error("Stack trace at signal:")
|
||||
if frame:
|
||||
for line in traceback.format_stack(frame):
|
||||
logger.error(f" {line.strip()}")
|
||||
logger.error("Exiting due to signal")
|
||||
sys.exit(1)
|
||||
|
||||
# Handle common termination signals
|
||||
for sig in [signal.SIGINT, signal.SIGTERM]:
|
||||
signal.signal(sig, signal_handler)
|
||||
|
||||
def get_quantisation_types(self) -> list[QuantisationType]:
|
||||
"""Get the quantisation types to use for this run.
|
||||
|
||||
Returns:
|
||||
List of QuantisationType enums to process.
|
||||
"""
|
||||
if self.custom_profiles:
|
||||
# Parse custom profiles from strings to QuantisationType
|
||||
result = []
|
||||
for profile_str in self.custom_profiles:
|
||||
try:
|
||||
profile = QuantisationType(profile_str.upper())
|
||||
if profile in SUPPORTED_QUANTISATION_TYPES:
|
||||
result.append(profile)
|
||||
else:
|
||||
logger.warning(f"Profile {profile_str} is not supported, skipping")
|
||||
except ValueError:
|
||||
logger.warning(f"Invalid profile {profile_str}, skipping")
|
||||
return result or DEFAULT_QUANTISATION_TYPES
|
||||
return DEFAULT_QUANTISATION_TYPES
|
||||
|
||||
def quantise(self, url: str) -> dict[QuantisationType, QuantisationResult]:
|
||||
"""Main quantisation workflow orchestrating model processing from URL to upload.
|
||||
|
||||
Returns:
|
||||
dict[QuantisationType, QuantisationResult]: Quantisation results for each type.
|
||||
|
||||
Raises:
|
||||
KeyboardInterrupt: If the user interrupts the quantisation process.
|
||||
"""
|
||||
logger.info("Starting Bartowski quantisation process...")
|
||||
logger.debug(f"DEBUG: Input URL: {url}")
|
||||
logger.debug(f"DEBUG: Working directory: {self.work_dir}")
|
||||
logger.debug(f"DEBUG: Use imatrix: {self.use_imatrix}")
|
||||
logger.debug(f"DEBUG: No upload: {self.no_upload}")
|
||||
logger.debug(f"DEBUG: Custom profiles: {self.custom_profiles}")
|
||||
|
||||
try:
|
||||
# Setup and preparation
|
||||
logger.debug("DEBUG: Starting environment setup...")
|
||||
model_source, f16_model_path, imatrix_path, output_repo = self._setup_environment(url)
|
||||
logger.debug(f"DEBUG: Environment setup complete. F16 model: {f16_model_path}")
|
||||
|
||||
# Create initial repository
|
||||
logger.debug("DEBUG: Creating initial repository...")
|
||||
self._create_initial_repository(model_source, output_repo)
|
||||
logger.debug("DEBUG: Initial repository created")
|
||||
|
||||
# Execute all quantisations
|
||||
logger.debug("DEBUG: Starting quantisation execution...")
|
||||
results = self._execute_quantisations(
|
||||
model_source, f16_model_path, imatrix_path, output_repo
|
||||
)
|
||||
logger.debug(f"DEBUG: Quantisation execution complete. Results: {len(results)} items")
|
||||
|
||||
# Cleanup
|
||||
logger.debug("DEBUG: Starting cleanup...")
|
||||
self._cleanup_files(f16_model_path, model_source)
|
||||
logger.debug("DEBUG: Cleanup complete")
|
||||
|
||||
self._print_completion_summary(model_source, results, output_repo)
|
||||
except KeyboardInterrupt:
|
||||
logger.error("❌ Process interrupted by user (Ctrl+C)")
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"❌ Critical error in quantisation workflow: {e}")
|
||||
logger.error("Full traceback:")
|
||||
for line in traceback.format_exc().splitlines():
|
||||
logger.error(f" {line}")
|
||||
raise
|
||||
else:
|
||||
return results
|
||||
|
||||
def _setup_environment(self, url: str) -> tuple[ModelSource, Path, Path | None, str]:
|
||||
"""Setup environment and prepare model for quantisation.
|
||||
|
||||
Returns:
|
||||
Tuple of (model_source, f16_model_path, imatrix_path, output_repo).
|
||||
"""
|
||||
model_source = self.url_parser.parse(url)
|
||||
self._print_model_info(model_source)
|
||||
|
||||
self.models_dir.mkdir(parents=True, exist_ok=True)
|
||||
f16_model_path = self.model_manager.prepare_model(model_source)
|
||||
|
||||
imatrix_path = None
|
||||
if self.use_imatrix:
|
||||
logger.info("Checking for importance matrix (imatrix)...")
|
||||
imatrix_path = self.imatrix_manager.find_imatrix(
|
||||
self.models_dir / model_source.model_name
|
||||
)
|
||||
|
||||
output_repo = (
|
||||
f"{self.uploader.get_username()}/"
|
||||
f"{model_source.original_author}-{model_source.model_name}-GGUF"
|
||||
)
|
||||
|
||||
return model_source, f16_model_path, imatrix_path, output_repo
|
||||
|
||||
def _create_initial_repository(self, model_source: ModelSource, output_repo: str) -> None:
|
||||
"""Create initial repository with planned quantisations."""
|
||||
logger.info("Creating initial README with planned quantisations...")
|
||||
quantisation_types = self.get_quantisation_types()
|
||||
planned_results = {
|
||||
qt: QuantisationResult(quantisation_type=qt, success=False, status="planned")
|
||||
for qt in quantisation_types
|
||||
}
|
||||
readme_path = self.readme_generator.generate(
|
||||
model_source, planned_results, self.models_dir, output_repo
|
||||
)
|
||||
|
||||
if not self.no_upload:
|
||||
logger.info("Creating repository with planned quantisations...")
|
||||
self.uploader.upload_readme(output_repo, readme_path)
|
||||
else:
|
||||
logger.info("Skipping repository creation (--no-upload specified)")
|
||||
|
||||
def _execute_quantisations(
|
||||
self,
|
||||
model_source: ModelSource,
|
||||
f16_model_path: Path,
|
||||
imatrix_path: Path | None,
|
||||
output_repo: str,
|
||||
) -> dict[QuantisationType, QuantisationResult]:
|
||||
"""Execute all quantisation types with parallel uploads.
|
||||
|
||||
Returns:
|
||||
dict[QuantisationType, QuantisationResult]: Quantisation results for each type.
|
||||
"""
|
||||
results: dict[QuantisationType, QuantisationResult] = {}
|
||||
|
||||
quantisation_types = self.get_quantisation_types()
|
||||
types_list = [qt.value for qt in quantisation_types]
|
||||
logger.info(f"Processing {len(quantisation_types)} quantisation types: {types_list}")
|
||||
|
||||
# Process with parallel uploads - quantise sequentially but upload in background
|
||||
upload_futures = []
|
||||
with ThreadPoolExecutor(max_workers=2, thread_name_prefix="upload") as upload_executor:
|
||||
for i, quant_type in enumerate(quantisation_types, 1):
|
||||
logger.info(
|
||||
f"Processing quantisation {i}/{len(quantisation_types)}: {quant_type.value}"
|
||||
)
|
||||
logger.debug(f"DEBUG: Starting quantisation {i}/{len(quantisation_types)}")
|
||||
logger.debug(f"DEBUG: Current type: {quant_type.value}")
|
||||
logger.debug(f"DEBUG: Results so far: {len(results)} completed")
|
||||
|
||||
try:
|
||||
result = self._process_single_quantisation(
|
||||
quant_type,
|
||||
model_source,
|
||||
f16_model_path,
|
||||
imatrix_path,
|
||||
output_repo,
|
||||
results,
|
||||
upload_executor,
|
||||
upload_futures,
|
||||
)
|
||||
results[quant_type] = result
|
||||
logger.debug(f"DEBUG: Quantisation {quant_type.value} completed")
|
||||
|
||||
# Force cleanup between quantisations
|
||||
gc.collect()
|
||||
logger.debug("DEBUG: Garbage collection completed")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"❌ Critical error processing {quant_type.value}: {e}")
|
||||
logger.error("Exception traceback:")
|
||||
for line in traceback.format_exc().splitlines():
|
||||
logger.error(f" {line}")
|
||||
results[quant_type] = QuantisationResult(
|
||||
quantisation_type=quant_type,
|
||||
success=False,
|
||||
status="failed",
|
||||
error_message=str(e),
|
||||
)
|
||||
|
||||
# Force cleanup after error
|
||||
gc.collect()
|
||||
|
||||
# Wait for all uploads to complete before returning
|
||||
self._wait_for_uploads(upload_futures)
|
||||
|
||||
return results
|
||||
|
||||
def _process_single_quantisation(
|
||||
self,
|
||||
quant_type: QuantisationType,
|
||||
model_source: ModelSource,
|
||||
f16_model_path: Path,
|
||||
imatrix_path: Path | None,
|
||||
output_repo: str,
|
||||
results: dict[QuantisationType, QuantisationResult],
|
||||
upload_executor: ThreadPoolExecutor,
|
||||
upload_futures: list,
|
||||
) -> QuantisationResult:
|
||||
"""Process a single quantisation type.
|
||||
|
||||
Returns:
|
||||
QuantisationResult: Result of the quantisation attempt.
|
||||
"""
|
||||
try:
|
||||
logger.info(f"Starting {quant_type.value} quantisation...")
|
||||
logger.debug(f"DEBUG: Getting config for {quant_type.value}")
|
||||
config = QUANTISATION_CONFIGS[quant_type]
|
||||
logger.debug(f"DEBUG: Config loaded: {config.name}")
|
||||
|
||||
# Update status to processing
|
||||
logger.debug("DEBUG: Creating initial quantisation result")
|
||||
result = QuantisationResult(quantisation_type=quant_type, success=False)
|
||||
result.status = "processing"
|
||||
results[quant_type] = result
|
||||
|
||||
logger.debug("DEBUG: Updating README status")
|
||||
self._update_readme_status(model_source, results, output_repo)
|
||||
|
||||
# Perform quantisation
|
||||
logger.debug("DEBUG: Creating quantisation context")
|
||||
context = QuantisationContext(
|
||||
f16_model_path=f16_model_path,
|
||||
model_source=model_source,
|
||||
config=config,
|
||||
models_dir=self.models_dir,
|
||||
imatrix_path=imatrix_path,
|
||||
)
|
||||
logger.debug(f"DEBUG: Context created. F16 path: {f16_model_path}")
|
||||
logger.debug(f"DEBUG: imatrix path: {imatrix_path}")
|
||||
logger.debug("DEBUG: Calling quantisation engine...")
|
||||
result = self.quantisation_engine.quantise(context)
|
||||
logger.debug(f"DEBUG: Quantisation engine returned: success={result.success}")
|
||||
|
||||
self._handle_quantisation_result(
|
||||
result,
|
||||
quant_type,
|
||||
model_source,
|
||||
results,
|
||||
output_repo,
|
||||
upload_executor,
|
||||
upload_futures,
|
||||
)
|
||||
except Exception as e:
|
||||
return self._handle_quantisation_error(
|
||||
e, quant_type, model_source, results, output_repo
|
||||
)
|
||||
else:
|
||||
return result
|
||||
|
||||
def _process_single_quantisation_sequential(
|
||||
self,
|
||||
quant_type: QuantisationType,
|
||||
model_source: ModelSource,
|
||||
f16_model_path: Path,
|
||||
imatrix_path: Path | None,
|
||||
output_repo: str,
|
||||
results: dict[QuantisationType, QuantisationResult],
|
||||
) -> QuantisationResult:
|
||||
"""Process a single quantisation type sequentially with immediate upload.
|
||||
|
||||
Returns:
|
||||
QuantisationResult: Result of the quantisation attempt.
|
||||
"""
|
||||
# Force cleanup before starting new quantisation
|
||||
gc.collect()
|
||||
|
||||
# Log system state before quantisation
|
||||
process = psutil.Process()
|
||||
logger.debug(f"DEBUG: === System state before {quant_type.value} ===")
|
||||
logger.debug(f"DEBUG: Process alive: {process.is_running()}")
|
||||
logger.debug(f"DEBUG: PID: {process.pid}")
|
||||
logger.debug(f"DEBUG: Memory: {process.memory_info().rss / (1024**3):.2f} GB")
|
||||
logger.debug(f"DEBUG: CPU percent: {process.cpu_percent()}%")
|
||||
logger.debug(f"DEBUG: Threads: {process.num_threads()}")
|
||||
logger.debug(f"DEBUG: Open files: {len(process.open_files())}")
|
||||
|
||||
try:
|
||||
logger.info(f"Starting {quant_type.value} quantisation...")
|
||||
logger.debug(f"DEBUG: Getting config for {quant_type.value}")
|
||||
config = QUANTISATION_CONFIGS[quant_type]
|
||||
logger.debug(f"DEBUG: Config loaded: {config.name}")
|
||||
|
||||
# Update status to processing
|
||||
logger.debug("DEBUG: Creating initial quantisation result")
|
||||
result = QuantisationResult(quantisation_type=quant_type, success=False)
|
||||
result.status = "processing"
|
||||
results[quant_type] = result
|
||||
|
||||
logger.debug("DEBUG: Updating README status")
|
||||
self._update_readme_status(model_source, results, output_repo)
|
||||
|
||||
# Perform quantisation
|
||||
logger.debug("DEBUG: Creating quantisation context")
|
||||
context = QuantisationContext(
|
||||
f16_model_path=f16_model_path,
|
||||
model_source=model_source,
|
||||
config=config,
|
||||
models_dir=self.models_dir,
|
||||
imatrix_path=imatrix_path,
|
||||
)
|
||||
logger.debug(f"DEBUG: Context created. F16 path: {f16_model_path}")
|
||||
logger.debug(f"DEBUG: imatrix path: {imatrix_path}")
|
||||
logger.debug("DEBUG: Calling quantisation engine...")
|
||||
result = self.quantisation_engine.quantise(context)
|
||||
logger.debug(f"DEBUG: Quantisation engine returned: success={result.success}")
|
||||
|
||||
if result.success and result.file_path:
|
||||
# Upload immediately (if not in no-upload mode)
|
||||
if not self.no_upload:
|
||||
logger.info(f"Uploading {quant_type.value}...")
|
||||
try:
|
||||
self.uploader.upload_model_file(output_repo, result.file_path)
|
||||
logger.info(f"Upload of {quant_type.value} completed successfully")
|
||||
|
||||
# Clean up file after successful upload
|
||||
logger.info(f"Removing {result.file_path.name} to save disk space...")
|
||||
result.file_path.unlink()
|
||||
|
||||
result.status = "completed"
|
||||
self._update_readme_status(model_source, results, output_repo)
|
||||
except Exception as upload_error:
|
||||
logger.error(f"Failed to upload {quant_type.value}: {upload_error}")
|
||||
result.status = "failed"
|
||||
result.error_message = str(upload_error)
|
||||
self._update_readme_status(model_source, results, output_repo)
|
||||
# Keep file if upload failed
|
||||
else:
|
||||
# No upload mode - just mark as completed
|
||||
result.status = "completed"
|
||||
logger.info(f"Skipping upload of {quant_type.value} (--no-upload specified)")
|
||||
else:
|
||||
result.status = "failed"
|
||||
self._update_readme_status(model_source, results, output_repo)
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing {quant_type.value}: {e}")
|
||||
result = QuantisationResult(quantisation_type=quant_type, success=False)
|
||||
result.status = "failed"
|
||||
result.error_message = str(e)
|
||||
|
||||
try:
|
||||
self._update_readme_status(model_source, results, output_repo)
|
||||
except Exception as readme_error:
|
||||
logger.error(f"Failed to update README after error: {readme_error}")
|
||||
# Force cleanup after error
|
||||
gc.collect()
|
||||
return result
|
||||
else:
|
||||
# Force cleanup after quantisation
|
||||
gc.collect()
|
||||
return result
|
||||
|
||||
def _handle_quantisation_result(
|
||||
self,
|
||||
result: QuantisationResult,
|
||||
quant_type: QuantisationType,
|
||||
model_source: ModelSource,
|
||||
results: dict[QuantisationType, QuantisationResult],
|
||||
output_repo: str,
|
||||
upload_executor: ThreadPoolExecutor,
|
||||
upload_futures: list,
|
||||
) -> None:
|
||||
"""Handle successful or failed quantisation result."""
|
||||
if result.success and result.file_path:
|
||||
quant_str = getattr(result.quantisation_type, "value", result.quantisation_type)
|
||||
logger.info(f"Starting parallel upload of {quant_str}...")
|
||||
upload_future = upload_executor.submit(
|
||||
self._upload_and_cleanup,
|
||||
output_repo,
|
||||
result.file_path,
|
||||
quant_type,
|
||||
model_source,
|
||||
results,
|
||||
)
|
||||
upload_futures.append(upload_future)
|
||||
result.file_path = None # Mark as being uploaded
|
||||
result.status = "uploading"
|
||||
else:
|
||||
result.status = "failed"
|
||||
|
||||
self._update_readme_status(model_source, results, output_repo)
|
||||
|
||||
def _handle_quantisation_error(
|
||||
self,
|
||||
error: Exception,
|
||||
quant_type: QuantisationType,
|
||||
model_source: ModelSource,
|
||||
results: dict[QuantisationType, QuantisationResult],
|
||||
output_repo: str,
|
||||
) -> QuantisationResult:
|
||||
"""Handle quantisation processing error.
|
||||
|
||||
Returns:
|
||||
QuantisationResult: Failed quantisation result with error information.
|
||||
"""
|
||||
logger.error(f"Error processing {quant_type.value}: {error}")
|
||||
result = QuantisationResult(quantisation_type=quant_type, success=False)
|
||||
result.status = "failed"
|
||||
result.error_message = str(error)
|
||||
|
||||
try:
|
||||
self._update_readme_status(model_source, results, output_repo)
|
||||
except Exception as readme_error:
|
||||
logger.error(f"Failed to update README after error: {readme_error}")
|
||||
|
||||
return result
|
||||
|
||||
def _update_readme_status(
|
||||
self,
|
||||
model_source: ModelSource,
|
||||
results: dict[QuantisationType, QuantisationResult],
|
||||
output_repo: str,
|
||||
) -> None:
|
||||
"""Update README with current quantisation status."""
|
||||
if not self.no_upload:
|
||||
updated_readme_path = self.readme_generator.generate(
|
||||
model_source, results, self.models_dir, output_repo
|
||||
)
|
||||
self.uploader.upload_readme(output_repo, updated_readme_path)
|
||||
|
||||
def _wait_for_uploads(self, upload_futures: list) -> None:
|
||||
"""Wait for all parallel uploads to complete."""
|
||||
logger.info("Waiting for any remaining uploads to complete...")
|
||||
for future in upload_futures:
|
||||
try:
|
||||
future.result(timeout=300) # 5 minute timeout per upload
|
||||
except Exception as e:
|
||||
logger.warning(f"Upload error: {e}")
|
||||
|
||||
def _cleanup_files(self, f16_model_path: Path, model_source: ModelSource) -> None:
|
||||
"""Clean up temporary files after processing."""
|
||||
if f16_model_path.exists():
|
||||
logger.info(f"Removing F16 model {f16_model_path.name} to save disk space...")
|
||||
f16_model_path.unlink()
|
||||
|
||||
if not model_source.is_gguf_repo:
|
||||
self._cleanup_original_model(model_source)
|
||||
|
||||
def _cleanup_original_model(self, model_source: ModelSource) -> None:
|
||||
"""Clean up original safetensors/PyTorch files after successful conversion."""
|
||||
model_dir = self.models_dir / model_source.model_name
|
||||
|
||||
pytorch_files = list(model_dir.glob("pytorch_model*.bin"))
|
||||
if pytorch_files:
|
||||
logger.info(f"Removing {len(pytorch_files)} PyTorch model files to save disk space...")
|
||||
for file in pytorch_files:
|
||||
file.unlink()
|
||||
|
||||
logger.info("Keeping config files, tokeniser, and metadata for reference")
|
||||
|
||||
def _upload_and_cleanup(
|
||||
self,
|
||||
output_repo: str,
|
||||
file_path: Path,
|
||||
quant_type: QuantisationType,
|
||||
model_source: ModelSource,
|
||||
results: dict[QuantisationType, QuantisationResult],
|
||||
) -> None:
|
||||
"""Upload file and clean up (runs in background thread)."""
|
||||
try:
|
||||
logger.info(f"[PARALLEL] Starting upload of {quant_type.value} ({file_path.name})")
|
||||
self.uploader.upload_model_file(output_repo, file_path)
|
||||
logger.info(f"[PARALLEL] Upload of {quant_type.value} completed successfully")
|
||||
|
||||
logger.info(f"[PARALLEL] Removing {file_path.name} to save disk space...")
|
||||
file_path.unlink()
|
||||
|
||||
results[quant_type].status = "completed"
|
||||
updated_readme_path = self.readme_generator.generate(
|
||||
model_source, results, self.models_dir, output_repo
|
||||
)
|
||||
self.uploader.upload_readme(output_repo, updated_readme_path)
|
||||
|
||||
logger.info(f"[PARALLEL] {quant_type} upload and cleanup complete")
|
||||
except Exception as e:
|
||||
logger.error(f"[PARALLEL] Failed to upload {quant_type}: {e}")
|
||||
results[quant_type].status = "failed"
|
||||
results[quant_type].error_message = str(e)
|
||||
|
||||
try:
|
||||
updated_readme_path = self.readme_generator.generate(
|
||||
model_source, results, self.models_dir, output_repo
|
||||
)
|
||||
self.uploader.upload_readme(output_repo, updated_readme_path)
|
||||
except Exception as readme_error:
|
||||
logger.error(
|
||||
f"[PARALLEL] Failed to update README after upload error: {readme_error}"
|
||||
)
|
||||
# Don't re-raise - let other uploads continue
|
||||
|
||||
def _print_model_info(self, model_source: ModelSource) -> None:
|
||||
"""Print model information."""
|
||||
logger.info(f"Source URL: {model_source.url}")
|
||||
logger.info(f"Source model: {model_source.source_model}")
|
||||
logger.info(f"Original author: {model_source.original_author}")
|
||||
logger.info(f"Model name: {model_source.model_name}")
|
||||
logger.info(f"Your HF username: {self.uploader.get_username()}")
|
||||
logger.info(f"Working directory: {self.work_dir}")
|
||||
|
||||
def _print_completion_summary(
|
||||
self,
|
||||
model_source: ModelSource,
|
||||
results: dict[QuantisationType, QuantisationResult],
|
||||
output_repo: str,
|
||||
) -> None:
|
||||
"""Print completion summary."""
|
||||
successful_results = [r for r in results.values() if r.success]
|
||||
|
||||
if successful_results:
|
||||
logger.info("Complete! Your quantised models are available at:")
|
||||
logger.info(f" https://huggingface.co/{output_repo}")
|
||||
logger.info("Model info:")
|
||||
logger.info(f" - Source URL: {model_source.url}")
|
||||
logger.info(f" - Original: {model_source.source_model}")
|
||||
logger.info(
|
||||
" - Method: "
|
||||
f"{'Direct GGUF download' if model_source.is_gguf_repo else 'HF model conversion'}"
|
||||
)
|
||||
logger.info(f" - Quantised: {output_repo}")
|
||||
|
||||
for result in successful_results:
|
||||
if result.file_size:
|
||||
filename = (
|
||||
f"{model_source.original_author}-{model_source.model_name}-"
|
||||
f"{result.quantisation_type}.gguf"
|
||||
)
|
||||
logger.info(f" - {result.quantisation_type}: {filename} ({result.file_size})")
|
||||
else:
|
||||
logger.error(
|
||||
"All quantisations failed - repository created with documentation "
|
||||
"but no model files"
|
||||
)
|
||||
logger.error(f" Repository: https://huggingface.co/{output_repo}")
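As a minimal driving sketch for the orchestrator defined above (using the pre-move import path shown in this file; the model URL and profile names are placeholder values):

# Sketch only: constructor fields and quantise() as shown in the class above.
from pathlib import Path

from helpers.services.orchestrator import QuantisationOrchestrator

orchestrator = QuantisationOrchestrator(
    work_dir=Path.cwd() / "quantisation_work",
    use_imatrix=True,
    no_upload=True,                      # skip HuggingFace uploads for a dry run
    custom_profiles=["Q4_K_M", "Q6_K"],  # parsed against SUPPORTED_QUANTISATION_TYPES
)
results = orchestrator.quantise("https://huggingface.co/some-org/some-model")
for quant_type, result in results.items():
    print(quant_type.value, result.status)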
@ -1,675 +0,0 @@
|
|||
"""Quantisation operations service.
|
||||
|
||||
Provides modular quantisation engine, model management, and upload capabilities
|
||||
for GGUF model processing. Consolidates quantisation logic from various tools
|
||||
into reusable components following SOLID principles.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import shutil
|
||||
import subprocess
|
||||
import tempfile
|
||||
import traceback
|
||||
from pathlib import Path
|
||||
|
||||
from helpers.logger import logger
|
||||
from helpers.models.quantisation import (
|
||||
ModelSource,
|
||||
QuantisationContext,
|
||||
QuantisationResult,
|
||||
QuantisationType,
|
||||
)
|
||||
from helpers.services.filesystem import FilesystemService
|
||||
from helpers.services.gguf import GGUFConverter
|
||||
from helpers.services.llama_python import LlamaCppPythonAPI
|
||||
from helpers.utils.config_parser import ConfigParser
|
||||
from helpers.utils.tensor_mapping import TensorMapper
|
||||
|
||||
|
||||
class QuantisationEngine:
|
||||
"""Handles the actual quantisation process with configurable methods.
|
||||
|
||||
Provides flexible quantisation execution supporting multiple tensor
|
||||
precision configurations, importance matrices, and fallback strategies.
|
||||
Uses llama-cpp-python API for direct quantisation without subprocess overhead.
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
"""Initialise quantisation engine."""
|
||||
self.fs = FilesystemService()
|
||||
self.python_api = LlamaCppPythonAPI()
|
||||
|
||||
def quantise(self, context: QuantisationContext) -> QuantisationResult:
|
||||
"""Perform quantisation using the specified configuration.
|
||||
|
||||
Executes quantisation using Python API. Since llama-cpp-python is a
|
||||
required dependency, we can rely on it being available.
|
||||
|
||||
Returns:
|
||||
QuantisationResult with success status and file information.
|
||||
"""
|
||||
logger.debug(f"DEBUG: Starting quantisation for {context.config.name}")
|
||||
logger.info(
|
||||
f"⚙️ Creating {context.config.name} quantisation ({context.config.description})..."
|
||||
)
|
||||
|
||||
output_path = context.get_output_path()
|
||||
logger.debug(f"DEBUG: Output path: {output_path}")
|
||||
|
||||
# Check input file exists and is readable
|
||||
if not context.f16_model_path.exists():
|
||||
error_msg = f"Input model file does not exist: {context.f16_model_path}"
|
||||
logger.error(f"❌ {error_msg}")
|
||||
return QuantisationResult(
|
||||
quantisation_type=QuantisationType(context.config.name),
|
||||
success=False,
|
||||
error_message=error_msg,
|
||||
)
|
||||
|
||||
# Check if we have enough disk space (rough estimate)
|
||||
try:
|
||||
input_size = context.f16_model_path.stat().st_size
|
||||
logger.debug(f"DEBUG: Input file size: {input_size / (1024**3):.2f} GB")
|
||||
# This is a rough check - actual available space calculation is more complex
|
||||
logger.debug(f"DEBUG: Output directory: {output_path.parent}")
|
||||
except Exception as e:
|
||||
logger.warning(f"⚠️ Could not check disk space: {e}")
|
||||
|
||||
logger.info(f"🎯 Attempting {context.config.name} quantisation...")
|
||||
logger.debug(f"DEBUG: Source: {context.f16_model_path}")
|
||||
logger.debug(f"DEBUG: Target: {output_path}")
|
||||
logger.debug(f"DEBUG: imatrix: {context.imatrix_path}")
|
||||
|
||||
try:
|
||||
# Use Python API for quantisation
|
||||
logger.info("🐍 Using Python API for quantisation...")
|
||||
logger.debug("DEBUG: Calling python_api.quantise_model...")
|
||||
|
||||
success = self.python_api.quantise_model(
|
||||
context.f16_model_path, output_path, context.config, context.imatrix_path
|
||||
)
|
||||
|
||||
logger.debug(f"DEBUG: Python API returned: {success}")
|
||||
|
||||
if success:
|
||||
logger.debug("DEBUG: Quantisation successful, creating success result")
|
||||
return self._create_success_result(context.config.name, output_path, "Python API")
|
||||
|
||||
logger.error(f"❌ {context.config.name} quantisation failed")
|
||||
return QuantisationResult(
|
||||
quantisation_type=QuantisationType(context.config.name),
|
||||
success=False,
|
||||
error_message="Quantisation failed via Python API",
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"❌ Exception during {context.config.name} quantisation: {e}")
|
||||
logger.error("Exception traceback:")
|
||||
for line in traceback.format_exc().splitlines():
|
||||
logger.error(f" {line}")
|
||||
|
||||
return QuantisationResult(
|
||||
quantisation_type=QuantisationType(context.config.name),
|
||||
success=False,
|
||||
error_message=f"Exception during quantisation: {e!s}",
|
||||
)
|
||||
|
||||
def _create_success_result(
|
||||
self, quant_type: str, output_path: Path, method_used: str
|
||||
) -> QuantisationResult:
|
||||
"""Create successful quantisation result with file metadata.
|
||||
|
||||
Returns:
|
||||
QuantisationResult with file path and size information.
|
||||
"""
|
||||
file_size = self.fs.get_file_size(output_path)
|
||||
return QuantisationResult(
|
||||
quantisation_type=QuantisationType(quant_type),
|
||||
success=True,
|
||||
file_path=output_path,
|
||||
file_size=file_size,
|
||||
method_used=method_used,
|
||||
)
|
||||
|
||||
|
||||
class ModelManager:
|
||||
"""Handles model downloading and preparation for quantisation.
|
||||
|
||||
Manages both GGUF repository downloads and HuggingFace model conversions,
|
||||
providing unified interface for model acquisition and preparation.
|
||||
"""
|
||||
|
||||
def __init__(self, models_dir: Path) -> None:
|
||||
"""Initialise model manager with storage configuration.
|
||||
|
||||
Sets up model storage directory for model downloads and conversions.
|
||||
"""
|
||||
self.models_dir = models_dir
|
||||
self.fs = FilesystemService()
|
||||
|
||||
def prepare_model(self, model_source: ModelSource) -> Path:
|
||||
"""Prepare model for quantisation and return F16 model path.
|
||||
|
||||
Handles both GGUF repository downloads and regular HuggingFace model
|
||||
conversion workflows with automatic format detection.
|
||||
|
||||
Returns:
|
||||
Path to F16 GGUF model ready for quantisation.
|
||||
"""
|
||||
model_dir = self.models_dir / model_source.model_name
|
||||
|
||||
if model_source.is_gguf_repo:
|
||||
return self._handle_gguf_repo(model_source, model_dir)
|
||||
return self._handle_regular_repo(model_source, model_dir)
|
||||
|
||||
def _handle_gguf_repo(self, model_source: ModelSource, model_dir: Path) -> Path:
|
||||
"""Handle GGUF repository download with pattern matching.
|
||||
|
||||
Downloads GGUF files matching specified patterns, prioritising
|
||||
multi-part files and F16 variants.
|
||||
|
||||
Returns:
|
||||
Path to downloaded or existing GGUF file.
|
||||
"""
|
||||
logger.info(f"⬇️ Downloading GGUF file from repository: {model_source.source_model}")
|
||||
logger.info(f"🔍 Looking for file pattern: *{model_source.gguf_file_pattern}*")
|
||||
|
||||
f16_model = model_dir / f"{model_source.model_name}-f16.gguf"
|
||||
|
||||
if f16_model.exists():
|
||||
logger.info(f"✅ Found existing F16 file: {f16_model.name}")
|
||||
return f16_model
|
||||
|
||||
# Check for existing GGUF files
|
||||
model_dir.mkdir(parents=True, exist_ok=True)
|
||||
existing_gguf = self.fs.find_gguf_files(model_dir)
|
||||
|
||||
if existing_gguf:
|
||||
logger.info(f"✅ Found existing GGUF file: {existing_gguf[0].name}")
|
||||
return existing_gguf[0]
|
||||
|
||||
# Download with patterns
|
||||
downloaded_file = self._download_gguf_with_patterns(
|
||||
model_source.source_model, model_source.gguf_file_pattern, model_dir
|
||||
)
|
||||
|
||||
if downloaded_file:
|
||||
# Handle multi-part files
|
||||
if "00001-of-" in downloaded_file.name:
|
||||
return downloaded_file
|
||||
if "-00002-of-" in downloaded_file.name or "-00003-of-" in downloaded_file.name:
|
||||
base_name = downloaded_file.name.replace("-00002-of-", "-00001-of-").replace(
|
||||
"-00003-of-", "-00001-of-"
|
||||
)
|
||||
first_part = downloaded_file.parent / base_name
|
||||
if first_part.exists():
|
||||
logger.info(f"🔄 Using first part: {first_part.name}")
|
||||
return first_part
|
||||
|
||||
# Rename single file to standard name
|
||||
downloaded_file.rename(f16_model)
|
||||
return f16_model
|
||||
|
||||
# Fallback to regular conversion
|
||||
logger.info("💡 Falling back to downloading full repository and converting...")
|
||||
return self._handle_regular_repo(
|
||||
ModelSource(**{**model_source.dict(), "is_gguf_repo": False}),
|
||||
model_dir,
|
||||
)
|
||||
|
||||
def _download_gguf_with_patterns(
|
||||
self, source_model: str, pattern: str | None, model_dir: Path
|
||||
) -> Path | None:
|
||||
"""Download GGUF file using various pattern strategies.
|
||||
|
||||
Tries multiple pattern variations to find and download appropriate
|
||||
GGUF files, handling timeouts and temporary directories.
|
||||
|
||||
Returns:
|
||||
Path to downloaded file, or None if all patterns fail.
|
||||
"""
|
||||
if pattern:
|
||||
patterns = [
|
||||
f"*{pattern}*",
|
||||
f"*{pattern.lower()}*",
|
||||
f"*{pattern.upper()}*",
|
||||
"*f16*",
|
||||
"*F16*",
|
||||
"*fp16*",
|
||||
]
|
||||
else:
|
||||
patterns = ["*f16*", "*F16*", "*fp16*"]
|
||||
|
||||
temp_dir = model_dir / "gguf_temp"
|
||||
|
||||
for search_pattern in patterns:
|
||||
logger.info(f"🔍 Trying pattern: {search_pattern}")
|
||||
temp_dir.mkdir(exist_ok=True)
|
||||
|
||||
try:
|
||||
logger.debug(
|
||||
f"DEBUG: Running huggingface-cli download for pattern {search_pattern}"
|
||||
)
|
||||
result = subprocess.run(
|
||||
[
|
||||
"timeout",
|
||||
"300",
|
||||
"huggingface-cli",
|
||||
"download",
|
||||
source_model,
|
||||
"--include",
|
||||
search_pattern,
|
||||
"--local-dir",
|
||||
str(temp_dir),
|
||||
],
|
||||
check=True,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
logger.debug(
|
||||
f"DEBUG: Download command completed with return code {result.returncode}"
|
||||
)
|
||||
|
||||
# Find downloaded GGUF files
|
||||
gguf_files = self.fs.find_gguf_files(temp_dir, pattern)
|
||||
if gguf_files:
|
||||
found_file = gguf_files[0]
|
||||
logger.info(f"✅ Found GGUF file: {found_file.name}")
|
||||
|
||||
# Move to parent directory
|
||||
final_path = model_dir / found_file.name
|
||||
shutil.move(str(found_file), str(final_path))
|
||||
shutil.rmtree(temp_dir)
|
||||
return final_path
|
||||
|
||||
except subprocess.CalledProcessError as e:
|
||||
logger.debug(
|
||||
f"DEBUG: Pattern {search_pattern} failed with return code {e.returncode}"
|
||||
)
|
||||
if e.stderr:
|
||||
logger.debug(f"DEBUG: stderr: {e.stderr}")
|
||||
if e.stdout:
|
||||
logger.debug(f"DEBUG: stdout: {e.stdout}")
|
||||
logger.info(f"⚠️ Pattern {search_pattern} failed or timed out")
|
||||
continue
|
||||
except Exception as e:
|
||||
logger.error(f"❌ Unexpected error during download: {e}")
|
||||
logger.error("Exception traceback:")
|
||||
for line in traceback.format_exc().splitlines():
|
||||
logger.error(f" {line}")
|
||||
continue
|
||||
finally:
|
||||
if temp_dir.exists():
|
||||
shutil.rmtree(temp_dir, ignore_errors=True)
|
||||
|
||||
return None
|
||||
|
||||
def _handle_regular_repo(
|
||||
self,
|
||||
model_source: ModelSource,
|
||||
model_dir: Path,
|
||||
) -> Path:
|
||||
"""Handle regular HuggingFace repository conversion.
|
||||
|
||||
Downloads full model repository and converts to F16 GGUF format
|
||||
using our native Python-based GGUFConverter for SafeTensors models.
|
||||
|
||||
Returns:
|
||||
Path to converted F16 GGUF model.
|
||||
"""
|
||||
logger.info(f"⬇️ Downloading source model: {model_source.source_model}")
|
||||
|
||||
# Download model if needed
|
||||
if not model_dir.exists():
|
||||
self._download_repository(model_source.source_model, model_dir)
|
||||
else:
|
||||
logger.info("✅ Model already downloaded")
|
||||
|
||||
# Convert to GGUF
|
||||
return self._convert_to_gguf(model_source, model_dir)
|
||||
|
||||
def _download_repository(self, source_model: str, model_dir: Path) -> None:
|
||||
"""Download HuggingFace repository.
|
||||
|
||||
Args:
|
||||
source_model: HuggingFace model identifier.
|
||||
model_dir: Local directory for download.
|
||||
|
||||
Raises:
|
||||
RuntimeError: If download fails.
|
||||
"""
|
||||
try:
|
||||
logger.debug(f"DEBUG: Downloading full repository: {source_model}")
|
||||
result = subprocess.run(
|
||||
[
|
||||
"huggingface-cli",
|
||||
"download",
|
||||
source_model,
|
||||
"--local-dir",
|
||||
str(model_dir),
|
||||
],
|
||||
check=True,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
logger.debug(
|
||||
f"DEBUG: Repository download completed with return code {result.returncode}"
|
||||
)
|
||||
except subprocess.CalledProcessError as e:
|
||||
logger.error(f"❌ Failed to download repository {source_model}")
|
||||
logger.error(f"Return code: {e.returncode}")
|
||||
if e.stderr:
|
||||
logger.error(f"stderr: {e.stderr}")
|
||||
if e.stdout:
|
||||
logger.error(f"stdout: {e.stdout}")
|
||||
msg = f"Repository download failed: {e}"
|
||||
raise RuntimeError(msg) from e
|
||||
except Exception as e:
|
||||
logger.error(f"❌ Unexpected error during repository download: {e}")
|
||||
logger.error("Exception traceback:")
|
||||
for line in traceback.format_exc().splitlines():
|
||||
logger.error(f" {line}")
|
||||
raise
|
||||
|
||||
def _convert_to_gguf(self, model_source: ModelSource, model_dir: Path) -> Path:
|
||||
"""Convert model to GGUF F16 format.
|
||||
|
||||
Args:
|
||||
model_source: Model source information.
|
||||
model_dir: Directory containing model files.
|
||||
|
||||
Returns:
|
||||
Path to F16 GGUF model.
|
||||
|
||||
Raises:
|
||||
RuntimeError: If conversion fails.
|
||||
"""
|
||||
logger.info("🔄 Converting to GGUF F16 format...")
|
||||
f16_model = model_dir / f"{model_source.model_name}-f16.gguf"
|
||||
|
||||
if f16_model.exists():
|
||||
logger.info("✅ F16 model already exists")
|
||||
return f16_model
|
||||
|
||||
# Check for SafeTensors files
|
||||
safetensor_files = list(model_dir.glob("*.safetensors"))
|
||||
if not safetensor_files:
|
||||
logger.error("❌ Model format not supported")
|
||||
logger.info("💡 This tool supports GGUF and SafeTensors formats")
|
||||
msg = "Model must be in GGUF or SafeTensors format"
|
||||
raise RuntimeError(msg)
|
||||
|
||||
logger.info("🐍 Using native Python GGUFConverter...")
|
||||
logger.info(f"✅ Found {len(safetensor_files)} SafeTensors files")
|
||||
|
||||
# Load model configuration
|
||||
config_parser = ConfigParser()
|
||||
model_config = config_parser.load_model_config(model_dir)
|
||||
|
||||
# Get architecture mapping
|
||||
arch_name = model_config.architectures[0] if model_config.architectures else "llama"
|
||||
arch = config_parser.get_architecture_mapping(arch_name)
|
||||
|
||||
if arch != arch_name:
|
||||
logger.info(f"📝 Architecture mapping: {arch_name} → {arch}")
|
||||
|
||||
# Convert using GGUFConverter
|
||||
tensor_mapper = TensorMapper()
|
||||
success = GGUFConverter.convert_safetensors(
|
||||
model_dir, f16_model, model_config, arch, tensor_mapper
|
||||
)
|
||||
|
||||
if not success:
|
||||
logger.error("❌ Native Python conversion failed")
|
||||
msg = "Failed to convert SafeTensors model to GGUF"
|
||||
raise RuntimeError(msg)
|
||||
|
||||
logger.info("✅ Native Python conversion successful")
|
||||
return f16_model
|
||||
|
||||
|
||||
class HuggingFaceUploader:
|
||||
"""Handles uploading models and documentation to HuggingFace.
|
||||
|
||||
Provides methods for repository creation, file uploads, and README
|
||||
updates with proper error handling and retry logic.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def get_username() -> str:
|
||||
"""Get authenticated HuggingFace username.
|
||||
|
||||
Returns:
|
||||
HuggingFace username from CLI authentication.
|
||||
|
||||
Raises:
|
||||
RuntimeError: If not authenticated.
|
||||
"""
|
||||
try:
|
||||
result = subprocess.run(
|
||||
["huggingface-cli", "whoami"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
check=True,
|
||||
)
|
||||
return result.stdout.strip()
|
||||
except (subprocess.CalledProcessError, FileNotFoundError) as err:
|
||||
msg = "Please log in to HuggingFace first: huggingface-cli login"
|
||||
raise RuntimeError(msg) from err
|
||||
|
||||
def upload_readme(self, output_repo: str, readme_path: Path) -> None:
|
||||
"""Upload or update README file to repository.
|
||||
|
||||
Creates repository if needed, handles existing repository updates.
|
||||
|
||||
Raises:
|
||||
RuntimeError: If the README upload fails.
|
||||
"""
|
||||
logger.info("Uploading README...")
|
||||
|
||||
# First ensure the repository exists
|
||||
self._ensure_repo_exists(output_repo)
|
||||
|
||||
# Upload without --create flag to avoid PR creation
|
||||
try:
|
||||
logger.debug(f"DEBUG: Uploading README to {output_repo}")
|
||||
result = subprocess.run(
|
||||
[
|
||||
"huggingface-cli",
|
||||
"upload",
|
||||
output_repo,
|
||||
str(readme_path),
|
||||
"README.md",
|
||||
"--commit-message",
|
||||
"Update README.md",
|
||||
],
|
||||
check=True,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
logger.debug(f"DEBUG: README upload completed with return code {result.returncode}")
|
||||
except subprocess.CalledProcessError as e:
|
||||
logger.error(f"❌ Failed to upload README to {output_repo}")
|
||||
logger.error(f"Return code: {e.returncode}")
|
||||
if e.stderr:
|
||||
logger.error(f"stderr: {e.stderr}")
|
||||
if e.stdout:
|
||||
logger.error(f"stdout: {e.stdout}")
|
||||
msg = f"README upload failed: {e}"
|
||||
raise RuntimeError(msg) from e
|
||||
except Exception as e:
|
||||
logger.error(f"❌ Unexpected error during README upload: {e}")
|
||||
logger.error("Exception traceback:")
|
||||
for line in traceback.format_exc().splitlines():
|
||||
logger.error(f" {line}")
|
||||
raise
|
||||
logger.info("README uploaded")
|
||||
|
||||
def _ensure_repo_exists(self, repo_id: str) -> None:
|
||||
"""Ensure the repository exists, creating it if necessary."""
|
||||
try:
|
||||
# Try to create the repo - will fail if it already exists
|
||||
subprocess.run(
|
||||
[
|
||||
"huggingface-cli",
|
||||
"repo",
|
||||
"create",
|
||||
repo_id,
|
||||
"--type",
|
||||
"model",
|
||||
"-y",
|
||||
],
|
||||
check=True,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
logger.info(f"Created repository: {repo_id}")
|
||||
except subprocess.CalledProcessError:
|
||||
# Repository already exists, that's fine
|
||||
pass
|
||||
|
||||
def upload_model_file(self, output_repo: str, model_path: Path) -> None:
|
||||
"""Upload model file to repository.
|
||||
|
||||
Uploads GGUF model file to specified repository path.
|
||||
Always uses huggingface-cli to ensure proper handling of large files
|
||||
via HuggingFace's xet backend.
|
||||
|
||||
Raises:
|
||||
RuntimeError: If the model file upload fails.
|
||||
"""
|
||||
logger.info(f"Uploading {model_path.name}...")
|
||||
|
||||
# Always use huggingface-cli for model files to ensure xet backend is used
|
||||
try:
|
||||
logger.debug(f"DEBUG: Uploading model file {model_path.name} to {output_repo}")
|
||||
result = subprocess.run(
|
||||
[
|
||||
"huggingface-cli",
|
||||
"upload",
|
||||
output_repo,
|
||||
str(model_path),
|
||||
model_path.name,
|
||||
"--revision",
|
||||
"main", # Explicitly push to main branch
|
||||
"--commit-message",
|
||||
f"Add {model_path.name}",
|
||||
],
|
||||
check=True,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
logger.debug(f"DEBUG: Model upload completed with return code {result.returncode}")
|
||||
except subprocess.CalledProcessError as e:
|
||||
logger.error(f"❌ Failed to upload model file {model_path.name} to {output_repo}")
|
||||
logger.error(f"Return code: {e.returncode}")
|
||||
if e.stderr:
|
||||
logger.error(f"stderr: {e.stderr}")
|
||||
if e.stdout:
|
||||
logger.error(f"stdout: {e.stdout}")
|
||||
msg = f"Model file upload failed: {e}"
|
||||
raise RuntimeError(msg) from e
|
||||
except Exception as e:
|
||||
logger.error(f"❌ Unexpected error during model file upload: {e}")
|
||||
logger.error("Exception traceback:")
|
||||
for line in traceback.format_exc().splitlines():
|
||||
logger.error(f" {line}")
|
||||
raise
|
||||
|
||||
# Extract and log the URL if present in output
|
||||
if result.stdout:
|
||||
for line in result.stdout.splitlines():
|
||||
if "https://huggingface.co/" in line:
|
||||
logger.info(f"Upload URL: {line.strip()}")
|
||||
break
|
||||
|
||||
logger.info(f"{model_path.name} uploaded")
|
||||
|
||||
def _try_git_upload_file(
|
||||
self,
|
||||
repo_id: str,
|
||||
local_path: Path,
|
||||
repo_path: str,
|
||||
*,
|
||||
create_repo: bool = False,
|
||||
) -> bool:
|
||||
"""Try to upload file using git directly to avoid PR creation.
|
||||
|
||||
Returns:
|
||||
bool: True if upload successful, False if should fallback to CLI.
|
||||
"""
|
||||
try:
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
temp_path = Path(temp_dir)
|
||||
repo_url = f"https://huggingface.co/{repo_id}"
|
||||
|
||||
# Clone repository
|
||||
logger.info(f"Cloning {repo_url}...")
|
||||
result = subprocess.run(
|
||||
["git", "clone", repo_url, str(temp_path / "repo")],
|
||||
check=False,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
|
||||
if result.returncode != 0:
|
||||
if create_repo:
|
||||
# Repository doesn't exist, let huggingface-cli handle creation
|
||||
return False
|
||||
logger.warning(f"Clone failed: {result.stderr}")
|
||||
return False
|
||||
|
||||
repo_dir = temp_path / "repo"
|
||||
target_file = repo_dir / repo_path
|
||||
|
||||
# Ensure target directory exists
|
||||
target_file.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Copy file
|
||||
shutil.copy2(local_path, target_file)
|
||||
|
||||
# Check if there are any changes
|
||||
status_result = subprocess.run(
|
||||
["git", "status", "--porcelain"],
|
||||
cwd=repo_dir,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
check=True,
|
||||
)
|
||||
|
||||
if not status_result.stdout.strip():
|
||||
logger.info(f"No changes detected for {repo_path}, file already up-to-date")
|
||||
return True # File is already up-to-date, no need to push
|
||||
|
||||
# Git add, commit, push
|
||||
subprocess.run(
|
||||
["git", "add", repo_path],
|
||||
cwd=repo_dir,
|
||||
check=True,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
subprocess.run(
|
||||
["git", "commit", "-m", f"Update {repo_path}"],
|
||||
cwd=repo_dir,
|
||||
check=True,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
subprocess.run(
|
||||
["git", "push"],
|
||||
cwd=repo_dir,
|
||||
check=True,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
|
||||
return True
|
||||
|
||||
except subprocess.CalledProcessError as e:
|
||||
logger.warning(f"Git upload failed: {e}")
|
||||
return False
|
||||
except Exception as e:
|
||||
logger.warning(f"Git upload error: {e}")
|
||||
return False
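A rough sketch of how the pieces above compose outside the orchestrator: ModelManager prepares the F16 GGUF, a QuantisationContext wraps it with a config, and QuantisationEngine performs the quantisation. The URL, working directory and the Q4_K_M profile are assumed example values, and error handling is omitted.

# Composition sketch under the assumptions stated above.
from pathlib import Path

from helpers.config.quantisation_configs import QUANTISATION_CONFIGS
from helpers.models.quantisation import QuantisationContext, QuantisationType
from helpers.services.quantisation import ModelManager, QuantisationEngine
from helpers.utils.tensor_mapping import URLParser

models_dir = Path.cwd() / "quantisation_work" / "models"
source = URLParser().parse("https://huggingface.co/some-org/some-model")

f16_path = ModelManager(models_dir).prepare_model(source)  # download + convert to F16 GGUF
context = QuantisationContext(
    f16_model_path=f16_path,
    model_source=source,
    config=QUANTISATION_CONFIGS[QuantisationType.Q4_K_M],  # assumed profile
    models_dir=models_dir,
    imatrix_path=None,
)
result = QuantisationEngine().quantise(context)
print(result.success, result.file_path, result.file_size)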
@ -9,8 +9,8 @@ from __future__ import annotations

from typing import TYPE_CHECKING, Any

from helpers.filesystem import FilesystemService
from helpers.models.conversion import GGUFParameters, ModelConfig, VisionConfig
from helpers.services.filesystem import FilesystemService

if TYPE_CHECKING:
    from pathlib import Path

@ -107,28 +107,51 @@ class ConfigParser:

    @staticmethod
    def get_architecture_mapping(architecture: str) -> str:
        """Map architecture names to known GGUF architectures.
        """Get the GGUF architecture name for a model.

        Provides fallback mappings for architectures not directly supported
        by GGUF format, translating them to similar known architectures. This
        enables broader model compatibility whilst maintaining GGUF standards.
        Returns the original architecture name to preserve model identity.
        Only maps architectures that are truly compatible.

        Returns:
            GGUF-compatible architecture name with appropriate fallback to llama.
            Architecture name for GGUF, preserving original when possible.
        """
        # Architecture mappings to known GGUF types
        mappings = {
            "DotsOCRForCausalLM": "qwen2",  # Similar architecture
            "GptOssForCausalLM": "llama",  # Use llama as fallback
            "MistralForCausalLM": "llama",  # Mistral is llama-like
            "Qwen2ForCausalLM": "qwen2",
        # Only map architectures that are ACTUALLY the same
        # DO NOT map incompatible architectures
        known_compatible = {
            "LlamaForCausalLM": "llama",
            "MistralForCausalLM": "llama",
            "Qwen2ForCausalLM": "qwen2",
            "GemmaForCausalLM": "gemma",
            "GptOssForCausalLM": "gptoss",
            "Phi3ForCausalLM": "phi3",
            # Add more mappings as needed
            "FalconForCausalLM": "falcon",
            "GPT2LMHeadModel": "gpt2",
            "GPTJForCausalLM": "gptj",
            "GPTNeoXForCausalLM": "gptneox",
            "MPTForCausalLM": "mpt",
            "BaichuanForCausalLM": "baichuan",
            "StableLMEpochForCausalLM": "stablelm",
        }

        return mappings.get(architecture, "llama")  # Default to llama
        if architecture in known_compatible:
            return known_compatible[architecture]

        # For unknown architectures, preserve the original name
        # This will make it clear the model needs proper support
        # Remove common suffixes to get cleaner architecture name
        arch_name = architecture
        for suffix in ["ForCausalLM", "LMHeadModel", "ForConditionalGeneration"]:
            if arch_name.endswith(suffix):
                arch_name = arch_name[: -len(suffix)]
                break

        arch_name = arch_name.lower()

        # Special case: convert "gpt-oss" to "gptoss"
        if arch_name == "gpt-oss":
            arch_name = "gptoss"

        return arch_name
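A few worked examples of what the revised mapping returns, derived directly from the logic above; unknown architectures now keep a cleaned version of their own name rather than silently falling back to llama.

# Illustrative expected results of the new mapping.
from helpers.utils.config_parser import ConfigParser

ConfigParser.get_architecture_mapping("MistralForCausalLM")  # "llama"  (known compatible)
ConfigParser.get_architecture_mapping("GptOssForCausalLM")   # "gptoss" (explicit entry)
ConfigParser.get_architecture_mapping("DotsOCRForCausalLM")  # "dotsocr" (suffix stripped, lowercased)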
    @staticmethod
    def load_tokeniser_config(model_path: Path) -> dict[str, Any]:

@ -155,11 +178,33 @@ class ConfigParser:

        config = fs.load_json_config(tokeniser_config_path)

        # Extract token IDs with defaults
        # Try to find special token IDs from added_tokens_decoder
        added_tokens = config.get("added_tokens_decoder", {})
        eos_token_id = config.get("eos_token_id")
        bos_token_id = config.get("bos_token_id")

        # If not directly specified, search in added_tokens_decoder
        if eos_token_id is None:
            for token_id, token_info in added_tokens.items():
                if token_info.get("content") == "<|endoftext|>":
                    eos_token_id = int(token_id)
                    break

        if bos_token_id is None:
            for token_id, token_info in added_tokens.items():
                if token_info.get("content") in {"<|im_start|>", "<s>", "<|startoftext|>"}:
                    bos_token_id = int(token_id)
                    break

        # Extract token IDs with better defaults
        return {
            "bos_token_id": config.get("bos_token_id", 1),
            "eos_token_id": config.get("eos_token_id", 2),
            "bos_token_id": bos_token_id if bos_token_id is not None else 1,
            "eos_token_id": eos_token_id if eos_token_id is not None else 2,
            "unk_token_id": config.get("unk_token_id", 0),
            "pad_token_id": config.get("pad_token_id", 0),
            "pad_token_id": config.get(
                "pad_token_id", eos_token_id if eos_token_id is not None else 0
            ),
            "model_type": config.get("model_type", "llama"),
            "add_bos_token": config.get("add_bos_token", True),
            "add_eos_token": config.get("add_eos_token", False),
        }
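To make the fallback concrete, a small illustrative case (the token IDs below are invented for the example, not taken from any real tokeniser):

# Illustrative only: a tokenizer_config.json with no top-level eos/bos IDs but with
# named special tokens in added_tokens_decoder.
config = {
    "added_tokens_decoder": {
        "151643": {"content": "<|endoftext|>"},
        "151644": {"content": "<|im_start|>"},
    }
}
# With the logic above, load_tokeniser_config would resolve:
#   eos_token_id -> 151643 (matched "<|endoftext|>")
#   bos_token_id -> 151644 (matched "<|im_start|>")
#   pad_token_id -> 151643 (falls back to the discovered eos_token_id)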
127 helpers/utils/rate_limiter.py Normal file
@ -0,0 +1,127 @@
|
|||
"""Rate limiter for README updates.
|
||||
|
||||
Implements a cooldown mechanism to prevent excessive HuggingFace API calls
|
||||
while ensuring all updates eventually reach the repository.
|
||||
"""
|
||||
|
||||
from __future__ import annotations

import threading
import time
from typing import TYPE_CHECKING, Any

from helpers.logger import logger

if TYPE_CHECKING:
    from collections.abc import Callable


class ReadmeRateLimiter:
    """Rate limits README updates to prevent API throttling.

    Ensures updates are batched with a minimum interval between API calls,
    while guaranteeing that pending updates are eventually applied.
    """

    def __init__(self, cooldown_seconds: float = 30.0) -> None:
        """Initialise rate limiter with specified cooldown period.

        Sets up the rate limiter with the specified cooldown interval to
        prevent excessive API calls whilst ensuring pending updates are
        eventually processed through a timer-based batching mechanism.
        """
        self.cooldown_seconds = cooldown_seconds
        self.last_update_time = 0.0
        self.pending_update = False
        self.update_lock = threading.Lock()
        self.timer: threading.Timer | None = None
        self.update_func: Callable[..., Any] | None = None
        self.update_args: tuple[Any, ...] | None = None
        self.update_kwargs: dict[str, Any] | None = None

    def request_update(
        self,
        update_func: Callable[..., Any],
        *args: Any,
        **kwargs: Any,
    ) -> None:
        """Request a README update, respecting rate limits.

        Updates are batched during cooldown periods and executed
        when the cooldown expires. Stores the update function and its
        arguments for deferred execution whilst maintaining thread safety.
        """
        with self.update_lock:
            current_time = time.time()
            time_since_last = current_time - self.last_update_time

            # Store the latest update request
            self.update_func = update_func
            self.update_args = args
            self.update_kwargs = kwargs

            if time_since_last >= self.cooldown_seconds:
                # Enough time has passed, update immediately
                logger.debug(f"README update allowed (last update {time_since_last:.1f}s ago)")
                self._execute_update()
            else:
                # Still in cooldown, schedule for later
                remaining = self.cooldown_seconds - time_since_last
                logger.debug(f"README update delayed ({remaining:.1f}s cooldown remaining)")

                if not self.pending_update:
                    # Schedule an update when cooldown expires
                    self.pending_update = True
                    if self.timer:
                        self.timer.cancel()
                    self.timer = threading.Timer(remaining, self._delayed_update)
                    self.timer.start()
                else:
                    # Update already scheduled, just update the args
                    logger.debug("README update already scheduled, updating with latest data")

    def _execute_update(self) -> None:
        """Execute the actual update (must be called with lock held)."""
        if self.update_func:
            try:
                args = self.update_args or ()
                kwargs = self.update_kwargs or {}
                self.update_func(*args, **kwargs)
                self.last_update_time = time.time()
                logger.debug("README update completed")
            except Exception as e:
                logger.error(f"README update failed: {e}")

        self.pending_update = False
        self.update_func = None
        self.update_args = None
        self.update_kwargs = None

    def _delayed_update(self) -> None:
        """Execute a delayed update after cooldown expires."""
        with self.update_lock:
            if self.pending_update:
                logger.debug("Executing delayed README update")
                self._execute_update()

    def flush(self) -> None:
        """Force any pending updates to execute immediately.

        Called at script end to ensure final state is uploaded.
        """
        with self.update_lock:
            if self.timer:
                self.timer.cancel()
                self.timer = None

            if self.pending_update and self.update_func:
                logger.info("Flushing pending README update...")
                # Wait for cooldown if needed
                current_time = time.time()
                time_since_last = current_time - self.last_update_time
                if time_since_last < self.cooldown_seconds:
                    wait_time = self.cooldown_seconds - time_since_last
                    logger.info(f"Waiting {wait_time:.1f}s for cooldown before final update...")
                    time.sleep(wait_time)

                self._execute_update()

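For reference, a minimal usage sketch of the rate limiter above, assuming the class is importable; the update_readme callable, its arguments, and the stage names are hypothetical and not part of this changeset:

# Hypothetical usage sketch for ReadmeRateLimiter (not part of the diff).
limiter = ReadmeRateLimiter(cooldown_seconds=30.0)

def update_readme(model_name: str, status: str) -> None:
    """Hypothetical upload callable standing in for the real README updater."""
    print(f"Uploading README for {model_name}: {status}")

# Calls made inside the cooldown window are coalesced: only the most recent
# arguments are kept and applied when the timer fires.
for stage in ("converting", "quantising", "uploading"):
    limiter.request_update(update_readme, "example-model", status=stage)

# At script end, force any pending update through (waits out the cooldown).
limiter.flush()
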
@ -70,6 +70,8 @@ skip-magic-trailing-comma = false
[tool.ruff.lint]
fixable = ["ALL"]
ignore = [
    "ANN002", # type annotation for args
    "ANN003", # type annotation for kwargs
    "ANN401", # use of Any type
    "BLE001", # blind Exception usage
    "COM812", # missing trailing comma

@ -17,7 +17,7 @@ import sys
from pathlib import Path

from helpers.logger import logger
from helpers.services.orchestrator import QuantisationOrchestrator
from helpers.quantisation import QuantisationOrchestrator


def main() -> None:

@ -12,8 +12,8 @@ import traceback
from argparse import ArgumentParser
from pathlib import Path

from helpers.gguf import GGUFConverter
from helpers.logger import logger
from helpers.services.gguf import GGUFConverter
from helpers.utils.config_parser import ConfigParser
from helpers.utils.tensor_mapping import TensorMapper

40 uv.lock generated
@ -496,26 +496,26 @@ wheels = [
[[package]]
name = "uv"
version = "0.8.6"
version = "0.8.8"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/b5/3b/1140dbbca9fb3ca32be38e01c670a5980a4ee4874366d70438317876d40a/uv-0.8.6.tar.gz", hash = "sha256:4d4e042f6bd9f143094051a05de758684028f451e563846cbc0c6f505b530cca", size = 3463644, upload-time = "2025-08-07T15:43:34.206Z" }
sdist = { url = "https://files.pythonhosted.org/packages/9c/d0/4cd8ac2c7938da78c8e9ca791205f80e74b0f5a680f2a2d50323d54961d0/uv-0.8.8.tar.gz", hash = "sha256:6880e96cd994e53445d364206ddb4b2fff89fd2fbc74a74bef4a6f86384b07d9", size = 3477036, upload-time = "2025-08-09T00:26:00.883Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/71/64/a96f40f95626c6e353e66f6bc5a5ca7c1399e95caf0dcb56cae38754e073/uv-0.8.6-py3-none-linux_armv6l.whl", hash = "sha256:d96ff3a1d06a6a00ed94dfb2996228153b3b5bfc892174b7556216ab872a91b1", size = 18437310, upload-time = "2025-08-07T15:42:49.611Z" },
{ url = "https://files.pythonhosted.org/packages/41/30/b2fed99d5a6b16410669f223767f6d65bc6595858622f5f36386892ed963/uv-0.8.6-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:fdceb1ef554df0ddc620bfe83fdcf740829e489c62f78ba1f089abd62c71c63e", size = 18615884, upload-time = "2025-08-07T15:42:53.452Z" },
{ url = "https://files.pythonhosted.org/packages/d7/82/a53684eadb9cb169eab32ab71f2bdaf7c382819d6de44d4e8df91ca14a00/uv-0.8.6-py3-none-macosx_11_0_arm64.whl", hash = "sha256:7c1f48279ff61940143c78b969094e13324988eabcfcd4799f4350d9d36c1d48", size = 17173005, upload-time = "2025-08-07T15:42:55.571Z" },
{ url = "https://files.pythonhosted.org/packages/e7/4a/2890d9ccaf4b383fea43ae6362252870dcd97dda7412f34f20d80ccf7a39/uv-0.8.6-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.musllinux_1_1_aarch64.whl", hash = "sha256:1913f5627c57076c88dd38b0173bdb006ae9b8dbd92b1798a1acc9d744c1a7cc", size = 17813305, upload-time = "2025-08-07T15:42:57.998Z" },
{ url = "https://files.pythonhosted.org/packages/9b/c3/33a10049728ffbcde673b75b9a73cd61bfab5e1598d935d1f1b2556b07a4/uv-0.8.6-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:7796acc3c5b84d5ee5e10cc6cf92eb61c19f6551855d0aa89ef5925e4a371fbf", size = 18159834, upload-time = "2025-08-07T15:43:00.207Z" },
{ url = "https://files.pythonhosted.org/packages/81/28/ff884f7007a6b9d0e3368dbe4ae7d28acacbaaf1b3a583640e5af6dc5360/uv-0.8.6-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:57a98367bfad38e870e1a8a6626464796ffcee6e937d429fbd7b25ddf46bb36f", size = 18954223, upload-time = "2025-08-07T15:43:03.577Z" },
{ url = "https://files.pythonhosted.org/packages/78/1d/a4ed2da913ecacc1c976e97dff905979c13359834eeeac8bbaf5ed0b2fca/uv-0.8.6-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:2ac28509db2e52613a59264bdb150d13274ed13e5b305f7e274da8cd83033985", size = 20215802, upload-time = "2025-08-07T15:43:06.181Z" },
{ url = "https://files.pythonhosted.org/packages/2c/12/c9ca1cc8bdbecd54db4a7c1a44808f15271da60838dfa9f180ce8171407a/uv-0.8.6-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:deab2ce32d2dd7a1c0de459aa23470c60feb0ea24e67c9c5c5988d8bf4eb4a09", size = 19898210, upload-time = "2025-08-07T15:43:09.008Z" },
{ url = "https://files.pythonhosted.org/packages/c0/15/e10347768b2929ae9c65abbfd0867a736e6227f6d63da1f86fe6bdcbcdca/uv-0.8.6-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5b201ebc1c5c76c3a415fa4edcb25a0e06263d2255319d6d52275c775e926e23", size = 19247208, upload-time = "2025-08-07T15:43:11.578Z" },
{ url = "https://files.pythonhosted.org/packages/62/8d/dc290df05d1820d003f30e2fb7853496eec43bcb986c5e35aaea2f5343d3/uv-0.8.6-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6acdc77099906ba64bc1b725bef973c10905d7e9596d1b25f271db772bc9e8e4", size = 19261881, upload-time = "2025-08-07T15:43:13.815Z" },
{ url = "https://files.pythonhosted.org/packages/20/bd/6c3b9c87e4ed323f72de6ece7d51a6179091f0ff6e0c9c6ed29e28efe17c/uv-0.8.6-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:4e81380549151e34ae96d56499438444ba58591ca9f2fc6ba0a867152601849e", size = 18037135, upload-time = "2025-08-07T15:43:15.941Z" },
{ url = "https://files.pythonhosted.org/packages/7d/e1/b3e825ad9cc3f03f0f3e232286f91aef985d8029db69fd7091c2f332212b/uv-0.8.6-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:c9de4adac36a62e4bddd959ce65fb4bb09b0cbfd95946d50390f2a9c186ecb9c", size = 19040739, upload-time = "2025-08-07T15:43:18.092Z" },
{ url = "https://files.pythonhosted.org/packages/c5/14/921e2e7b2a4be0bac17f9d04a126546b89828bb33aa56368af7f00538fe3/uv-0.8.6-py3-none-musllinux_1_1_armv7l.whl", hash = "sha256:993af2c295856c5ca053678a8dadc11ce2f85485513ed1568c16e98d5dfa88bf", size = 18060742, upload-time = "2025-08-07T15:43:20.39Z" },
{ url = "https://files.pythonhosted.org/packages/81/54/0b1ecc64353725b62f02d3739a67a567faa70c76c4ea19a21253df1c4d99/uv-0.8.6-py3-none-musllinux_1_1_i686.whl", hash = "sha256:132e73f1e9fe05edc6c06c00416f7c721c48298786fd7293be6c584793170bbc", size = 18430300, upload-time = "2025-08-07T15:43:22.797Z" },
{ url = "https://files.pythonhosted.org/packages/da/be/a1a249eacb9b1e397292106250490ec1546a90c0e19de19f0b36f52aecea/uv-0.8.6-py3-none-musllinux_1_1_x86_64.whl", hash = "sha256:ee67acf1b211be2cfbeaec16cde13c8325810d32ff85963a9dedd1f9d7c61ef7", size = 19407124, upload-time = "2025-08-07T15:43:25.915Z" },
{ url = "https://files.pythonhosted.org/packages/11/18/552bb94bb931ea9d09a0e98e5c3d8cefc8c8db25549af88d1484e52d6cdd/uv-0.8.6-py3-none-win32.whl", hash = "sha256:e35cc1ef79d3dce2b6aeffbfb280d02d5ad741d4ca07874bdf0a4d85c841d9de", size = 18324229, upload-time = "2025-08-07T15:43:28.029Z" },
{ url = "https://files.pythonhosted.org/packages/fd/df/b7d1171579e2cc821aafc38a86393104e5426ac1ebc4e95be79ac705a11f/uv-0.8.6-py3-none-win_amd64.whl", hash = "sha256:37227aaf1e41c7eda3d7f0028e747a2a2eed3f3506b0adc121a4366e8281115b", size = 20279856, upload-time = "2025-08-07T15:43:30.07Z" },
{ url = "https://files.pythonhosted.org/packages/09/1b/2629d605e101db6a52397e6ea8859a51af0207cf254051b2a621c683ee07/uv-0.8.6-py3-none-win_arm64.whl", hash = "sha256:0b524de39f317bd8733c38cf100b6f8091d44e06b23f7752523ad1ad1454ede3", size = 18839643, upload-time = "2025-08-07T15:43:32.332Z" },
{ url = "https://files.pythonhosted.org/packages/08/d5/49e188db80f3d8b1969bdbcb8a5468a3796827f15d773241204f206a9ff6/uv-0.8.8-py3-none-linux_armv6l.whl", hash = "sha256:fcdbee030de120478db1a4bb3e3bbf04eec572527ea9107ecf064a808259b6c9", size = 18470316, upload-time = "2025-08-09T00:25:11.956Z" },
{ url = "https://files.pythonhosted.org/packages/01/50/add1afadccd141d0d72b54e5146f8181fcc6efd1567a17c5b1edec444010/uv-0.8.8-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:461e8fb83931755cf0596bf1b8ccbfe02765e81a0d392c495c07685d6b6591f9", size = 18468770, upload-time = "2025-08-09T00:25:15.391Z" },
{ url = "https://files.pythonhosted.org/packages/8c/ac/3c6dc8781d37ef9854f412322caffac2978dd3fa1bf806f7daebcfebf2be/uv-0.8.8-py3-none-macosx_11_0_arm64.whl", hash = "sha256:58056e5ccebb0a1aad27bd89d0ccc5b65c086d5a7f6b0ac16a9dde030b63cf14", size = 17200419, upload-time = "2025-08-09T00:25:18.264Z" },
{ url = "https://files.pythonhosted.org/packages/a1/9e/c30ea1f634673d234999985984afbe96c3d2a4381986e36df0bb46c0f21b/uv-0.8.8-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.musllinux_1_1_aarch64.whl", hash = "sha256:5b4c56a620137f562e1d7b09eac6c9d4adeb876aefc51be27973257fcb426c9d", size = 17779351, upload-time = "2025-08-09T00:25:20.891Z" },
{ url = "https://files.pythonhosted.org/packages/2f/89/f2885c6e97a265b4b18050df6285f56c81b603a867a63fcd8f2caa04d95c/uv-0.8.8-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5fc33adb91c4e3db550648aa30c2b97e8e4d8b8842ead7784a9e76dae3cb14dc", size = 18139292, upload-time = "2025-08-09T00:25:23.352Z" },
{ url = "https://files.pythonhosted.org/packages/38/5f/98dad16987919e7dc02f2566026a263ea6307bf57e8de0008dde4717d9cf/uv-0.8.8-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:19a82d6738d3aa58e6646b9d6c343d103abf0c4caf97a68d16a8cab55282e4be", size = 18932468, upload-time = "2025-08-09T00:25:25.691Z" },
{ url = "https://files.pythonhosted.org/packages/56/99/52d0d9f53cc5df11b1a459e743bd7b2f4660d49f125a63640eb85ce993e0/uv-0.8.8-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:9dce4de70098cb5b98feea9ef0b8f7db5d6b9deea003a926bc044a793872d719", size = 20251614, upload-time = "2025-08-09T00:25:28.122Z" },
{ url = "https://files.pythonhosted.org/packages/9e/b1/0698099a905b4a07b8fa9d6838e0680de707216ccf003433ca1b4afff224/uv-0.8.8-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1038324c178d2d7407a4005c4c3294cbad6a02368ba5a85242308de62a6f4e12", size = 19916222, upload-time = "2025-08-09T00:25:30.732Z" },
{ url = "https://files.pythonhosted.org/packages/7f/29/8384e0f3f3536ef376d94b7ab177753179906a6c2f5bab893e3fb9525b45/uv-0.8.8-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3bd016beea3935f9148b3d2482e3d60dee36f0260f9e99d4f57acfd978c1142a", size = 19238516, upload-time = "2025-08-09T00:25:33.637Z" },
{ url = "https://files.pythonhosted.org/packages/0e/f1/6c107deccd6e66eb1c46776d8cef4ca9274aac73cec1b14453fe85e18a54/uv-0.8.8-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d0a2b5ebc96aba2b0bf54283d2906b40f32949298cbc6ec48648097ddeac5c5d", size = 19232295, upload-time = "2025-08-09T00:25:37.154Z" },
{ url = "https://files.pythonhosted.org/packages/c5/96/9f5e935cd970102c67ce2a753ac721665fb4477c262e86afa0ab385cefff/uv-0.8.8-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:e529dc0a1be5e896d299e4eae4599fa68909f8cb3e6c5ee1a46f66c9048e3334", size = 18046917, upload-time = "2025-08-09T00:25:39.72Z" },
{ url = "https://files.pythonhosted.org/packages/32/75/97f371add0a02e5e37156ac0fea908ab4a1160fdf716d0e6c257b6767122/uv-0.8.8-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:5d58d986c3b6a9ce0fb48cd48b3aee6cb1b1057f928d598432e75a4fcaa370f4", size = 18949133, upload-time = "2025-08-09T00:25:42.139Z" },
{ url = "https://files.pythonhosted.org/packages/1a/1b/ea988ae9d8c5531454ea6904290e229624c9ea830a5c37b91ec74ebde9a4/uv-0.8.8-py3-none-musllinux_1_1_armv7l.whl", hash = "sha256:e117e1230559058fd286292dd5839e8e82d1aaf05763bf4a496e91fe07b69fa1", size = 18080018, upload-time = "2025-08-09T00:25:44.645Z" },
{ url = "https://files.pythonhosted.org/packages/ff/14/3b16af331b79ae826d00a73e98f26f7f660dabedc0f82acb99069601b355/uv-0.8.8-py3-none-musllinux_1_1_i686.whl", hash = "sha256:372934fd94193c98dec59bd379cf39e73f906ae6162cbfb66686f32afd75fa0f", size = 18437896, upload-time = "2025-08-09T00:25:49.162Z" },
{ url = "https://files.pythonhosted.org/packages/1c/b6/c866684da5571dbf42e9a60b6587a62adc8a2eb592f07411d3b29cb09871/uv-0.8.8-py3-none-musllinux_1_1_x86_64.whl", hash = "sha256:9330c924faa9df00a5e78b54561ecf4e5eac1211066f027620dbe85bd6f479ce", size = 19341221, upload-time = "2025-08-09T00:25:51.444Z" },
{ url = "https://files.pythonhosted.org/packages/49/ea/55a0eff462b2ec5a6327dd87c401c53306406c830fa8f2cabd2af79dd97f/uv-0.8.8-py3-none-win32.whl", hash = "sha256:65113735aa3427d3897e2f537da1331d1391735c6eecb9b820da6a15fd2f6738", size = 18244601, upload-time = "2025-08-09T00:25:53.696Z" },
{ url = "https://files.pythonhosted.org/packages/bf/c0/f56ddb1b2276405618e3d2522018c962c010fc71f97f385d01b7e1dcd8df/uv-0.8.8-py3-none-win_amd64.whl", hash = "sha256:66189ca0b4051396aa19a6f036351477656073d0fd01618051faca699e1b3cdc", size = 20233481, upload-time = "2025-08-09T00:25:56.247Z" },
{ url = "https://files.pythonhosted.org/packages/ac/1a/70dc4c730c19f3af40be9450b98b801e03cd6d16609743013f7258f69a29/uv-0.8.8-py3-none-win_arm64.whl", hash = "sha256:1d829486e88ebbf7895306ff09a8b6014d3af7a18e27d751979ee37bf3a27832", size = 18786215, upload-time = "2025-08-09T00:25:58.941Z" },
]