Initial commit
commit ef7df1a8c3
28 changed files with 6829 additions and 0 deletions
helpers/__init__.py (Normal file, 6 lines added)
@@ -0,0 +1,6 @@
"""Helper utilities for LLM GGUF tools.

This package provides common utilities, logging, and shared functionality
used across the quantisation and conversion tools. Uses UK English spelling
conventions throughout.
"""
helpers/config/__init__.py (Normal file, 6 lines added)
@@ -0,0 +1,6 @@
"""Configuration module for quantisation settings and tensor-level precision control.

Provides structured configuration definitions for Bartowski quantisation methods
including Q4_K_M, Q4_K_L, Q4_K_XL, and Q4_K_XXL variants with fallback strategies
for different model architectures and deployment scenarios.
"""
helpers/config/quantisation_configs.py (Normal file, 95 lines added)
@@ -0,0 +1,95 @@
"""Quantisation configuration definitions.

Pre-defined quantisation configurations for the Bartowski method, supporting
Q4_K_M, Q4_K_L, Q4_K_XL, and Q4_K_XXL variants with tensor-level precision control.
"""

from __future__ import annotations

from helpers.models.quantisation import QuantisationConfig, QuantisationType

QUANTISATION_CONFIGS: dict[QuantisationType, QuantisationConfig] = {
    QuantisationType.Q4_K_M: QuantisationConfig(
        name="Q4_K_M",
        description="Standard Q4_K_M quantisation (baseline)",
        tensor_types={},  # No special tensor overrides - uses default Q4_K_M
        fallback_methods=[],
    ),
    QuantisationType.Q4_K_L: QuantisationConfig(
        name="Q4_K_L",
        description="Q6_K embeddings + Q6_K attention (+753MB for vocab + reasoning)",
        tensor_types={
            "token_embd.weight": "Q6_K",
            "output.weight": "Q6_K",
            "lm_head.weight": "Q6_K",
            "blk.*.attn_q.weight": "Q6_K",
            "blk.*.attn_k.weight": "Q6_K",
            "blk.*.attn_v.weight": "Q6_K",
        },
        fallback_methods=[
            {
                "embed_tokens.weight": "Q6_K",
                "output.weight": "Q6_K",
                "lm_head.weight": "Q6_K",
                "blk.*.attn_q.weight": "Q6_K",
                "blk.*.attn_k.weight": "Q6_K",
                "blk.*.attn_v.weight": "Q6_K",
            },
            {"token-embedding-type": "Q6_K", "output-tensor-type": "Q6_K"},
        ],
    ),
    QuantisationType.Q4_K_XL: QuantisationConfig(
        name="Q4_K_XL",
        description="Q8_0 embeddings + Q6_K attention (+2.1GB for vocabulary + reasoning)",
        tensor_types={
            "token_embd.weight": "Q8_0",
            "output.weight": "Q8_0",
            "lm_head.weight": "Q8_0",
            "blk.*.attn_q.weight": "Q6_K",
            "blk.*.attn_k.weight": "Q6_K",
            "blk.*.attn_v.weight": "Q6_K",
        },
        fallback_methods=[
            {
                "embed_tokens.weight": "Q8_0",
                "output.weight": "Q8_0",
                "lm_head.weight": "Q8_0",
                "blk.*.attn_q.weight": "Q6_K",
                "blk.*.attn_k.weight": "Q6_K",
                "blk.*.attn_v.weight": "Q6_K",
            },
            {"token-embedding-type": "Q8_0", "output-tensor-type": "Q8_0"},
        ],
    ),
    QuantisationType.Q4_K_XXL: QuantisationConfig(
        name="Q4_K_XXL",
        description="Q8_0 embeddings + Q8_0 attention (+2.8GB total, maximum precision)",
        tensor_types={
            "token_embd.weight": "Q8_0",
            "output.weight": "Q8_0",
            "lm_head.weight": "Q8_0",
            "blk.*.attn_q.weight": "Q8_0",
            "blk.*.attn_k.weight": "Q8_0",
            "blk.*.attn_v.weight": "Q8_0",
        },
        fallback_methods=[
            {
                "embed_tokens.weight": "Q8_0",
                "output.weight": "Q8_0",
                "lm_head.weight": "Q8_0",
                "blk.*.attn_q.weight": "Q8_0",
                "blk.*.attn_k.weight": "Q8_0",
                "blk.*.attn_v.weight": "Q8_0",
            },
            {"token-embedding-type": "Q8_0", "output-tensor-type": "Q8_0"},
        ],
    ),
}


SUPPORTED_QUANTISATION_TYPES: list[QuantisationType] = [
    QuantisationType.Q4_K_M,
    QuantisationType.Q4_K_L,
    QuantisationType.Q4_K_XL,
    QuantisationType.Q4_K_XXL,
]
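A minimal usage sketch, assuming the package is importable as helpers; the printed pattern/precision pairs are illustrative, while QUANTISATION_CONFIGS, SUPPORTED_QUANTISATION_TYPES, and QuantisationType are the objects defined above.

# Illustrative only: look up the Q4_K_L recipe and inspect its tensor overrides.
from helpers.config.quantisation_configs import (
    QUANTISATION_CONFIGS,
    SUPPORTED_QUANTISATION_TYPES,
)
from helpers.models.quantisation import QuantisationType

config = QUANTISATION_CONFIGS[QuantisationType.Q4_K_L]
print(config.description)
for pattern, precision in config.tensor_types.items():
    print(f"{pattern} -> {precision}")  # e.g. "blk.*.attn_q.weight -> Q6_K"
print([t.value for t in SUPPORTED_QUANTISATION_TYPES])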
helpers/logger.py (Normal file, 94 lines added)
@@ -0,0 +1,94 @@
"""Colour-coded logging configuration for LLM GGUF tools.

Provides a consistent logging interface with colour-coded output for different
log levels, making it easier to identify warnings, errors, and informational
messages at a glance during tool execution and debugging sessions.
"""

from __future__ import annotations

from logging import (
    CRITICAL,
    DEBUG,
    ERROR,
    INFO,
    WARNING,
    Formatter as LoggingFormatter,
    Logger,
    LogRecord,
    StreamHandler as LoggingStreamHandler,
    getLogger,
)
from os import getenv as os_getenv
from sys import stdout as sys_stdout
from typing import ClassVar

DEBUG_MODE = os_getenv("DEBUG", "false").lower() == "true"


class ColourFormatter(LoggingFormatter):
    """Custom formatter adding colours to log messages based on severity level.

    Uses ANSI escape codes to provide visual distinction between different
    log levels in terminal output. Supports standard logging levels with
    appropriate colour coding: DEBUG (cyan), INFO (green), WARNING (yellow),
    ERROR (red), and CRITICAL (bold red) for immediate visual feedback.
    """

    # ANSI colour codes
    COLOURS: ClassVar[dict[int, str]] = {
        DEBUG: "\033[36m",  # Cyan
        INFO: "\033[32m",  # Green
        WARNING: "\033[33m",  # Yellow
        ERROR: "\033[31m",  # Red
        CRITICAL: "\033[1;31m",  # Bold Red
    }
    RESET = "\033[0m"

    # Emoji prefixes for different levels
    EMOJIS: ClassVar[dict[int, str]] = {
        DEBUG: "🔍",
        INFO: "ℹ️ ",  # noqa: RUF001
        WARNING: "⚠️ ",
        ERROR: "❌",
        CRITICAL: "🔥",
    }

    def format(self, record: LogRecord) -> str:
        """Format log record with colour and emoji based on severity level.

        Enhances standard log formatting by prepending ANSI colour codes and
        emoji indicators, then appending reset codes to prevent colour bleeding.
        Maintains standard log structure whilst adding visual enhancements for
        improved readability in terminal environments.

        Returns:
            str: Formatted log message with colour and emoji.
        """
        # Get colour for this level
        colour = self.COLOURS.get(record.levelno, "")
        emoji = self.EMOJIS.get(record.levelno, "")

        # Format the message
        record.msg = f"{emoji} {record.msg}"
        formatted = super().format(record)

        # Add colour codes
        return f"{colour}{formatted}{self.RESET}"


# Create and configure the logger
logger: Logger = getLogger("llm-gguf-tools")
logger.setLevel(DEBUG if DEBUG_MODE else INFO)

# Create console handler with colour formatter
handler = LoggingStreamHandler(sys_stdout)
handler.setLevel(DEBUG if DEBUG_MODE else INFO)

# Set formatter without timestamp for cleaner output
formatter = ColourFormatter(fmt="%(message)s", datefmt="%H:%M:%S")
handler.setFormatter(formatter)
logger.addHandler(handler)

# Prevent propagation to root logger
logger.propagate = False
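A minimal usage sketch, assuming the package is importable as helpers: the shared logger is configured once at import time, and setting DEBUG=true in the environment before the first import enables debug-level output.

# Illustrative only: reuse the pre-configured colour-coded logger.
from helpers.logger import logger

logger.info("Starting quantisation run")  # green, ℹ️ prefix
logger.warning("imatrix not found")  # yellow, ⚠️ prefix
logger.debug("Only visible when DEBUG=true was set before import")  # cyan, 🔍 prefix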
helpers/models/__init__.py (Normal file, 35 lines added)
@@ -0,0 +1,35 @@
"""Pydantic models for llm-gguf-tools.

This module provides structured data models for quantisation and conversion
operations, ensuring type safety and validation across the toolset.
"""

from __future__ import annotations

from helpers.models.conversion import (
    GGUFParameters,
    ModelConfig,
    TensorMapping,
    VisionConfig,
)
from helpers.models.quantisation import (
    LlamaCppEnvironment,
    ModelSource,
    QuantisationConfig,
    QuantisationResult,
    QuantisationType,
    URLType,
)

__all__ = [
    "GGUFParameters",
    "LlamaCppEnvironment",
    "ModelConfig",
    "ModelSource",
    "QuantisationConfig",
    "QuantisationResult",
    "QuantisationType",
    "TensorMapping",
    "URLType",
    "VisionConfig",
]
helpers/models/conversion.py (Normal file, 150 lines added)
@@ -0,0 +1,150 @@
"""Pydantic models for GGUF conversion operations.

Contains data models for SafeTensors to GGUF conversion including
model configurations, parameter mappings, and tensor specifications.
Uses UK English spelling conventions throughout.
"""

from __future__ import annotations

from typing import Any

from pydantic import BaseModel, ConfigDict, Field


class ModelConfig(BaseModel):
    """Parsed model configuration from HuggingFace config.json.

    Represents the standard configuration metadata extracted from HuggingFace
    models, providing structured access to architecture details, hyperparameters,
    and quantisation settings required for GGUF conversion.
    """

    model_config = ConfigDict(extra="allow")

    architectures: list[str] = Field(default_factory=lambda: ["Unknown"])
    model_type: str = "unknown"
    vocab_size: int = 32000
    max_position_embeddings: int = 2048
    hidden_size: int = 4096
    num_hidden_layers: int = 32
    intermediate_size: int = 11008
    num_attention_heads: int = 32
    num_key_value_heads: int | None = None
    rope_theta: float = 10000.0
    rope_scaling: dict[str, Any] | None = None
    rms_norm_eps: float = 1e-5
    vision_config: VisionConfig | None = None

    def to_gguf_params(self) -> GGUFParameters:
        """Convert model configuration to GGUF parameters.

        Translates HuggingFace model configuration values to GGUF-specific
        parameter format, handling defaults and calculating derived values
        like RoPE dimension count from head dimensions.

        Returns:
            GGUFParameters instance with converted values.
        """
        params = {
            "vocab_size": self.vocab_size,
            "context_length": self.max_position_embeddings,
            "embedding_length": self.hidden_size,
            "block_count": self.num_hidden_layers,
            "feed_forward_length": self.intermediate_size,
            "attention.head_count": self.num_attention_heads,
            "attention.head_count_kv": self.num_key_value_heads or self.num_attention_heads,
            "attention.layer_norm_rms_epsilon": self.rms_norm_eps,
            "rope.freq_base": self.rope_theta,
            "rope.dimension_count": self.hidden_size // self.num_attention_heads,
        }
        return GGUFParameters(**params)  # type: ignore[arg-type]


class VisionConfig(BaseModel):
    """Vision model configuration for multimodal models.

    Contains parameters specific to vision components in multimodal architectures,
    including patch sizes, embedding dimensions, and spatial merge configurations
    for proper GGUF metadata generation.
    """

    model_config = ConfigDict(extra="allow")

    hidden_size: int = 1536
    num_hidden_layers: int = 42
    num_attention_heads: int = 12
    intermediate_size: int = 4224
    patch_size: int = 14
    spatial_merge_size: int = 2
    rms_norm_eps: float | None = None


class GGUFParameters(BaseModel):
    """GGUF-specific parameters inferred from model configuration.

    Translates HuggingFace configuration values to GGUF parameter names and
    formats, providing a standardised interface for GGUF writer configuration
    across different model architectures and quantisation strategies.
    """

    model_config = ConfigDict(extra="allow")

    # Basic parameters
    vocab_size: int
    context_length: int
    embedding_length: int
    block_count: int
    feed_forward_length: int

    # Attention parameters
    attention_head_count: int = Field(alias="attention.head_count")
    attention_head_count_kv: int = Field(alias="attention.head_count_kv")
    attention_layer_norm_rms_epsilon: float = Field(alias="attention.layer_norm_rms_epsilon")

    # RoPE parameters
    rope_freq_base: float = Field(alias="rope.freq_base")
    rope_dimension_count: int = Field(alias="rope.dimension_count")
    rope_scaling_type: str | None = Field(default=None, alias="rope.scaling.type")
    rope_scaling_factor: float | None = Field(default=None, alias="rope.scaling.factor")


class TensorMapping(BaseModel):
    """Mapping configuration for tensor name conversion.

    Defines rules for translating between HuggingFace tensor naming conventions
    and GGUF tensor names, supporting both direct mappings and pattern-based
    transformations for layer-specific tensors.
    """

    model_config = ConfigDict(frozen=True)

    # Direct mappings (exact name matches)
    direct_mappings: dict[str, str] = Field(
        default_factory=lambda: {
            "model.embed_tokens.weight": "token_embd.weight",
            "model.norm.weight": "output_norm.weight",
            "lm_head.weight": "output.weight",
        }
    )

    # Layer component patterns (for .layers.N. tensors)
    layer_patterns: dict[str, str] = Field(
        default_factory=lambda: {
            "self_attn.q_proj.weight": "attn_q.weight",
            "self_attn.q_proj.bias": "attn_q.bias",
            "self_attn.k_proj.weight": "attn_k.weight",
            "self_attn.k_proj.bias": "attn_k.bias",
            "self_attn.v_proj.weight": "attn_v.weight",
            "self_attn.v_proj.bias": "attn_v.bias",
            "self_attn.o_proj": "attn_output.weight",
            "mlp.gate_proj": "ffn_gate.weight",
            "mlp.up_proj": "ffn_up.weight",
            "mlp.down_proj": "ffn_down.weight",
            "input_layernorm": "attn_norm.weight",
            "post_attention_layernorm": "ffn_norm.weight",
        }
    )

    # Architecture-specific overrides
    architecture_overrides: dict[str, dict[str, str]] = Field(default_factory=dict)
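A short sketch of the intended flow from a HuggingFace config.json to GGUF parameters; the local path below is hypothetical.

# Illustrative only: hydrate ModelConfig from config.json and derive GGUF parameters.
import json
from pathlib import Path

from helpers.models.conversion import ModelConfig

raw = json.loads(Path("./models/example-7b/config.json").read_text())  # hypothetical path
model_config = ModelConfig(**raw)  # unknown keys are kept because extra="allow"
params = model_config.to_gguf_params()
print(params.block_count, params.rope_dimension_count)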
helpers/models/quantisation.py (Normal file, 168 lines added)
@@ -0,0 +1,168 @@
"""Pydantic models for quantisation operations.

Contains data models specific to the quantisation workflow including
quantisation types, configurations, and results. Uses UK English spelling
conventions throughout (quantisation, not quantization).
"""

from __future__ import annotations

from enum import StrEnum
from typing import TYPE_CHECKING

from pydantic import BaseModel, ConfigDict, Field, field_validator

if TYPE_CHECKING:
    from pathlib import Path


class QuantisationType(StrEnum):
    """Available quantisation types for Bartowski-method GGUF model conversion.

    Defines the specific quantisation strategies supported by this tool, ranging
    from Q4_K_M baseline to Q4_K_XXL maximum precision variants. Each type
    represents different trade-offs between model size and quality preservation
    for embeddings, attention layers, and feed-forward networks.
    """

    Q4_K_M = "Q4_K_M"
    Q4_K_L = "Q4_K_L"
    Q4_K_XL = "Q4_K_XL"
    Q4_K_XXL = "Q4_K_XXL"


class URLType(StrEnum):
    """Supported URL formats for model source specification.

    Categorises input URL formats to enable appropriate handling strategies.
    HuggingFace URLs require full model download and conversion, whilst Ollama
    GGUF URLs allow direct GGUF file downloads with pattern matching for
    efficient processing of pre-quantised models.
    """

    HUGGINGFACE = "huggingface"
    OLLAMA_GGUF = "ollama_gguf"


class QuantisationConfig(BaseModel):
    """Configuration for a specific quantisation method with tensor-level precision control.

    Defines quantisation parameters including tensor type mappings and fallback
    methods for handling different model architectures. Enables fine-grained
    control over which layers receive higher precision treatment whilst
    maintaining compatibility across diverse model structures.
    """

    model_config = ConfigDict(use_enum_values=True)

    name: str
    description: str
    tensor_types: dict[str, str] = Field(default_factory=dict)
    fallback_methods: list[dict[str, str]] = Field(default_factory=list)


class ModelSource(BaseModel):
    """Represents a model source with parsed information from URL analysis.

    Contains comprehensive metadata extracted from model URLs including source
    repository details, author information, and GGUF file patterns. Enables
    differentiation between regular HuggingFace repositories requiring conversion
    and GGUF repositories allowing direct file downloads.
    """

    model_config = ConfigDict(use_enum_values=True, protected_namespaces=())

    url: str
    url_type: URLType
    source_model: str
    original_author: str
    model_name: str
    gguf_file_pattern: str | None = None
    is_gguf_repo: bool = False

    @field_validator("url")
    @classmethod
    def validate_url(cls, v: str) -> str:
        """Validate that URL is not empty.

        Ensures the provided URL string is not empty or None,
        as this is required for model source identification.

        Returns:
            The validated URL string.

        Raises:
            ValueError: If URL is empty or None.
        """
        if not v:
            msg = "URL cannot be empty"
            raise ValueError(msg)
        return v


class QuantisationResult(BaseModel):
    """Result of a quantisation operation with comprehensive status tracking.

    Captures the outcome of individual quantisation attempts including success
    status, file paths, sizes, and error details. Supports workflow status
    tracking from planning through processing to completion, enabling real-time
    progress reporting and parallel upload coordination.
    """

    model_config = ConfigDict(use_enum_values=True, arbitrary_types_allowed=True)

    quantisation_type: QuantisationType
    success: bool
    file_path: Path | None = None
    file_size: str | None = None
    method_used: str | None = None
    error_message: str | None = None
    status: str = "pending"  # planned, processing, uploading, completed, failed


class LlamaCppEnvironment(BaseModel):
    """Represents llama.cpp environment setup with binary and script locations.

    Encapsulates the runtime environment for llama.cpp tools including paths
    to quantisation binaries, CLI tools, and conversion scripts. Handles both
    local binary installations and repository-based setups to provide flexible
    deployment options across different system configurations.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    quantise_binary: Path  # UK spelling
    cli_binary: Path
    convert_script: str
    use_repo: bool = False


class QuantisationContext(BaseModel):
    """Context object containing all parameters needed for quantisation execution.

    Encapsulates quantisation parameters to reduce method argument counts
    and improve code maintainability following parameter object pattern.
    """

    model_config = ConfigDict(frozen=True)

    f16_model_path: Path
    model_source: ModelSource
    config: QuantisationConfig
    llama_env: LlamaCppEnvironment
    models_dir: Path
    imatrix_path: Path | None = None
    base_quant: str = "Q4_K_M"

    def get_output_path(self) -> Path:
        """Generate output path for quantised model.

        Returns:
            Path to the output GGUF file.
        """
        output_filename = (
            f"{self.model_source.original_author}-"
            f"{self.model_source.model_name}-"
            f"{self.config.name}.gguf"
        )
        return self.models_dir / self.model_source.model_name / output_filename
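A sketch of how these models compose; the URL, author, and model name below are made up for illustration.

# Illustrative only: describe a source model ready for download and quantisation.
from helpers.models.quantisation import ModelSource, URLType

source = ModelSource(
    url="https://huggingface.co/ExampleOrg/example-7b",  # hypothetical
    url_type=URLType.HUGGINGFACE,
    source_model="ExampleOrg/example-7b",
    original_author="ExampleOrg",
    model_name="example-7b",
)
# A QuantisationContext wrapping this source would resolve its output to
# <models_dir>/example-7b/ExampleOrg-example-7b-<config name>.gguf via get_output_path().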
helpers/services/__init__.py (Normal file, 20 lines added)
@@ -0,0 +1,20 @@
"""Service layer for llm-gguf-tools.

Provides high-level service interfaces for interacting with external systems
including HuggingFace, llama.cpp, and filesystem operations. Uses UK English
spelling conventions throughout.
"""

from __future__ import annotations

from helpers.services.filesystem import FilesystemService
from helpers.services.huggingface import HuggingFaceService, ReadmeGenerator
from helpers.services.llama_cpp import EnvironmentManager, IMatrixGenerator

__all__ = [
    "EnvironmentManager",
    "FilesystemService",
    "HuggingFaceService",
    "IMatrixGenerator",
    "ReadmeGenerator",
]
helpers/services/filesystem.py (Normal file, 174 lines added)
@@ -0,0 +1,174 @@
"""Filesystem operations service.

Provides unified filesystem operations including file discovery, size
calculation, and path management. Consolidates common filesystem patterns
used across quantisation and conversion workflows.
"""

from __future__ import annotations

import json
import subprocess
from pathlib import Path
from typing import Any

from helpers.logger import logger

BYTES_PER_UNIT = 1024.0


class FilesystemService:
    """Handles filesystem operations with consistent error handling.

    Provides methods for file discovery, size formatting, and JSON loading
    with proper error handling and logging. Ensures consistent behaviour
    across different tools and workflows.
    """

    @staticmethod
    def get_file_size(file_path: Path) -> str:
        """Get human-readable file size using system utilities.

        Attempts to use `du -h` for human-readable output, falling back to
        Python calculation if the system command fails. Provides consistent
        size formatting across the toolset.

        Returns:
            Human-readable file size string (e.g., "1.5G", "750M").
        """
        try:
            result = subprocess.run(
                ["du", "-h", str(file_path)], capture_output=True, text=True, check=True
            )
            return result.stdout.split()[0]
        except (subprocess.CalledProcessError, FileNotFoundError):
            # Fallback to Python calculation

            try:
                size_bytes: float = float(file_path.stat().st_size)
                for unit in ["B", "K", "M", "G", "T"]:
                    if size_bytes < BYTES_PER_UNIT:
                        return f"{size_bytes:.1f}{unit}"
                    size_bytes /= BYTES_PER_UNIT
            except Exception:
                return "Unknown"
            else:
                return f"{size_bytes:.1f}P"

    @staticmethod
    def load_json_config(config_path: Path) -> dict[str, Any]:
        """Load and parse JSON configuration file.

        Provides consistent JSON loading with proper error handling and
        encoding specification. Used for loading model configurations,
        tokeniser settings, and other JSON-based metadata.

        Returns:
            Parsed JSON content as dictionary.

        Raises:
            FileNotFoundError: If config file doesn't exist.
        """
        if not config_path.exists():
            msg = f"Configuration file not found: {config_path}"
            raise FileNotFoundError(msg)

        with Path(config_path).open(encoding="utf-8") as f:
            return json.load(f)

    @staticmethod
    def find_safetensor_files(model_path: Path) -> list[Path]:
        """Find all SafeTensor files in model directory using priority search.

        Searches for tensor files in order of preference: single model.safetensors,
        sharded model-*-of-*.safetensors files, then any *.safetensors files. This
        approach handles both single-file and multi-shard model distributions whilst
        ensuring predictable file ordering for conversion consistency.

        Returns:
            List of SafeTensor file paths in priority order.

        Raises:
            FileNotFoundError: If no SafeTensor files are found.
        """
        # Check for single file
        single_file = model_path / "model.safetensors"
        if single_file.exists():
            return [single_file]

        # Check for sharded files
        pattern = "model-*-of-*.safetensors"
        sharded_files = sorted(model_path.glob(pattern))
        if sharded_files:
            return sharded_files

        # Check for any safetensor files
        any_files = sorted(model_path.glob("*.safetensors"))
        if any_files:
            return any_files

        msg = f"No SafeTensor files found in {model_path}"
        raise FileNotFoundError(msg)

    @staticmethod
    def find_gguf_files(model_path: Path, pattern: str | None = None) -> list[Path]:
        """Find GGUF files in directory, optionally filtered by pattern.

        Searches for GGUF files with optional pattern matching. Prioritises
        multi-part files (00001-of-*) over single files for proper handling
        of large models split across multiple files.

        Returns:
            List of GGUF file paths, sorted with multi-part files first.
        """
        if pattern:
            gguf_files = list(model_path.glob(f"*{pattern}*.gguf"))
        else:
            gguf_files = list(model_path.glob("*.gguf"))

        # Sort to prioritise 00001-of-* files
        gguf_files.sort(
            key=lambda x: (
                "00001-of-" not in x.name,  # False sorts before True
                x.name,
            )
        )

        return gguf_files

    @staticmethod
    def ensure_directory(path: Path) -> Path:
        """Ensure directory exists, creating if necessary.

        Creates directory and all parent directories if they don't exist.
        Returns the path for method chaining convenience.

        Returns:
            The directory path.
        """
        path.mkdir(parents=True, exist_ok=True)
        return path

    @staticmethod
    def cleanup_directory(path: Path, pattern: str = "*") -> int:
        """Remove files matching pattern from directory.

        Safely removes files matching the specified glob pattern. Returns
        count of files removed for logging purposes.

        Returns:
            Number of files removed.
        """
        if not path.exists():
            return 0

        files_removed = 0
        for file_path in path.glob(pattern):
            if file_path.is_file():
                try:
                    file_path.unlink()
                    files_removed += 1
                except Exception as e:
                    logger.warning(f"Failed to remove {file_path}: {e}")

        return files_removed
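A brief usage sketch with hypothetical paths:

# Illustrative only: typical FilesystemService calls during a conversion run.
from pathlib import Path

from helpers.services.filesystem import FilesystemService

fs = FilesystemService()
model_dir = fs.ensure_directory(Path("./models/example-7b"))  # hypothetical location
shards = fs.find_safetensor_files(model_dir)  # raises FileNotFoundError if none exist
print(fs.get_file_size(shards[0]))  # e.g. "4.6G"
removed = fs.cleanup_directory(model_dir, pattern="*.tmp")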
helpers/services/gguf.py (Normal file, 210 lines added)
@@ -0,0 +1,210 @@
"""GGUF file operations service.

Provides unified interface for creating, writing, and manipulating GGUF files.
Consolidates GGUF-specific operations from conversion and quantisation workflows.
Uses UK English spelling conventions throughout.
"""

from __future__ import annotations

from typing import TYPE_CHECKING, Any

import gguf
import torch
from safetensors import safe_open

from helpers.logger import logger
from helpers.services.filesystem import FilesystemService
from helpers.utils.config_parser import ConfigParser

if TYPE_CHECKING:
    from pathlib import Path

    import numpy as np

    from helpers.models.conversion import ModelConfig


class GGUFWriter:
    """Manages GGUF file creation and metadata writing.

    Provides high-level interface for GGUF file operations including metadata
    configuration, tensor addition, and tokeniser integration. Encapsulates
    low-level GGUF library interactions for consistent error handling.
    """

    def __init__(self, output_path: Path, architecture: str) -> None:
        """Initialise GGUF writer with output path and architecture.

        Creates the underlying GGUF writer instance and prepares for metadata
        and tensor addition. Sets up the file structure for the specified
        model architecture.
        """
        self.output_path = output_path
        self.architecture = architecture
        self.writer = gguf.GGUFWriter(str(output_path), architecture)
        logger.info(f"Created GGUF writer for {architecture} architecture")

    def add_metadata(self, model_config: ModelConfig, model_name: str) -> None:
        """Add comprehensive metadata from model configuration.

        Writes general model information, architectural parameters, and
        quantisation settings to the GGUF file header. Handles both standard
        and vision model configurations with appropriate parameter mapping.
        """
        # General metadata
        self.writer.add_name(model_name)
        self.writer.add_description(f"Converted from {model_config.architectures[0]}")
        self.writer.add_file_type(gguf.LlamaFileType.ALL_F32)

        # Model parameters from config
        params = model_config.to_gguf_params()
        self.writer.add_context_length(params.context_length)
        self.writer.add_embedding_length(params.embedding_length)
        self.writer.add_block_count(params.block_count)
        self.writer.add_feed_forward_length(params.feed_forward_length)
        self.writer.add_head_count(params.attention_head_count)
        self.writer.add_head_count_kv(params.attention_head_count_kv)
        self.writer.add_layer_norm_rms_eps(params.attention_layer_norm_rms_epsilon)
        self.writer.add_rope_freq_base(params.rope_freq_base)
        self.writer.add_rope_dimension_count(params.rope_dimension_count)

        logger.info(f"Added metadata: {params.block_count} layers, {params.context_length} context")

    def add_vision_metadata(self, vision_config: Any) -> None:
        """Add vision model parameters to GGUF metadata.

        Configures vision-specific parameters for multimodal models including
        embedding dimensions, attention heads, and spatial processing settings.
        """
        if not vision_config:
            return

        logger.info("Adding vision model parameters...")
        self.writer.add_vision_embedding_length(vision_config.hidden_size)
        self.writer.add_vision_block_count(vision_config.num_hidden_layers)
        self.writer.add_vision_head_count(vision_config.num_attention_heads)
        self.writer.add_vision_feed_forward_length(vision_config.intermediate_size)
        self.writer.add_vision_patch_size(vision_config.patch_size)
        self.writer.add_vision_spatial_merge_size(vision_config.spatial_merge_size)

        if hasattr(vision_config, "rms_norm_eps") and vision_config.rms_norm_eps:
            self.writer.add_vision_attention_layernorm_eps(vision_config.rms_norm_eps)

    def add_tokeniser(self, tokeniser_config: dict[str, Any]) -> None:
        """Add tokeniser metadata to GGUF file.

        Writes special token IDs and tokeniser model type to enable proper
        text processing during inference. Uses sensible defaults for missing
        configuration values.
        """
        self.writer.add_bos_token_id(tokeniser_config.get("bos_token_id", 1))
        self.writer.add_eos_token_id(tokeniser_config.get("eos_token_id", 2))
        self.writer.add_unk_token_id(tokeniser_config.get("unk_token_id", 0))
        self.writer.add_pad_token_id(tokeniser_config.get("pad_token_id", 0))
        self.writer.add_tokenizer_model(tokeniser_config.get("model_type", "llama"))

        logger.info("Added tokeniser configuration")

    def add_tensor(self, name: str, data: np.ndarray) -> None:
        """Add a tensor to the GGUF file.

        Writes tensor data with the specified name to the file. Handles
        data type conversions and validates tensor shapes.
        """
        self.writer.add_tensor(name, data)

    def finalise(self) -> None:
        """Write all data to file and close writer.

        Completes the GGUF file creation by writing headers, key-value data,
        and tensor data in the correct order. Ensures proper file closure.
        """
        logger.info(f"Writing GGUF file to {self.output_path}")
        self.writer.write_header_to_file()
        self.writer.write_kv_data_to_file()
        self.writer.write_tensors_to_file()
        self.writer.close()
        logger.info("GGUF file written successfully")


class GGUFConverter:
    """High-level GGUF conversion orchestrator.

    Coordinates the complete conversion workflow from source models to GGUF
    format, managing metadata extraction, tensor mapping, and file writing.
    """

    @staticmethod
    def convert_safetensors(
        model_path: Path,
        output_path: Path,
        model_config: ModelConfig,
        architecture: str,
        tensor_mapper: Any,
    ) -> bool:
        """Convert SafeTensors model to GGUF format.

        Orchestrates the conversion process including metadata setup, tensor
        loading with BFloat16 support, name mapping, and tokeniser integration.

        Returns:
            True if conversion successful, False otherwise.
        """
        logger.info(f"Converting {model_path.name} to GGUF...")

        # Create writer
        writer_wrapper = GGUFWriter(output_path, architecture)

        # Add metadata
        writer_wrapper.add_metadata(model_config, model_path.name)

        # Add vision metadata if present
        if model_config.vision_config:
            writer_wrapper.add_vision_metadata(model_config.vision_config)

        # Load and add tensors
        fs = FilesystemService()
        tensor_files = fs.find_safetensor_files(model_path)
        logger.info(f"Found {len(tensor_files)} tensor file(s)")

        tensor_count = 0
        for tensor_file in tensor_files:
            logger.info(f"Loading {tensor_file.name}...")
            with safe_open(tensor_file, framework="pt") as f:
                for tensor_name in f.keys():
                    tensor_data = f.get_tensor(tensor_name)

                    # Convert BFloat16 to Float32
                    if hasattr(tensor_data, "numpy"):
                        if torch and tensor_data.dtype == torch.bfloat16:
                            tensor_data = tensor_data.float()
                        tensor_data = tensor_data.numpy()

                    # Map tensor name
                    gguf_name = tensor_mapper.map_tensor_name(tensor_name)

                    if gguf_name:
                        writer_wrapper.add_tensor(gguf_name, tensor_data)
                        tensor_count += 1

                        if tensor_count % 100 == 0:
                            logger.info(f"  Processed {tensor_count} tensors...")

        logger.info(f"Total tensors processed: {tensor_count}")

        # Add tokeniser
        try:
            tok_config = ConfigParser.load_tokeniser_config(model_path)
            writer_wrapper.add_tokeniser(tok_config)
            logger.info("Tokeniser added")
        except Exception as e:
            logger.warning(f"Could not add tokeniser: {e}")

        # Finalise file
        writer_wrapper.finalise()

        file_size = fs.get_file_size(output_path)
        logger.info(f"Conversion complete! Output: {output_path} ({file_size})")

        return True
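A sketch of driving the converter end to end. The tensor mapper can be any object exposing map_tensor_name(); the minimal mapper and paths below are assumptions for illustration only.

# Illustrative only: convert a SafeTensors checkpoint to an unquantised GGUF file.
from pathlib import Path

from helpers.models.conversion import ModelConfig, TensorMapping
from helpers.services.gguf import GGUFConverter


class SimpleMapper:
    """Anything with map_tensor_name(name) -> str | None satisfies the converter."""

    def __init__(self) -> None:
        self.mapping = TensorMapping()

    def map_tensor_name(self, name: str) -> str | None:
        return self.mapping.direct_mappings.get(name)  # layer patterns omitted for brevity


model_dir = Path("./models/example-7b")  # hypothetical
config = ModelConfig(architectures=["LlamaForCausalLM"])  # other fields fall back to defaults
GGUFConverter.convert_safetensors(
    model_dir, model_dir / "example-7b-f32.gguf", config, "llama", SimpleMapper()
)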
helpers/services/huggingface.py (Normal file, 454 lines added)
@@ -0,0 +1,454 @@
"""HuggingFace operations service.

Handles all interactions with HuggingFace including model downloads,
uploads, README generation, and repository management. Uses UK English
spelling conventions throughout.
"""

from __future__ import annotations

import re
import subprocess
import tempfile
from pathlib import Path
from typing import TYPE_CHECKING

from helpers.logger import logger
from helpers.models.quantisation import QuantisationType

if TYPE_CHECKING:
    from helpers.models.quantisation import ModelSource, QuantisationResult


class HuggingFaceService:
    """Manages HuggingFace repository operations.

    Provides methods for downloading models, uploading files, and managing
    repositories. Handles authentication, error recovery, and progress tracking
    for robust interaction with HuggingFace services.
    """

    @staticmethod
    def get_username() -> str:
        """Get authenticated HuggingFace username.

        Retrieves the current user's HuggingFace username using the CLI.
        Requires prior authentication via `huggingface-cli login`.

        Returns:
            HuggingFace username.

        Raises:
            RuntimeError: If not authenticated or CLI not available.
        """
        try:
            result = subprocess.run(
                ["huggingface-cli", "whoami"],
                capture_output=True,
                text=True,
                check=True,
            )
            return result.stdout.strip()
        except (subprocess.CalledProcessError, FileNotFoundError) as err:
            msg = "Please log in to HuggingFace first: huggingface-cli login"
            raise RuntimeError(msg) from err

    @staticmethod
    def download_model(
        model_name: str, output_dir: Path, include_pattern: str | None = None
    ) -> None:
        """Download model from HuggingFace.

        Downloads a complete model or specific files matching a pattern.
        Creates the output directory if it doesn't exist. Supports filtered
        downloads for efficient bandwidth usage when only certain files are needed.
        """
        logger.info(f"Downloading {model_name} to {output_dir}")

        cmd = [
            "huggingface-cli",
            "download",
            model_name,
            "--local-dir",
            str(output_dir),
        ]

        if include_pattern:
            cmd.extend(["--include", include_pattern])

        subprocess.run(cmd, check=True)
        logger.info("Download complete")

    @staticmethod
    def upload_file(
        repo_id: str,
        local_path: Path,
        repo_path: str | None = None,
        create_repo: bool = False,
    ) -> None:
        """Upload a file to HuggingFace repository.

        Uploads a single file to the specified repository path. Can create
        the repository if it doesn't exist. Handles repository creation conflicts
        gracefully by retrying without the create flag when needed.

        Raises:
            CalledProcessError: If upload fails.
        """
        repo_path = repo_path or local_path.name
        logger.info(f"Uploading {local_path.name} to {repo_id}/{repo_path}")

        cmd = [
            "huggingface-cli",
            "upload",
            repo_id,
            str(local_path),
            repo_path,
        ]

        if create_repo:
            cmd.append("--create")

        try:
            subprocess.run(cmd, check=True, capture_output=True)
            logger.info(f"Uploaded {repo_path}")
        except subprocess.CalledProcessError:
            if create_repo:
                # Repository might already exist, retry without --create
                cmd = cmd[:-1]  # Remove --create flag
                subprocess.run(cmd, check=True)
                logger.info(f"Updated {repo_path}")
            else:
                raise


class ReadmeGenerator:
    """Generates README files for quantised models.

    Creates comprehensive README documentation including model cards,
    quantisation details, and status tracking. Supports both initial
    planning documentation and final result summaries.
    """

    def generate(
        self,
        model_source: ModelSource,
        results: dict[QuantisationType, QuantisationResult],
        models_dir: Path,
        output_repo: str | None = None,
    ) -> Path:
        """Generate README file for quantised model repository.

        Creates a comprehensive README with frontmatter, quantisation table,
        and original model information. Handles status tracking for planned,
        processing, and completed quantisations.

        Returns:
            Path to generated README file.
        """
        logger.info("Creating model card...")

        model_dir = models_dir / model_source.model_name
        readme_path = model_dir / "README.md"

        # Get original README content
        original_content = self._get_original_readme(model_source, model_dir)

        # Generate new README
        readme_content = self._generate_readme_content(
            model_source, results, original_content, output_repo
        )

        readme_path.write_text(readme_content)
        return readme_path

    def _get_original_readme(self, model_source: ModelSource, model_dir: Path) -> dict[str, str]:
        """Extract original README and metadata.

        Downloads or reads the original model's README for inclusion in the
        quantised model documentation. Parses YAML frontmatter if present.

        Returns:
            Dictionary with readme content, licence, tags, and frontmatter.
        """
        content = {"readme": "", "licence": "apache-2.0", "tags": "", "frontmatter": ""}

        # Try local file first
        readme_path = model_dir / "README.md"
        if readme_path.exists():
            content["readme"] = readme_path.read_text(encoding="utf-8")
            logger.info(f"Found original README ({len(content['readme'])} characters)")
        else:
            # Download separately
            content = self._download_readme(model_source)

        # Parse frontmatter if present
        if content["readme"].startswith("---\n"):
            content = self._parse_frontmatter(content["readme"])

        return content

    def _download_readme(self, model_source: ModelSource) -> dict[str, str]:
        """Download README from HuggingFace repository.

        Attempts to download just the README.md file from the source repository
        for efficient documentation extraction.

        Returns:
            Dictionary with readme content and default metadata.
        """
        content = {"readme": "", "licence": "apache-2.0", "tags": "", "frontmatter": ""}

        with tempfile.TemporaryDirectory() as temp_dir:
            try:
                logger.info(f"Downloading README from {model_source.source_model}...")
                subprocess.run(
                    [
                        "huggingface-cli",
                        "download",
                        model_source.source_model,
                        "--include",
                        "README.md",
                        "--local-dir",
                        temp_dir,
                    ],
                    check=True,
                    capture_output=True,
                )

                readme_path = Path(temp_dir) / "README.md"
                if readme_path.exists():
                    content["readme"] = readme_path.read_text(encoding="utf-8")
                    logger.info(f"Downloaded README ({len(content['readme'])} characters)")
            except subprocess.CalledProcessError as e:
                logger.warning(f"Failed to download README: {e}")

        return content

    def _parse_frontmatter(self, readme_text: str) -> dict[str, str]:
        """Parse YAML frontmatter from README.

        Extracts metadata from YAML frontmatter including licence, tags,
        and other model card fields.

        Returns:
            Dictionary with separated content and metadata.
        """
        lines = readme_text.split("\n")
        if lines[0] != "---":
            return {
                "readme": readme_text,
                "licence": "apache-2.0",
                "tags": "",
                "frontmatter": "",
            }

        frontmatter_end = -1
        for i, line in enumerate(lines[1:], 1):
            if line == "---":
                frontmatter_end = i
                break

        if frontmatter_end == -1:
            return {
                "readme": readme_text,
                "licence": "apache-2.0",
                "tags": "",
                "frontmatter": "",
            }

        frontmatter = "\n".join(lines[1:frontmatter_end])
        content = "\n".join(lines[frontmatter_end + 1 :])

        # Extract licence
        licence_match = re.search(r"^license:\s*(.+)$", frontmatter, re.MULTILINE)
        licence_val = licence_match.group(1).strip().strip('"') if licence_match else "apache-2.0"

        # Extract tags
        tags = []
        in_tags = False
        for line in frontmatter.split("\n"):
            if line.startswith("tags:"):
                in_tags = True
                continue
            if in_tags:
                if line.startswith("- "):
                    tags.append(line[2:].strip())
                elif line and not line.startswith(" "):
                    break

        return {
            "readme": content,
            "licence": licence_val,
            "tags": ",".join(tags),
            "frontmatter": frontmatter,
        }

    def _generate_readme_content(
        self,
        model_source: ModelSource,
        results: dict[QuantisationType, QuantisationResult],
        original_content: dict[str, str],
        output_repo: str | None = None,
    ) -> str:
        """Generate complete README content with quantisation details.

        Creates the full README including YAML frontmatter, quantisation status
        table, and original model information.

        Returns:
            Complete README markdown content.
        """
        # Build tags
        our_tags = [
            "quantised",
            "gguf",
            "q4_k_m",
            "q4_k_l",
            "q4_k_xl",
            "q4_k_xxl",
            "bartowski-method",
        ]
        original_tags = original_content["tags"].split(",") if original_content["tags"] else []
        all_tags = sorted(set(our_tags + original_tags))

        # Build frontmatter
        frontmatter = f"""---
license: {original_content["licence"]}
library_name: gguf
base_model: {model_source.source_model}
tags:
"""
        for tag in all_tags:
            if tag.strip():
                frontmatter += f"- {tag.strip()}\n"

        frontmatter += "---\n\n"

        # Build main content
        hf_url = f"https://huggingface.co/{model_source.source_model}"
        content = f"""# {model_source.original_author}-{model_source.model_name}-GGUF

GGUF quantisations of [{model_source.source_model}]({hf_url}) using Bartowski's method.

| Quantisation | Embeddings/Output | Attention | Feed-Forward | Status |
|--------------|-------------------|-----------|--------------|--------|
"""

        # Add results table
        for quant_type in [
            QuantisationType.Q4_K_M,
            QuantisationType.Q4_K_L,
            QuantisationType.Q4_K_XL,
            QuantisationType.Q4_K_XXL,
        ]:
            result = results.get(quant_type)
            if not result:
                result = type("Result", (), {"status": "planned", "success": False})()

            layers = self._get_layers_config(quant_type)
            status = self._format_status(result, model_source, quant_type, output_repo)

            content += (
                f"| {quant_type.value} | {layers['embeddings']} | "
                f"{layers['attention']} | {layers['ffn']} | {status} |\n"
            )

        content += "\n---\n\n"

        # Add original content
        if original_content["readme"]:
            content += "# Original Model Information\n\n" + original_content["readme"]
        else:
            content += f"## Original Model\n\nQuantisation of [{model_source.source_model}](https://huggingface.co/{model_source.source_model}).\n"

        return frontmatter + content

    def _get_layers_config(self, quant_type: QuantisationType) -> dict[str, str]:
        """Get layer configuration for quantisation type.

        Returns layer precision specifications for the quantisation table.

        Returns:
            Dictionary with embeddings, attention, and ffn precision labels.
        """
        configs = {
            QuantisationType.Q4_K_M: {
                "embeddings": "Q4_K_M",
                "attention": "Q4_K_M",
                "ffn": "Q4_K_M",
            },
            QuantisationType.Q4_K_L: {"embeddings": "Q6_K", "attention": "Q6_K", "ffn": "Q4_K_M"},
            QuantisationType.Q4_K_XL: {"embeddings": "Q8_0", "attention": "Q6_K", "ffn": "Q4_K_M"},
            QuantisationType.Q4_K_XXL: {"embeddings": "Q8_0", "attention": "Q8_0", "ffn": "Q4_K_M"},
        }
        return configs.get(
            quant_type, {"embeddings": "Unknown", "attention": "Unknown", "ffn": "Unknown"}
        )

    def _format_status(
        self,
        result: QuantisationResult,
        model_source: ModelSource,
        quant_type: QuantisationType,
        output_repo: str | None,
    ) -> str:
        """Format status indicator for README table.

        Creates appropriate status indicator based on quantisation state
        including progress indicators, file sizes, and download links.

        Returns:
            Formatted status string for table cell.
        """
        status_map = {
            "planned": "⏳ Planned",
            "processing": "🔄 Processing...",
            "uploading": "⬆️ Uploading...",
            "failed": "❌ Failed",
        }

        if hasattr(result, "status") and result.status in status_map:
            base_status = status_map[result.status]

            if result.status == "uploading" and hasattr(result, "file_size") and result.file_size:
                return f"{base_status} ({result.file_size})"
            if result.status == "completed" or (hasattr(result, "success") and result.success):
                return self._format_success_status(result, model_source, quant_type, output_repo)
            return base_status

        # Legacy support
        if hasattr(result, "success") and result.success:
            return self._format_success_status(result, model_source, quant_type, output_repo)
        return "❌ Failed"

    def _format_success_status(
        self,
        result: QuantisationResult,
        model_source: ModelSource,
        quant_type: QuantisationType,
        output_repo: str | None,
    ) -> str:
        """Format successful quantisation status with download link.

        Creates a download link if repository information is available,
        otherwise shows file size.

        Returns:
            Formatted success status string.
        """
        if not output_repo:
            return (
                f"✅ {result.file_size}"
                if hasattr(result, "file_size") and result.file_size
                else "✅ Available"
            )

        filename = (
            f"{model_source.original_author}-{model_source.model_name}-{quant_type.value}.gguf"
        )
        url = f"https://huggingface.co/{output_repo}?show_file_info={filename}"

        if hasattr(result, "file_size") and result.file_size:
            return f"[✅ {result.file_size}]({url})"
        return f"[✅ Available]({url})"
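A usage sketch with made-up repository names; it assumes huggingface-cli is installed and already authenticated.

# Illustrative only: download a source model, then push a quantised file back up.
from pathlib import Path

from helpers.services.huggingface import HuggingFaceService

username = HuggingFaceService.get_username()  # requires `huggingface-cli login`
work_dir = Path("./models/example-7b")  # hypothetical
HuggingFaceService.download_model("ExampleOrg/example-7b", work_dir)
HuggingFaceService.upload_file(
    f"{username}/example-7b-GGUF",  # hypothetical target repository
    work_dir / "ExampleOrg-example-7b-Q4_K_M.gguf",
    create_repo=True,
)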
helpers/services/llama_cpp.py (Normal file, 417 lines added)
@@ -0,0 +1,417 @@
"""llama.cpp environment and operations service.

Manages llama.cpp binary discovery, environment setup, and imatrix generation.
Provides consistent interface for interacting with llama.cpp tools across
different installation methods.
"""

from __future__ import annotations

import subprocess
from pathlib import Path

from helpers.logger import logger
from helpers.models.quantisation import LlamaCppEnvironment
from helpers.services.filesystem import FilesystemService


class EnvironmentManager:
    """Manages llama.cpp environment setup and binary discovery.

    Handles detection of local binaries, repository setup, and conversion
    script location. Provides fallback strategies for different installation
    scenarios including local builds and repository-based setups.
    """

    def __init__(self, work_dir: Path) -> None:
        """Initialise EnvironmentManager."""
        self.work_dir = work_dir
        self.llama_cpp_dir = work_dir / "llama.cpp"
        self.fs = FilesystemService()

    def setup(self) -> LlamaCppEnvironment:
        """Set up llama.cpp environment with automatic detection.

        Checks for local llama.cpp binaries first, then falls back to
        repository-based setup if needed. Handles conversion script location,
        dependency installation, and path resolution.

        Returns:
            Configured LlamaCppEnvironment instance.
        """
        # Check for local binaries first
        local_env = self._check_local_binaries()
        if local_env:
            return local_env

        # Setup repository if needed
        return self.setup_repository()

    def _check_local_binaries(self) -> LlamaCppEnvironment | None:
        """Check for existing llama.cpp binaries in current directory.

        Searches for quantise and CLI binaries in the current directory
        and standard installation paths. Also locates conversion scripts.

        Returns:
            LlamaCppEnvironment if binaries found, None otherwise.
        """
        quantise_bin = Path("./llama-quantize")
        cli_bin = Path("./llama-cli")

        if not (quantise_bin.exists() and cli_bin.exists()):
            return None

        logger.info("Found llama.cpp binaries in current directory")

        # Check for conversion script
        convert_script = self._find_convert_script()
        if convert_script:
            logger.info(f"Found conversion script: {convert_script}")
            return LlamaCppEnvironment(
                quantise_binary=quantise_bin.resolve(),
                cli_binary=cli_bin.resolve(),
                convert_script=convert_script,
                use_repo=False,
            )

        logger.warning("No conversion script found in current directory")
        logger.info("Will use llama.cpp repository method for conversion")
        return LlamaCppEnvironment(
            quantise_binary=quantise_bin.resolve(),
            cli_binary=cli_bin.resolve(),
            convert_script=f"python3 {self.llama_cpp_dir}/convert_hf_to_gguf.py",
            use_repo=True,
        )

    def _find_convert_script(self) -> str | None:
        """Find conversion script in current directory.

        Searches for various naming conventions of the HF to GGUF
        conversion script.

        Returns:
            Command to run conversion script, or None if not found.
        """
        scripts = [
            "./llama-convert-hf-to-gguf",
            "python3 ./convert_hf_to_gguf.py",
            "python3 ./convert-hf-to-gguf.py",
        ]

        for script in scripts:
            if script.startswith("python3"):
                script_path = script.split(" ", 1)[1]
                if Path(script_path).exists():
                    return script
            elif Path(script).exists():
                return script
        return None

    def setup_repository(self) -> LlamaCppEnvironment:
        """Set up llama.cpp repository for conversion scripts.

        Clones the llama.cpp repository if not present and installs
        Python dependencies for model conversion.

        Returns:
            LlamaCppEnvironment configured with repository paths.
        """
        if not self.llama_cpp_dir.exists():
            logger.info("Cloning llama.cpp for conversion script...")
            subprocess.run(
                [
                    "git",
                    "clone",
                    "https://github.com/ggerganov/llama.cpp.git",
                    str(self.llama_cpp_dir),
                ],
                check=True,
            )

            # Install Python requirements
            logger.info("Installing Python requirements...")
            subprocess.run(
                [
                    "pip3",
                    "install",
                    "-r",
                    "requirements.txt",
                    "--break-system-packages",
                    "--root-user-action=ignore",
                ],
                cwd=self.llama_cpp_dir,
                check=True,
            )

            # Install additional conversion dependencies
            logger.info("Installing additional conversion dependencies...")
            subprocess.run(
                [
                    "pip3",
                    "install",
                    "transformers",
                    "sentencepiece",
                    "protobuf",
                    "--break-system-packages",
                    "--root-user-action=ignore",
                ],
                check=True,
            )
        else:
            logger.info("llama.cpp repository already exists")

        # Use local binaries but repo conversion script
        return LlamaCppEnvironment(
            quantise_binary=Path("./llama-quantize").resolve(),
            cli_binary=Path("./llama-cli").resolve(),
            convert_script=f"python3 {self.llama_cpp_dir}/convert_hf_to_gguf.py",
            use_repo=False,
        )


class IMatrixGenerator:
    """Handles importance matrix generation for quantisation guidance.

    Generates or locates importance matrices that guide quantisation
    decisions, helping preserve model quality by identifying critical
    tensors requiring higher precision.
    """

    def __init__(self) -> None:
        """Initialise IMatrixGenerator."""
        self.fs = FilesystemService()

    def generate_imatrix(
        self, f16_model_path: Path, llama_env: LlamaCppEnvironment, model_dir: Path
    ) -> Path | None:
        """Generate importance matrix for quantisation guidance.

        Searches for existing imatrix files first, provides interactive
        prompts for user-supplied matrices, then generates new matrices
        using calibration data if necessary.

        Returns:
            Path to imatrix file, or None if generation fails.
        """
        imatrix_path = model_dir / "imatrix.dat"

        # Check for existing imatrix
        if imatrix_path.exists():
            logger.info(f"Found existing imatrix: {imatrix_path.name}")
            return imatrix_path

        # Try user-provided imatrix
        user_imatrix = self._prompt_for_user_imatrix(model_dir, imatrix_path)
        if user_imatrix:
            return user_imatrix

        # Generate new imatrix
        calibration_file = self._get_calibration_file()
        if not calibration_file:
            return None

        return self._generate_new_imatrix(f16_model_path, llama_env, imatrix_path, calibration_file)

    def _prompt_for_user_imatrix(self, model_dir: Path, imatrix_path: Path) -> Path | None:
        """Prompt user for existing imatrix file.

        Returns:
            Path to user-provided imatrix, or None if not available.
        """
        logger.info(f"Model directory: {model_dir}")
        logger.info(f"Looking for imatrix file at: {imatrix_path}")
        logger.info(
            "Tip: You can download pre-computed imatrix files from Bartowski's repositories!"
        )
        logger.info(
            " Example: https://huggingface.co/bartowski/MODEL-NAME-GGUF/resolve/main/MODEL-NAME.imatrix"
        )

        response = (
            input("\n❓ Do you have an imatrix file to place in the model directory? (y/N): ")
            .strip()
            .lower()
        )

        if response != "y":
            return None

        logger.info(f"Please place your imatrix.dat file in: {model_dir}")
        input("⏳ Press Enter when you've placed the imatrix.dat file (or Ctrl+C to cancel)...")

        if imatrix_path.exists():
            file_size = self.fs.get_file_size(imatrix_path)
            logger.info(f"Found imatrix file! ({file_size})")
            return imatrix_path

        logger.warning("No imatrix.dat file found - continuing with automatic generation")
        return None

    def _get_calibration_file(self) -> Path | None:
        """Get calibration data file for imatrix generation.

        Returns:
            Path to calibration file, or None if not found.
|
||||
"""
|
||||
calibration_file = Path(__file__).parent.parent.parent / "resources" / "imatrix_data.txt"
|
||||
if not calibration_file.exists():
|
||||
logger.warning("resources/imatrix_data.txt not found - skipping imatrix generation")
|
||||
logger.info(
|
||||
"Download from: https://gist.githubusercontent.com/bartowski1182/"
|
||||
"eb213dccb3571f863da82e99418f81e8/raw/calibration_datav3.txt"
|
||||
)
|
||||
return None
|
||||
return calibration_file
|
||||
|
||||
def _generate_new_imatrix(
|
||||
self,
|
||||
f16_model_path: Path,
|
||||
llama_env: LlamaCppEnvironment,
|
||||
imatrix_path: Path,
|
||||
calibration_file: Path,
|
||||
) -> Path | None:
|
||||
"""Generate new importance matrix using calibration data.
|
||||
|
||||
Returns:
|
||||
Path to generated imatrix, or None if generation fails.
|
||||
"""
|
||||
logger.info("Generating importance matrix (this may take 1-4 hours for large models)...")
|
||||
logger.info(f"Model: {f16_model_path.name}")
|
||||
logger.info(f"Calibration: {calibration_file}")
|
||||
logger.info(f"Output: {imatrix_path}")
|
||||
|
||||
# Find imatrix binary
|
||||
imatrix_binary = self._find_imatrix_binary(llama_env)
|
||||
if not imatrix_binary:
|
||||
logger.warning("llama-imatrix binary not found - skipping imatrix generation")
|
||||
logger.info("Make sure llama-imatrix is in the same directory as llama-quantize")
|
||||
return None
|
||||
|
||||
# Build and execute command
|
||||
cmd = self._build_imatrix_command(
|
||||
imatrix_binary, f16_model_path, calibration_file, imatrix_path
|
||||
)
|
||||
return self._execute_imatrix_generation(cmd, imatrix_path)
|
||||
|
||||
def _build_imatrix_command(
|
||||
self, binary: Path, model_path: Path, calibration_file: Path, output_path: Path
|
||||
) -> list[str]:
|
||||
"""Build imatrix generation command.
|
||||
|
||||
Returns:
|
||||
Command arguments as list.
|
||||
"""
|
||||
return [
|
||||
str(binary),
|
||||
"-m",
|
||||
str(model_path),
|
||||
"-f",
|
||||
str(calibration_file),
|
||||
"-o",
|
||||
str(output_path),
|
||||
"--process-output",
|
||||
"--output-frequency",
|
||||
"10",
|
||||
"--save-frequency",
|
||||
"50",
|
||||
"-t",
|
||||
"8",
|
||||
"-c",
|
||||
"2048",
|
||||
"-b",
|
||||
"512",
|
||||
]
|
||||
|
||||
def _execute_imatrix_generation(self, cmd: list[str], imatrix_path: Path) -> Path | None:
|
||||
"""Execute imatrix generation command with real-time output.
|
||||
|
||||
Returns:
|
||||
Path to generated imatrix file, or None if generation fails.
|
||||
"""
|
||||
logger.info(f"Running: {' '.join(cmd)}")
|
||||
logger.info("Starting imatrix generation... (progress will be shown)")
|
||||
|
||||
try:
|
||||
process = subprocess.Popen(
|
||||
cmd,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.STDOUT,
|
||||
universal_newlines=True,
|
||||
bufsize=1,
|
||||
)
|
||||
|
||||
self._stream_imatrix_output(process)
|
||||
|
||||
return_code = process.poll()
|
||||
if return_code == 0:
|
||||
return self._validate_imatrix_output(imatrix_path)
|
||||
|
||||
except KeyboardInterrupt:
|
||||
logger.info("imatrix generation cancelled by user")
|
||||
process.terminate()
|
||||
return None
|
||||
except Exception as e:
|
||||
logger.error(f"imatrix generation failed with exception: {e}")
|
||||
return None
|
||||
else:
|
||||
logger.error(f"imatrix generation failed with return code {return_code}")
|
||||
return None
|
||||
|
||||
def _stream_imatrix_output(self, process: subprocess.Popen) -> None:
|
||||
"""Stream imatrix generation output in real-time."""
|
||||
while True:
|
||||
if process.stdout is not None:
|
||||
output = process.stdout.readline()
|
||||
else:
|
||||
break
|
||||
if not output and process.poll() is not None:
|
||||
break
|
||||
if output:
|
||||
line = output.strip()
|
||||
if self._should_log_imatrix_line(line):
|
||||
logger.info(line)
|
||||
|
||||
def _should_log_imatrix_line(self, line: str) -> bool:
|
||||
"""Determine if imatrix output line should be logged.
|
||||
|
||||
Returns:
|
||||
True if line should be logged, False otherwise.
|
||||
"""
|
||||
keywords = ["Computing imatrix", "perplexity:", "save_imatrix", "entries =", "ETA"]
|
||||
return any(keyword in line for keyword in keywords) or line.startswith("[")
|
||||
|
||||
def _validate_imatrix_output(self, imatrix_path: Path) -> Path | None:
|
||||
"""Validate generated imatrix file.
|
||||
|
||||
Returns:
|
||||
Path to imatrix if valid, None otherwise.
|
||||
"""
|
||||
if imatrix_path.exists():
|
||||
file_size = self.fs.get_file_size(imatrix_path)
|
||||
logger.info(f"imatrix generation successful! ({file_size})")
|
||||
return imatrix_path
|
||||
logger.error("imatrix generation completed but file not found")
|
||||
return None
|
||||
|
||||
def _find_imatrix_binary(self, llama_env: LlamaCppEnvironment) -> Path | None:
|
||||
"""Find llama-imatrix binary in common locations.
|
||||
|
||||
Searches for the imatrix binary in the current directory and
|
||||
standard installation paths.
|
||||
|
||||
Returns:
|
||||
Path to imatrix binary, or None if not found.
|
||||
"""
|
||||
candidates = [
|
||||
Path("./llama-imatrix"),
|
||||
llama_env.quantise_binary.parent / "llama-imatrix",
|
||||
Path("/usr/local/bin/llama-imatrix"),
|
||||
Path("/usr/bin/llama-imatrix"),
|
||||
]
|
||||
|
||||
for candidate in candidates:
|
||||
if candidate.exists() and candidate.is_file():
|
||||
return candidate
|
||||
|
||||
return None
|
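# Usage sketch for the two services above. Paths and file names are illustrative
# assumptions, not part of the module:
#
#   env_manager = EnvironmentManager(Path.cwd() / "quantisation_work")
#   llama_env = env_manager.setup()  # local binaries if present, else cloned repo
#
# The llama-imatrix invocation assembled by _build_imatrix_command expands to
# roughly:
#
#   ./llama-imatrix -m model-f16.gguf -f resources/imatrix_data.txt -o imatrix.dat \
#       --process-output --output-frequency 10 --save-frequency 50 -t 8 -c 2048 -b 512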
397
helpers/services/orchestrator.py
Normal file
397
helpers/services/orchestrator.py
Normal file
|
@ -0,0 +1,397 @@
|
|||
"""Quantisation orchestration service.

High-level orchestration of the complete quantisation workflow from model
acquisition through processing to upload. Manages parallel processing,
status tracking, and cleanup operations for efficient resource utilisation.
"""

from __future__ import annotations

from concurrent.futures import Future, ThreadPoolExecutor
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any

from helpers.config.quantisation_configs import QUANTISATION_CONFIGS, SUPPORTED_QUANTISATION_TYPES
from helpers.logger import logger
from helpers.models.quantisation import (
    ModelSource,
    QuantisationContext,
    QuantisationResult,
    QuantisationType,
)
from helpers.services.huggingface import ReadmeGenerator
from helpers.services.llama_cpp import EnvironmentManager, IMatrixGenerator
from helpers.services.quantisation import HuggingFaceUploader, ModelManager, QuantisationEngine
from helpers.utils.tensor_mapping import URLParser


@dataclass(slots=True)
class QuantisationOrchestrator:
    """Orchestrates the complete quantisation workflow.

    Uses dataclass with slots for efficient memory usage and dependency injection
    for modular service interaction following SOLID principles.
    """

    work_dir: Path = field(default_factory=lambda: Path.cwd() / "quantisation_work")
    use_imatrix: bool = True
    imatrix_base: str = "Q4_K_M"
    no_upload: bool = False

    # Service dependencies with factory defaults
    url_parser: URLParser = field(default_factory=URLParser)
    quantisation_engine: QuantisationEngine = field(default_factory=QuantisationEngine)
    imatrix_generator: IMatrixGenerator = field(default_factory=IMatrixGenerator)
    readme_generator: ReadmeGenerator = field(default_factory=ReadmeGenerator)
    uploader: HuggingFaceUploader = field(default_factory=HuggingFaceUploader)

    # Computed properties
    models_dir: Path = field(init=False)
    environment_manager: EnvironmentManager = field(init=False)
    model_manager: ModelManager = field(init=False)

    def __post_init__(self) -> None:
        """Initialise computed properties after dataclass construction."""
        self.models_dir = self.work_dir / "models"
        self.environment_manager = EnvironmentManager(self.work_dir)
        self.model_manager = ModelManager(self.models_dir, self.environment_manager)

    def quantise(self, url: str) -> dict[QuantisationType, QuantisationResult]:
        """Main quantisation workflow orchestrating model processing from URL to upload.

        Returns:
            dict[QuantisationType, QuantisationResult]: Quantisation results for each type.
        """
        logger.info("Starting Bartowski quantisation process...")

        # Setup and preparation
        model_source, llama_env, f16_model_path, imatrix_path, output_repo = (
            self._setup_environment(url)
        )

        # Create initial repository
        self._create_initial_repository(model_source, output_repo)

        # Execute all quantisations
        results = self._execute_quantisations(
            model_source, llama_env, f16_model_path, imatrix_path, output_repo
        )

        # Cleanup
        self._cleanup_files(f16_model_path, model_source)

        self._print_completion_summary(model_source, results, output_repo)
        return results

    def _setup_environment(self, url: str) -> tuple[ModelSource, Any, Path, Path | None, str]:
        """Setup environment and prepare model for quantisation.

        Returns:
            Tuple of (model_source, llama_env, f16_model_path, imatrix_path, output_repo).
        """
        model_source = self.url_parser.parse(url)
        self._print_model_info(model_source)

        self.models_dir.mkdir(parents=True, exist_ok=True)
        llama_env = self.environment_manager.setup()

        f16_model_path = self.model_manager.prepare_model(model_source, llama_env)

        imatrix_path = None
        if self.use_imatrix:
            logger.info("Generating importance matrix (imatrix)...")
            imatrix_path = self.imatrix_generator.generate_imatrix(
                f16_model_path, llama_env, self.models_dir / model_source.model_name
            )

        output_repo = (
            f"{self.uploader.get_username()}/"
            f"{model_source.original_author}-{model_source.model_name}-GGUF"
        )

        return model_source, llama_env, f16_model_path, imatrix_path, output_repo

    def _create_initial_repository(self, model_source: ModelSource, output_repo: str) -> None:
        """Create initial repository with planned quantisations."""
        logger.info("Creating initial README with planned quantisations...")
        planned_results = {
            qt: QuantisationResult(quantisation_type=qt, success=False, status="planned")
            for qt in SUPPORTED_QUANTISATION_TYPES
        }
        readme_path = self.readme_generator.generate(
            model_source, planned_results, self.models_dir, output_repo
        )

        if not self.no_upload:
            logger.info("Creating repository with planned quantisations...")
            self.uploader.upload_readme(output_repo, readme_path)
        else:
            logger.info("Skipping repository creation (--no-upload specified)")

    def _execute_quantisations(
        self,
        model_source: ModelSource,
        llama_env: Any,
        f16_model_path: Path,
        imatrix_path: Path | None,
        output_repo: str,
    ) -> dict[QuantisationType, QuantisationResult]:
        """Execute all quantisation types with parallel uploads.

        Returns:
            dict[QuantisationType, QuantisationResult]: Quantisation results for each type.
        """
        results: dict[QuantisationType, QuantisationResult] = {}
        upload_futures: list[Future[None]] = []

        with ThreadPoolExecutor(max_workers=1, thread_name_prefix="uploader") as upload_executor:
            for quant_type in SUPPORTED_QUANTISATION_TYPES:
                result = self._process_single_quantisation(
                    quant_type,
                    model_source,
                    llama_env,
                    f16_model_path,
                    imatrix_path,
                    output_repo,
                    results,
                    upload_executor,
                    upload_futures,
                )
                results[quant_type] = result

            self._wait_for_uploads(upload_futures)

        return results

    def _process_single_quantisation(
        self,
        quant_type: QuantisationType,
        model_source: ModelSource,
        llama_env: Any,
        f16_model_path: Path,
        imatrix_path: Path | None,
        output_repo: str,
        results: dict[QuantisationType, QuantisationResult],
        upload_executor: ThreadPoolExecutor,
        upload_futures: list,
    ) -> QuantisationResult:
        """Process a single quantisation type.

        Returns:
            QuantisationResult: Result of the quantisation attempt.
        """
        try:
            logger.info(f"Starting {quant_type.value} quantisation...")
            config = QUANTISATION_CONFIGS[quant_type]

            # Update status to processing
            result = QuantisationResult(quantisation_type=quant_type, success=False)
            result.status = "processing"
            results[quant_type] = result

            self._update_readme_status(model_source, results, output_repo)

            # Perform quantisation
            context = QuantisationContext(
                f16_model_path=f16_model_path,
                model_source=model_source,
                config=config,
                llama_env=llama_env,
                models_dir=self.models_dir,
                imatrix_path=imatrix_path,
                base_quant=self.imatrix_base,
            )
            result = self.quantisation_engine.quantise(context)

            self._handle_quantisation_result(
                result,
                quant_type,
                model_source,
                results,
                output_repo,
                upload_executor,
                upload_futures,
            )
        except Exception as e:
            return self._handle_quantisation_error(
                e, quant_type, model_source, results, output_repo
            )
        else:
            return result

    def _handle_quantisation_result(
        self,
        result: QuantisationResult,
        quant_type: QuantisationType,
        model_source: ModelSource,
        results: dict[QuantisationType, QuantisationResult],
        output_repo: str,
        upload_executor: ThreadPoolExecutor,
        upload_futures: list,
    ) -> None:
        """Handle successful or failed quantisation result."""
        if result.success and result.file_path:
            quant_str = getattr(result.quantisation_type, "value", result.quantisation_type)
            logger.info(f"Starting parallel upload of {quant_str}...")
            upload_future = upload_executor.submit(
                self._upload_and_cleanup,
                output_repo,
                result.file_path,
                quant_type,
                model_source,
                results,
            )
            upload_futures.append(upload_future)
            result.file_path = None  # Mark as being uploaded
            result.status = "uploading"
        else:
            result.status = "failed"

        self._update_readme_status(model_source, results, output_repo)

    def _handle_quantisation_error(
        self,
        error: Exception,
        quant_type: QuantisationType,
        model_source: ModelSource,
        results: dict[QuantisationType, QuantisationResult],
        output_repo: str,
    ) -> QuantisationResult:
        """Handle quantisation processing error.

        Returns:
            QuantisationResult: Failed quantisation result with error information.
        """
        logger.error(f"Error processing {quant_type.value}: {error}")
        result = QuantisationResult(quantisation_type=quant_type, success=False)
        result.status = "failed"
        result.error_message = str(error)

        try:
            self._update_readme_status(model_source, results, output_repo)
        except Exception as readme_error:
            logger.error(f"Failed to update README after error: {readme_error}")

        return result

    def _update_readme_status(
        self,
        model_source: ModelSource,
        results: dict[QuantisationType, QuantisationResult],
        output_repo: str,
    ) -> None:
        """Update README with current quantisation status."""
        if not self.no_upload:
            updated_readme_path = self.readme_generator.generate(
                model_source, results, self.models_dir, output_repo
            )
            self.uploader.upload_readme(output_repo, updated_readme_path)

    def _wait_for_uploads(self, upload_futures: list) -> None:
        """Wait for all parallel uploads to complete."""
        logger.info("Waiting for any remaining uploads to complete...")
        for future in upload_futures:
            try:
                future.result(timeout=300)  # 5 minute timeout per upload
            except Exception as e:
                logger.warning(f"Upload error: {e}")

    def _cleanup_files(self, f16_model_path: Path, model_source: ModelSource) -> None:
        """Clean up temporary files after processing."""
        if f16_model_path.exists():
            logger.info(f"Removing F16 model {f16_model_path.name} to save disk space...")
            f16_model_path.unlink()

        if not model_source.is_gguf_repo:
            self._cleanup_original_model(model_source)

    def _cleanup_original_model(self, model_source: ModelSource) -> None:
        """Clean up original safetensors/PyTorch files after successful conversion."""
        model_dir = self.models_dir / model_source.model_name

        pytorch_files = list(model_dir.glob("pytorch_model*.bin"))
        if pytorch_files:
            logger.info(f"Removing {len(pytorch_files)} PyTorch model files to save disk space...")
            for file in pytorch_files:
                file.unlink()

        logger.info("Keeping config files, tokeniser, and metadata for reference")

    def _upload_and_cleanup(
        self,
        output_repo: str,
        file_path: Path,
        quant_type: QuantisationType,
        model_source: ModelSource,
        results: dict[QuantisationType, QuantisationResult],
    ) -> None:
        """Upload file and clean up (runs in background thread)."""
        try:
            logger.info(f"[PARALLEL] Uploading {quant_type}...")
            self.uploader.upload_model_file(output_repo, file_path)

            logger.info(f"[PARALLEL] Removing {file_path.name} to save disk space...")
            file_path.unlink()

            results[quant_type].status = "completed"
            updated_readme_path = self.readme_generator.generate(
                model_source, results, self.models_dir, output_repo
            )
            self.uploader.upload_readme(output_repo, updated_readme_path)

            logger.info(f"[PARALLEL] {quant_type} upload and cleanup complete")
        except Exception as e:
            logger.error(f"[PARALLEL] Failed to upload {quant_type}: {e}")
            results[quant_type].status = "failed"
            results[quant_type].error_message = str(e)

            updated_readme_path = self.readme_generator.generate(
                model_source, results, self.models_dir, output_repo
            )
            self.uploader.upload_readme(output_repo, updated_readme_path)
            raise

    def _print_model_info(self, model_source: ModelSource) -> None:
        """Print model information."""
        logger.info(f"Source URL: {model_source.url}")
        logger.info(f"Source model: {model_source.source_model}")
        logger.info(f"Original author: {model_source.original_author}")
        logger.info(f"Model name: {model_source.model_name}")
        logger.info(f"Your HF username: {self.uploader.get_username()}")
        logger.info(f"Working directory: {self.work_dir}")

    def _print_completion_summary(
        self,
        model_source: ModelSource,
        results: dict[QuantisationType, QuantisationResult],
        output_repo: str,
    ) -> None:
        """Print completion summary."""
        successful_results = [r for r in results.values() if r.success]

        if successful_results:
            logger.info("Complete! Your quantised models are available at:")
            logger.info(f"  https://huggingface.co/{output_repo}")
            logger.info("Model info:")
            logger.info(f"  - Source URL: {model_source.url}")
            logger.info(f"  - Original: {model_source.source_model}")
            logger.info(
                "  - Method: "
                f"{'Direct GGUF download' if model_source.is_gguf_repo else 'HF model conversion'}"
            )
            logger.info(f"  - Quantised: {output_repo}")

            for result in successful_results:
                if result.file_size:
                    filename = (
                        f"{model_source.original_author}-{model_source.model_name}-"
                        f"{result.quantisation_type}.gguf"
                    )
                    logger.info(f"  - {result.quantisation_type}: {filename} ({result.file_size})")
        else:
            logger.error(
                "All quantisations failed - repository created with documentation "
                "but no model files"
            )
            logger.error(f"  Repository: https://huggingface.co/{output_repo}")
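# End-to-end usage sketch. The source URL is an illustrative assumption; the
# username lookup (and any uploads) require prior huggingface-cli authentication:
#
#   orchestrator = QuantisationOrchestrator(no_upload=True)
#   results = orchestrator.quantise("https://huggingface.co/example-org/example-model")
#   for quant_type, result in results.items():
#       print(quant_type.value, result.status)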
486
helpers/services/quantisation.py
Normal file
486
helpers/services/quantisation.py
Normal file
|
@ -0,0 +1,486 @@
|
|||
"""Quantisation operations service.

Provides modular quantisation engine, model management, and upload capabilities
for GGUF model processing. Consolidates quantisation logic from various tools
into reusable components following SOLID principles.
"""

from __future__ import annotations

import shutil
import subprocess
from typing import TYPE_CHECKING

from helpers.logger import logger
from helpers.models.quantisation import (
    ModelSource,
    QuantisationContext,
    QuantisationResult,
    QuantisationType,
)
from helpers.services.filesystem import FilesystemService

if TYPE_CHECKING:
    from pathlib import Path

    from helpers.models.quantisation import LlamaCppEnvironment
    from helpers.services.llama_cpp import EnvironmentManager


class QuantisationEngine:
    """Handles the actual quantisation process with configurable methods.

    Provides flexible quantisation execution supporting multiple tensor
    precision configurations, importance matrices, and fallback strategies.
    Encapsulates llama-quantize binary interactions with real-time output.
    """

    def __init__(self) -> None:
        """Initialise quantisation engine."""
        self.fs = FilesystemService()

    def quantise(self, context: QuantisationContext) -> QuantisationResult:
        """Perform quantisation using the specified configuration.

        Executes quantisation with primary and fallback methods, handling
        tensor-specific precision overrides and importance matrix guidance.

        Returns:
            QuantisationResult with success status and file information.
        """
        logger.info(
            f"⚙️ Creating {context.config.name} quantisation ({context.config.description})..."
        )

        output_path = context.get_output_path()

        logger.info(f"🎯 Attempting {context.config.name} quantisation...")
        logger.info(f"📝 Source: {context.f16_model_path}")
        logger.info(f"📝 Target: {output_path}")

        # Try primary method
        if self._try_quantisation_method(
            context, output_path, context.config.tensor_types, "method 1"
        ):
            return self._create_success_result(context.config.name, output_path, "method 1")

        # Try fallback methods
        for i, fallback_method in enumerate(context.config.fallback_methods, 2):
            method_name = f"method {i}"
            if self._try_quantisation_method(context, output_path, fallback_method, method_name):
                return self._create_success_result(context.config.name, output_path, method_name)

        logger.error("All %s quantisation methods failed", context.config.name)
        return QuantisationResult(
            quantisation_type=QuantisationType(context.config.name),
            success=False,
            error_message="All quantisation methods failed",
        )

    def _try_quantisation_method(
        self,
        context: QuantisationContext,
        output_path: Path,
        tensor_config: dict[str, str],
        method_name: str,
    ) -> bool:
        """Try a specific quantisation method with real-time output.

        Builds and executes llama-quantize command with appropriate parameters,
        streaming output for progress monitoring.

        Returns:
            True if quantisation successful, False otherwise.
        """
        logger.info(f"🔍 Trying {method_name}...")

        cmd = self._build_quantisation_command(context, output_path, tensor_config)
        return self._execute_quantisation_command(cmd, method_name)

    def _build_quantisation_command(
        self, context: QuantisationContext, output_path: Path, tensor_config: dict[str, str]
    ) -> list[str]:
        """Build quantisation command with all required parameters.

        Returns:
            List of command arguments.
        """
        cmd = [str(context.llama_env.quantise_binary)]

        # Add imatrix if available
        if context.imatrix_path and context.imatrix_path.exists():
            cmd.extend(["--imatrix", str(context.imatrix_path)])
            logger.info(f"🧮 Using imatrix: {context.imatrix_path.name}")

        # Add tensor type arguments
        self._add_tensor_type_arguments(cmd, tensor_config)

        cmd.extend([str(context.f16_model_path), str(output_path), context.base_quant])
        return cmd

    def _add_tensor_type_arguments(self, cmd: list[str], tensor_config: dict[str, str]) -> None:
        """Add tensor type arguments to command."""
        if not tensor_config:
            return

        for tensor_name, quant_type in tensor_config.items():
            if tensor_name.startswith(("token-embedding-type", "output-tensor-type")):
                cmd.extend([f"--{tensor_name}", quant_type])
            else:
                cmd.extend(["--tensor-type", f"{tensor_name}={quant_type}"])

    def _execute_quantisation_command(self, cmd: list[str], method_name: str) -> bool:
        """Execute quantisation command with real-time output.

        Returns:
            True if quantisation successful, False otherwise.
        """
        logger.info(f"💻 Running: {' '.join(cmd)}")
        logger.info("⏳ Quantisation in progress... (this may take several minutes)")

        try:
            process = subprocess.Popen(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                universal_newlines=True,
                bufsize=1,
            )

            self._stream_quantisation_output(process)

            return_code = process.poll()
            if return_code == 0:
                logger.info(f"✅ {method_name} quantisation successful!")
                return True
        except Exception as e:
            logger.info(f"❌ {method_name} failed with exception: {e}")
            return False
        else:
            logger.info(f"❌ {method_name} failed with return code {return_code}")
            return False

    def _stream_quantisation_output(self, process: subprocess.Popen) -> None:
        """Stream quantisation output in real-time."""
        while True:
            if process.stdout is not None:
                output = process.stdout.readline()
            else:
                break
            if not output and process.poll() is not None:
                break
            if output:
                logger.info(f"📊 {output.strip()}")

    def _create_success_result(
        self, quant_type: str, output_path: Path, method_used: str
    ) -> QuantisationResult:
        """Create successful quantisation result with file metadata.

        Returns:
            QuantisationResult with file path and size information.
        """
        file_size = self.fs.get_file_size(output_path)
        return QuantisationResult(
            quantisation_type=QuantisationType(quant_type),
            success=True,
            file_path=output_path,
            file_size=file_size,
            method_used=method_used,
        )


class ModelManager:
    """Handles model downloading and preparation for quantisation.

    Manages both GGUF repository downloads and HuggingFace model conversions,
    providing unified interface for model acquisition and preparation.
    """

    def __init__(self, models_dir: Path, environment_manager: EnvironmentManager) -> None:
        """Initialise model manager with storage and environment configuration.

        Sets up model storage directory and links to environment manager for
        conversion script access and llama.cpp tool discovery.
        """
        self.models_dir = models_dir
        self.environment_manager = environment_manager
        self.fs = FilesystemService()

    def prepare_model(self, model_source: ModelSource, llama_env: LlamaCppEnvironment) -> Path:
        """Prepare model for quantisation and return F16 model path.

        Handles both GGUF repository downloads and regular HuggingFace model
        conversion workflows with automatic format detection.

        Returns:
            Path to F16 GGUF model ready for quantisation.
        """
        model_dir = self.models_dir / model_source.model_name

        if model_source.is_gguf_repo:
            return self._handle_gguf_repo(model_source, model_dir)
        return self._handle_regular_repo(model_source, model_dir, llama_env)

    def _handle_gguf_repo(self, model_source: ModelSource, model_dir: Path) -> Path:
        """Handle GGUF repository download with pattern matching.

        Downloads GGUF files matching specified patterns, prioritising
        multi-part files and F16 variants.

        Returns:
            Path to downloaded or existing GGUF file.
        """
        logger.info(f"⬇️ Downloading GGUF file from repository: {model_source.source_model}")
        logger.info(f"🔍 Looking for file pattern: *{model_source.gguf_file_pattern}*")

        f16_model = model_dir / f"{model_source.model_name}-f16.gguf"

        if f16_model.exists():
            logger.info(f"✅ Found existing F16 file: {f16_model.name}")
            return f16_model

        # Check for existing GGUF files
        model_dir.mkdir(parents=True, exist_ok=True)
        existing_gguf = self.fs.find_gguf_files(model_dir)

        if existing_gguf:
            logger.info(f"✅ Found existing GGUF file: {existing_gguf[0].name}")
            return existing_gguf[0]

        # Download with patterns
        downloaded_file = self._download_gguf_with_patterns(
            model_source.source_model, model_source.gguf_file_pattern, model_dir
        )

        if downloaded_file:
            # Handle multi-part files
            if "00001-of-" in downloaded_file.name:
                return downloaded_file
            if "-00002-of-" in downloaded_file.name or "-00003-of-" in downloaded_file.name:
                base_name = downloaded_file.name.replace("-00002-of-", "-00001-of-").replace(
                    "-00003-of-", "-00001-of-"
                )
                first_part = downloaded_file.parent / base_name
                if first_part.exists():
                    logger.info(f"🔄 Using first part: {first_part.name}")
                    return first_part

            # Rename single file to standard name
            downloaded_file.rename(f16_model)
            return f16_model

        # Fallback to regular conversion
        logger.info("💡 Falling back to downloading full repository and converting...")
        return self._handle_regular_repo(
            ModelSource(**{**model_source.dict(), "is_gguf_repo": False}),
            model_dir,
            None,
        )

    def _download_gguf_with_patterns(
        self, source_model: str, pattern: str | None, model_dir: Path
    ) -> Path | None:
        """Download GGUF file using various pattern strategies.

        Tries multiple pattern variations to find and download appropriate
        GGUF files, handling timeouts and temporary directories.

        Returns:
            Path to downloaded file, or None if all patterns fail.
        """
        if pattern:
            patterns = [
                f"*{pattern}*",
                f"*{pattern.lower()}*",
                f"*{pattern.upper()}*",
                "*f16*",
                "*F16*",
                "*fp16*",
            ]
        else:
            patterns = ["*f16*", "*F16*", "*fp16*"]

        temp_dir = model_dir / "gguf_temp"

        for search_pattern in patterns:
            logger.info(f"🔍 Trying pattern: {search_pattern}")
            temp_dir.mkdir(exist_ok=True)

            try:
                subprocess.run(
                    [
                        "timeout",
                        "300",
                        "huggingface-cli",
                        "download",
                        source_model,
                        "--include",
                        search_pattern,
                        "--local-dir",
                        str(temp_dir),
                    ],
                    check=True,
                    capture_output=True,
                )

                # Find downloaded GGUF files
                gguf_files = self.fs.find_gguf_files(temp_dir, pattern)
                if gguf_files:
                    found_file = gguf_files[0]
                    logger.info(f"✅ Found GGUF file: {found_file.name}")

                    # Move to parent directory
                    final_path = model_dir / found_file.name
                    shutil.move(str(found_file), str(final_path))
                    shutil.rmtree(temp_dir)
                    return final_path

            except subprocess.CalledProcessError:
                logger.info(f"⚠️ Pattern {search_pattern} failed or timed out")
                continue
            finally:
                if temp_dir.exists():
                    shutil.rmtree(temp_dir, ignore_errors=True)

        return None

    def _handle_regular_repo(
        self,
        model_source: ModelSource,
        model_dir: Path,
        llama_env: LlamaCppEnvironment | None,
    ) -> Path:
        """Handle regular HuggingFace repository conversion.

        Downloads full model repository and converts to F16 GGUF format
        using llama.cpp conversion scripts.

        Returns:
            Path to converted F16 GGUF model.
        """
        logger.info(f"⬇️ Downloading source model: {model_source.source_model}")

        if not model_dir.exists():
            subprocess.run(
                [
                    "huggingface-cli",
                    "download",
                    model_source.source_model,
                    "--local-dir",
                    str(model_dir),
                ],
                check=True,
            )
        else:
            logger.info("✅ Model already downloaded")

        logger.info("🔄 Converting to GGUF F16 format...")
        f16_model = model_dir / f"{model_source.model_name}-f16.gguf"

        if not f16_model.exists():
            if not llama_env:
                llama_env = self.environment_manager.setup()

            # Ensure conversion script is available
            if llama_env.use_repo or not self.environment_manager.llama_cpp_dir.exists():
                logger.info("Getting conversion script from llama.cpp repository...")
                llama_env = self.environment_manager.setup_repository()

            subprocess.run(
                [
                    *llama_env.convert_script.split(),
                    str(model_dir),
                    "--outtype",
                    "f16",
                    "--outfile",
                    str(f16_model),
                ],
                check=True,
            )
        else:
            logger.info("✅ F16 model already exists")

        return f16_model


class HuggingFaceUploader:
    """Handles uploading models and documentation to HuggingFace.

    Provides methods for repository creation, file uploads, and README
    updates with proper error handling and retry logic.
    """

    @staticmethod
    def get_username() -> str:
        """Get authenticated HuggingFace username.

        Returns:
            HuggingFace username from CLI authentication.

        Raises:
            RuntimeError: If not authenticated.
        """
        try:
            result = subprocess.run(
                ["huggingface-cli", "whoami"],
                capture_output=True,
                text=True,
                check=True,
            )
            return result.stdout.strip()
        except (subprocess.CalledProcessError, FileNotFoundError) as err:
            msg = "Please log in to HuggingFace first: huggingface-cli login"
            raise RuntimeError(msg) from err

    def upload_readme(self, output_repo: str, readme_path: Path) -> None:
        """Upload or update README file to repository.

        Creates repository if needed, handles existing repository updates.
        """
        logger.info("Uploading README...")
        try:
            subprocess.run(
                [
                    "huggingface-cli",
                    "upload",
                    output_repo,
                    str(readme_path),
                    "README.md",
                    "--create",
                ],
                check=True,
                capture_output=True,
            )
            logger.info("README uploaded")
        except subprocess.CalledProcessError:
            # Repository exists, update without --create
            subprocess.run(
                [
                    "huggingface-cli",
                    "upload",
                    output_repo,
                    str(readme_path),
                    "README.md",
                ],
                check=True,
            )
            logger.info("README updated")

    def upload_model_file(self, output_repo: str, model_path: Path) -> None:
        """Upload model file to repository.

        Uploads GGUF model file to specified repository path.
        """
        logger.info(f"Uploading {model_path.name}...")
        subprocess.run(
            [
                "huggingface-cli",
                "upload",
                output_repo,
                str(model_path),
                model_path.name,
            ],
            check=True,
        )
        logger.info(f"{model_path.name} uploaded")
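# Illustrative command lines built by the services above. File names, repository
# name and the single tensor override are assumptions for the sketch:
#
#   ./llama-quantize --imatrix imatrix.dat --tensor-type "output.weight=Q8_0" \
#       model-f16.gguf model-Q4_K_L.gguf Q4_K_M
#
#   huggingface-cli upload username/Example-Model-GGUF model-Q4_K_M.gguf model-Q4_K_M.gguf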
16
helpers/utils/__init__.py
Normal file
16
helpers/utils/__init__.py
Normal file
|
@ -0,0 +1,16 @@
|
|||
"""Utility functions for llm-gguf-tools.

Provides low-level utilities for tensor mapping, configuration parsing,
and other common operations. Uses UK English spelling conventions throughout.
"""

from __future__ import annotations

from helpers.utils.config_parser import ConfigParser
from helpers.utils.tensor_mapping import TensorMapper, URLParser

__all__ = [
    "ConfigParser",
    "TensorMapper",
    "URLParser",
]
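# Import sketch: the re-exports above let callers write, for example,
#
#   from helpers.utils import ConfigParser, TensorMapper, URLParser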
171
helpers/utils/config_parser.py
Normal file
171
helpers/utils/config_parser.py
Normal file
|
@ -0,0 +1,171 @@
|
|||
"""Configuration parsing utilities.

Provides utilities for parsing model configurations, inferring parameters,
and handling architecture-specific settings. Uses UK English spelling
conventions throughout.
"""

from __future__ import annotations

from typing import TYPE_CHECKING, Any

from helpers.models.conversion import GGUFParameters, ModelConfig, VisionConfig
from helpers.services.filesystem import FilesystemService

if TYPE_CHECKING:
    from pathlib import Path


class ConfigParser:
    """Parses and transforms model configuration files.

    Handles loading of HuggingFace config.json files, parameter inference,
    and conversion to GGUF-compatible formats. Provides sensible defaults
    for missing values and architecture-specific handling.
    """

    def __init__(self) -> None:
        """Initialise ConfigParser."""
        self.fs = FilesystemService()

    def load_model_config(self, model_path: Path) -> ModelConfig:
        """Load model configuration from config.json file.

        Reads the standard HuggingFace config.json file and parses it into
        a structured ModelConfig instance with proper type validation. Handles
        vision model configurations and provides sensible defaults for missing values.

        Returns:
            Parsed ModelConfig instance.
        """
        config_file = model_path / "config.json"
        raw_config = self.fs.load_json_config(config_file)

        # Parse vision config if present
        vision_config = None
        if "vision_config" in raw_config:
            vision_config = VisionConfig(**raw_config["vision_config"])

        # Create ModelConfig with parsed values
        return ModelConfig(
            architectures=raw_config.get("architectures", ["Unknown"]),
            model_type=raw_config.get("model_type", "unknown"),
            vocab_size=raw_config.get("vocab_size", 32000),
            max_position_embeddings=raw_config.get("max_position_embeddings", 2048),
            hidden_size=raw_config.get("hidden_size", 4096),
            num_hidden_layers=raw_config.get("num_hidden_layers", 32),
            intermediate_size=raw_config.get("intermediate_size", 11008),
            num_attention_heads=raw_config.get("num_attention_heads", 32),
            num_key_value_heads=raw_config.get("num_key_value_heads"),
            rope_theta=raw_config.get("rope_theta", 10000.0),
            rope_scaling=raw_config.get("rope_scaling"),
            rms_norm_eps=raw_config.get("rms_norm_eps", 1e-5),
            vision_config=vision_config,
        )

    def infer_gguf_parameters(self, config: ModelConfig) -> GGUFParameters:
        """Infer GGUF parameters from model configuration.

        Translates HuggingFace model configuration to GGUF parameter format,
        providing sensible defaults for missing values and handling various
        architecture conventions.

        Args:
            config: Parsed ModelConfig instance.

        Returns:
            GGUFParameters with inferred values.
        """
        # Calculate derived parameters
        num_heads = config.num_attention_heads
        embedding_length = config.hidden_size
        rope_dimension_count = embedding_length // num_heads

        # Handle KV heads (for GQA models)
        num_kv_heads = config.num_key_value_heads or num_heads

        # Create GGUFParameters using dict with aliases
        params_dict = {
            "vocab_size": config.vocab_size,
            "context_length": config.max_position_embeddings,
            "embedding_length": embedding_length,
            "block_count": config.num_hidden_layers,
            "feed_forward_length": config.intermediate_size,
            "attention.head_count": num_heads,
            "attention.head_count_kv": num_kv_heads,
            "attention.layer_norm_rms_epsilon": config.rms_norm_eps,
            "rope.freq_base": config.rope_theta,
            "rope.dimension_count": rope_dimension_count,
        }

        params = GGUFParameters.model_validate(params_dict)

        # Add RoPE scaling if present
        if config.rope_scaling:
            params.rope_scaling_type = config.rope_scaling.get("type", "linear")
            params.rope_scaling_factor = config.rope_scaling.get("factor", 1.0)

        return params

    @staticmethod
    def get_architecture_mapping(architecture: str) -> str:
        """Map architecture names to known GGUF architectures.

        Provides fallback mappings for architectures not directly supported
        by GGUF, mapping them to similar known architectures.

        Args:
            architecture: Original architecture name from config.

        Returns:
            GGUF-compatible architecture name.
        """
        # Architecture mappings to known GGUF types
        mappings = {
            "DotsOCRForCausalLM": "qwen2",  # Similar architecture
            "GptOssForCausalLM": "llama",  # Use llama as fallback
            "MistralForCausalLM": "llama",  # Mistral is llama-like
            "Qwen2ForCausalLM": "qwen2",
            "LlamaForCausalLM": "llama",
            "GemmaForCausalLM": "gemma",
            "Phi3ForCausalLM": "phi3",
            # Add more mappings as needed
        }

        return mappings.get(architecture, "llama")  # Default to llama

    @staticmethod
    def load_tokeniser_config(model_path: Path) -> dict[str, Any]:
        """Load tokeniser configuration from model directory.

        Reads tokenizer_config.json to extract special token IDs and
        other tokenisation parameters.

        Args:
            model_path: Path to model directory.

        Returns:
            Tokeniser configuration dictionary.
        """
        fs = FilesystemService()
        tokeniser_config_path = model_path / "tokenizer_config.json"

        if not tokeniser_config_path.exists():
            # Return defaults if no config found
            return {
                "bos_token_id": 1,
                "eos_token_id": 2,
                "unk_token_id": 0,
                "pad_token_id": 0,
            }

        config = fs.load_json_config(tokeniser_config_path)

        # Extract token IDs with defaults
        return {
            "bos_token_id": config.get("bos_token_id", 1),
            "eos_token_id": config.get("eos_token_id", 2),
            "unk_token_id": config.get("unk_token_id", 0),
            "pad_token_id": config.get("pad_token_id", 0),
            "model_type": config.get("model_type", "llama"),
        }
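# Worked example for infer_gguf_parameters, assuming a config with
# hidden_size=4096, num_attention_heads=32 and num_key_value_heads=8:
#
#   rope.dimension_count    = 4096 // 32 = 128
#   attention.head_count    = 32
#   attention.head_count_kv = 8   (grouped-query attention)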
196
helpers/utils/tensor_mapping.py
Normal file
196
helpers/utils/tensor_mapping.py
Normal file
|
@ -0,0 +1,196 @@
|
|||
"""Tensor mapping and URL parsing utilities.

Provides utilities for mapping tensor names between different formats,
parsing model URLs, and handling architecture-specific conversions.
Uses UK English spelling conventions throughout.
"""

from __future__ import annotations

import re
from typing import ClassVar

from helpers.models.quantisation import ModelSource, URLType


class TensorMapper:
    """Maps tensor names between HuggingFace and GGUF conventions.

    Provides flexible tensor name translation supporting direct mappings,
    layer-aware transformations, and architecture-specific overrides.
    Handles both simple renames and complex pattern-based conversions.
    """

    # Common direct mappings across architectures
    DIRECT_MAPPINGS: ClassVar[dict[str, str]] = {
        "model.embed_tokens.weight": "token_embd.weight",
        "model.norm.weight": "output_norm.weight",
        "lm_head.weight": "output.weight",
    }

    # Layer component patterns for transformer blocks
    LAYER_PATTERNS: ClassVar[dict[str, str]] = {
        "self_attn.q_proj.weight": "attn_q.weight",
        "self_attn.q_proj.bias": "attn_q.bias",
        "self_attn.k_proj.weight": "attn_k.weight",
        "self_attn.k_proj.bias": "attn_k.bias",
        "self_attn.v_proj.weight": "attn_v.weight",
        "self_attn.v_proj.bias": "attn_v.bias",
        "self_attn.o_proj": "attn_output.weight",
        "mlp.gate_proj": "ffn_gate.weight",
        "mlp.up_proj": "ffn_up.weight",
        "mlp.down_proj": "ffn_down.weight",
        "input_layernorm": "attn_norm.weight",
        "post_attention_layernorm": "ffn_norm.weight",
    }

    @classmethod
    def map_tensor_name(cls, original_name: str) -> str | None:
        """Map original tensor name to GGUF format.

        Translates HuggingFace tensor naming to GGUF format, handling embeddings,
        attention layers, feed-forward networks, and normalisation layers. Uses
        layer-aware mapping for transformer blocks whilst maintaining consistency
        across different model architectures.

        Returns:
            GGUF tensor name, or None if unmappable.
        """
        # Check direct mappings first
        if original_name in cls.DIRECT_MAPPINGS:
            return cls.DIRECT_MAPPINGS[original_name]

        # Handle layer-specific tensors
        if ".layers." in original_name:
            return cls._map_layer_tensor(original_name)

        # Return None for unmapped tensors
        return None

    @classmethod
    def _map_layer_tensor(cls, tensor_name: str) -> str | None:
        """Map layer-specific tensor names.

        Handles tensors within transformer layers, extracting layer indices
        and mapping component names to GGUF conventions.

        Args:
            tensor_name: Layer tensor name containing .layers.N. pattern.

        Returns:
            Mapped GGUF tensor name, or None if unmappable.
        """
        # Extract layer number
        parts = tensor_name.split(".")
        layer_idx = None
        for i, part in enumerate(parts):
            if part == "layers" and i + 1 < len(parts):
                layer_idx = parts[i + 1]
                break

        if layer_idx is None:
            return None

        # Check each pattern
        for pattern, replacement in cls.LAYER_PATTERNS.items():
            if pattern in tensor_name:
                return f"blk.{layer_idx}.{replacement}"

        return None


class URLParser:
    """Parses and validates model URLs from various sources.

    Handles HuggingFace URLs, Ollama-style GGUF references, and other
    model source formats. Extracts metadata including author, model name,
    and file patterns for appropriate download strategies.
    """

    @staticmethod
    def parse(url: str) -> ModelSource:
        """Parse URL and extract model source information.

        Analyses URL format to determine source type and extract relevant
        metadata for model download and processing.

        Args:
            url: Model URL in supported format.

        Returns:
            ModelSource with parsed information.

        Raises:
            ValueError: If URL format is not recognised.
        """
        if not url:
            msg = "URL cannot be empty"
            raise ValueError(msg)

        # Try Ollama-style GGUF URL first (hf.co/author/model:pattern)
        ollama_match = re.match(r"^hf\.co/([^:]+):(.+)$", url)
        if ollama_match:
            source_model = ollama_match.group(1)
            gguf_pattern = ollama_match.group(2)
            return URLParser._create_model_source(
                url,
                URLType.OLLAMA_GGUF,
                source_model,
                gguf_file_pattern=gguf_pattern,
                is_gguf_repo=True,
            )

        # Try regular HuggingFace URL
        hf_match = re.match(r"https://huggingface\.co/([^/]+/[^/?]+)", url)
        if hf_match:
            source_model = hf_match.group(1)
            return URLParser._create_model_source(
                url, URLType.HUGGINGFACE, source_model, is_gguf_repo=False
            )

        msg = (
            "Invalid URL format\n"
            "Supported formats:\n"
            "  - https://huggingface.co/username/model-name\n"
            "  - hf.co/username/model-name-GGUF:F16"
        )
        raise ValueError(msg)

    @staticmethod
    def _create_model_source(
        url: str,
        url_type: URLType,
        source_model: str,
        gguf_file_pattern: str | None = None,
        is_gguf_repo: bool = False,
    ) -> ModelSource:
        """Create ModelSource with parsed information.

        Constructs a ModelSource instance with extracted metadata,
        handling author/model name splitting and GGUF suffix removal.

        Args:
            url: Original URL.
            url_type: Type of URL (HuggingFace or Ollama GGUF).
            source_model: Repository identifier (author/model).
            gguf_file_pattern: Optional GGUF file pattern.
            is_gguf_repo: Whether this is a GGUF repository.

        Returns:
            Configured ModelSource instance.
        """
        author, model_name = source_model.split("/", 1)

        # Strip -GGUF suffix for GGUF repos
        if is_gguf_repo and model_name.endswith("-GGUF"):
            model_name = model_name[:-5]

        return ModelSource(
            url=url,
            url_type=url_type,
            source_model=source_model,
            original_author=author,
            model_name=model_name,
            gguf_file_pattern=gguf_file_pattern,
            is_gguf_repo=is_gguf_repo,
        )
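# Usage sketch for the two utilities above (inputs are illustrative):
#
#   TensorMapper.map_tensor_name("model.layers.0.self_attn.q_proj.weight")
#   # -> "blk.0.attn_q.weight"
#
#   source = URLParser.parse("hf.co/example/Some-Model-GGUF:F16")
#   # source.model_name == "Some-Model", source.gguf_file_pattern == "F16"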