"""Quantisation engine for model processing.
|
|
|
|
Handles the actual quantisation process with configurable methods,
|
|
supporting multiple quantisation backends and fallback strategies.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import traceback
|
|
from typing import TYPE_CHECKING
|
|
|
|
from helpers.filesystem import FilesystemService
|
|
from helpers.ggml import GGMLQuantiser
|
|
from helpers.llama_cpp import QuantisationExecutor
|
|
from helpers.logger import logger
|
|
from helpers.models.quantisation import QuantisationResult, QuantisationType
|
|
|
|
if TYPE_CHECKING:
|
|
from pathlib import Path
|
|
|
|
from helpers.models.quantisation import (
|
|
QuantisationContext,
|
|
)
|
|
|
|
|
|
class QuantisationEngine:
    """Handles the actual quantisation process with configurable methods.

    Provides flexible quantisation execution supporting multiple tensor
    precision configurations, importance matrices, and fallback strategies.
    Uses direct llama.cpp binary execution with proper tensor overrides.
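
    Example:
        A minimal usage sketch; assumes a prepared ``QuantisationContext``
        built by the calling workflow (``context`` is illustrative)::

            engine = QuantisationEngine()
            result = engine.quantise(context)
            if result.success:
                print(result.file_path, result.file_size)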
"""
|
|
|
|
def __init__(self) -> None:
|
|
"""Initialise quantisation engine."""
|
|
self.fs = FilesystemService()
|
|
self.executor = QuantisationExecutor()
|
|
self.ggml_quantiser = GGMLQuantiser()
|
|
|
|
def quantise(self, context: QuantisationContext) -> QuantisationResult:
        """Perform quantisation using the specified configuration.

        Executes quantisation via a direct llama.cpp binary invocation with
        the tensor-override flags required for L and XL variants, falling
        back to a GGML implementation for basic types when the model
        architecture is unsupported. The quantisation context supplies all
        required parameters and settings.

        Returns:
            QuantisationResult with success status and file information.
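
        Example:
            Illustrative only; assumes ``engine`` is a QuantisationEngine and
            ``context`` comes from the calling workflow::

                result = engine.quantise(context)
                if not result.success:
                    logger.error(result.error_message)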
        """
        logger.info(
            f"⚙️ Creating {context.config.name} quantisation ({context.config.description})..."
        )

        output_path = context.get_output_path()

        # Check the input file exists before attempting quantisation
        if not context.f16_model_path.exists():
            error_msg = f"Input model file does not exist: {context.f16_model_path}"
            logger.error(f"❌ {error_msg}")
            return QuantisationResult(
                quantisation_type=QuantisationType(context.config.name),
                success=False,
                error_message=error_msg,
            )

        logger.info(f"🎯 Attempting {context.config.name} quantisation...")
        logger.info(f"📝 Source: {context.f16_model_path}")
        logger.info(f"📝 Target: {output_path}")

        # Determine whether this is a basic type with a GGML fallback
        basic_types = ["Q4_0", "Q5_0", "Q6_0", "Q8_0"]
        is_basic_type = context.config.name in basic_types

        try:
            # Try llama.cpp first for all types
            logger.info("🔧 Using llama.cpp binary for quantisation...")

            success = self.executor.execute_quantisation(
                context.f16_model_path, output_path, context.config, context.imatrix_path
            )

            if success:
                return self._create_success_result(context.config.name, output_path, "llama.cpp")

            # Check if this was an architecture error and we can use GGML fallback
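            # NOTE: this relies on QuantisationExecutor exposing ``last_error``
            # after a failed run; the hasattr guard covers executors that
            # never set it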
            if (
                hasattr(self.executor, "last_error")
                and self.executor.last_error == "unsupported_architecture"
                and is_basic_type
            ):
                logger.info("🔄 Architecture unsupported - using GGML implementation...")

                success = self.ggml_quantiser.try_alternative_quantisation(
                    context.f16_model_path, output_path, context.config.name
                )

                if success:
                    return self._create_success_result(
                        context.config.name, output_path, "GGML numpy"
                    )
logger.error(f"❌ {context.config.name} quantisation failed")
|
|
return QuantisationResult(
|
|
quantisation_type=QuantisationType(context.config.name),
|
|
success=False,
|
|
error_message="Quantisation failed via Python API",
|
|
)

        except Exception as e:
            logger.error(f"❌ Exception during {context.config.name} quantisation: {e}")
            logger.error("Exception traceback:")
            for line in traceback.format_exc().splitlines():
                logger.error(f" {line}")

            return QuantisationResult(
                quantisation_type=QuantisationType(context.config.name),
                success=False,
                error_message=f"Exception during quantisation: {e!s}",
            )

    def _create_success_result(
        self, quant_type: str, output_path: Path, method_used: str
    ) -> QuantisationResult:
        """Create successful quantisation result with file metadata.

        Constructs a successful quantisation result, recording the output
        file path, its size on disk, and the quantisation method used.

        Returns:
            QuantisationResult with file path and size information.
        """
        file_size = self.fs.get_file_size(output_path)
        return QuantisationResult(
            quantisation_type=QuantisationType(quant_type),
            success=True,
            file_path=output_path,
            file_size=file_size,
            method_used=method_used,
        )