"""Quantisation orchestration service.
|
|
|
|
High-level orchestration of the complete quantisation workflow from model
|
|
acquisition through processing to upload. Manages parallel processing,
|
|
status tracking, and cleanup operations for efficient resource utilisation.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import gc
|
|
import signal
import subprocess
import sys
import traceback
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass, field
from pathlib import Path
from typing import TYPE_CHECKING

import psutil

from helpers.config.quantisation_configs import (
    DEFAULT_QUANTISATION_TYPES,
    QUANTISATION_CONFIGS,
    SUPPORTED_QUANTISATION_TYPES,
)
from helpers.logger import logger
from helpers.models.quantisation import (
    ModelSource,
    QuantisationContext,
    QuantisationResult,
    QuantisationType,
)
from helpers.services.huggingface import ReadmeGenerator
from helpers.services.imatrix_generator import IMatrixGenerator
from helpers.services.llama_cpp import IMatrixHandler
from helpers.services.quantisation import HuggingFaceUploader, ModelManager, QuantisationEngine
from helpers.utils.rate_limiter import ReadmeRateLimiter
from helpers.utils.tensor_mapping import URLParser

if TYPE_CHECKING:
    from types import FrameType
    from typing import Any


@dataclass(slots=True)
class QuantisationOrchestrator:
    """Orchestrates the complete quantisation workflow.

    Uses dataclass with slots for efficient memory usage and dependency injection
    for modular service interaction following SOLID principles.
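
    Example (illustrative sketch rather than an API guarantee; assumes an
    authenticated HuggingFace environment and a URL the URLParser accepts):

        orchestrator = QuantisationOrchestrator(use_imatrix=True)
        results = orchestrator.quantise("https://huggingface.co/org/model")
        for quant_type, result in results.items():
            print(quant_type, result.status)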
    """

    work_dir: Path = field(default_factory=lambda: Path.cwd() / "quantisation_work")
    use_imatrix: bool = True
    no_upload: bool = False
    custom_profiles: list[str] | None = None

    # Service dependencies with factory defaults
    url_parser: URLParser = field(default_factory=URLParser)
    quantisation_engine: QuantisationEngine = field(default_factory=QuantisationEngine)
    imatrix_handler: IMatrixHandler = field(default_factory=IMatrixHandler)
    imatrix_generator: IMatrixGenerator = field(default_factory=IMatrixGenerator)
    readme_generator: ReadmeGenerator = field(default_factory=ReadmeGenerator)
    uploader: HuggingFaceUploader = field(default_factory=HuggingFaceUploader)

    # Computed properties
    models_dir: Path = field(init=False)
    model_manager: ModelManager = field(init=False)
    readme_limiter: ReadmeRateLimiter = field(init=False)

    def __post_init__(self) -> None:
        """Initialise computed properties after dataclass construction."""
        self.models_dir = self.work_dir / "models"
        self.model_manager = ModelManager(self.models_dir)
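        # README status updates are funnelled through a rate limiter so rapid
        # successive changes get batched into fewer Hub uploads; the 30-second
        # cooldown below is a tuning choice rather than a documented
        # HuggingFace limit.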
        self.readme_limiter = ReadmeRateLimiter(cooldown_seconds=30.0)

        # Set up signal handlers for graceful exit tracking
        self._setup_signal_handlers()

    def _setup_signal_handlers(self) -> None:
        """Set up signal handlers to catch unexpected exits."""

        def signal_handler(signum: int, frame: FrameType | None) -> None:
            logger.error(f"❌ Received signal {signum} ({signal.Signals(signum).name})")
            logger.error("Stack trace at signal:")
            if frame:
                for line in traceback.format_stack(frame):
                    logger.error(f"  {line.strip()}")
            logger.error("Exiting due to signal")
            sys.exit(1)

        # Handle common termination signals
        for sig in [signal.SIGINT, signal.SIGTERM]:
            signal.signal(sig, signal_handler)

    def _check_architecture_support(self, f16_model_path: Path) -> bool:
        """Check whether the model architecture is supported by llama.cpp.

        Args:
            f16_model_path: Path to the F16 GGUF model

        Returns:
            True if the architecture is NOT supported (K-quants should be skipped)
        """
        try:
            # Try a simple quantisation with llama.cpp to check support
            result = subprocess.run(
                [
                    ".cache/llm-gguf-tools/binaries/llama-quantize",
                    str(f16_model_path),
                    "/dev/null",
                    "Q4_K_M",
                ],
                check=False,
                capture_output=True,
                text=True,
                timeout=5,
            )

            # Check if it failed due to unknown architecture
            return bool(result.stderr and "unknown model architecture" in result.stderr.lower())
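        # Note: a supported architecture usually keeps llama-quantize running
        # past the 5-second timeout above, so subprocess.run raises
        # TimeoutExpired and control lands in the handler below, which treats
        # the architecture as potentially supported.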
        except Exception:
            # If we can't determine, assume it might work
            return False

    def get_quantisation_types(self) -> list[QuantisationType]:
        """Get the quantisation types to use for this run.

        Returns:
            List of QuantisationType enums to process.
        """
        if self.custom_profiles:
            # Parse custom profiles from strings to QuantisationType
            result = []
            for profile_str in self.custom_profiles:
                try:
                    profile = QuantisationType(profile_str.upper())
                    if profile in SUPPORTED_QUANTISATION_TYPES:
                        result.append(profile)
                    else:
                        logger.warning(f"Profile {profile_str} is not supported, skipping")
                except ValueError:
                    logger.warning(f"Invalid profile {profile_str}, skipping")
            return result or DEFAULT_QUANTISATION_TYPES
        return DEFAULT_QUANTISATION_TYPES

    def quantise(self, url: str) -> dict[QuantisationType, QuantisationResult]:
        """Main quantisation workflow orchestrating model processing from URL to upload.

        Returns:
            dict[QuantisationType, QuantisationResult]: Quantisation results for each type.

        Raises:
            KeyboardInterrupt: If the user interrupts the quantisation process.
        """
        logger.info("Starting Bartowski quantisation process...")
        logger.debug(f"DEBUG: Input URL: {url}")
        logger.debug(f"DEBUG: Working directory: {self.work_dir}")
        logger.debug(f"DEBUG: Use imatrix: {self.use_imatrix}")
        logger.debug(f"DEBUG: No upload: {self.no_upload}")
        logger.debug(f"DEBUG: Custom profiles: {self.custom_profiles}")

        try:
            # Setup and preparation
            logger.debug("DEBUG: Starting environment setup...")
            model_source, f16_model_path, imatrix_path, output_repo = self._setup_environment(url)
            logger.debug(f"DEBUG: Environment setup complete. F16 model: {f16_model_path}")

            # Create initial repository
            logger.debug("DEBUG: Creating initial repository...")
            self._create_initial_repository(model_source, output_repo)
            logger.debug("DEBUG: Initial repository created")

            # Execute all quantisations
            logger.debug("DEBUG: Starting quantisation execution...")
            results = self._execute_quantisations(
                model_source, f16_model_path, imatrix_path, output_repo
            )
            logger.debug(f"DEBUG: Quantisation execution complete. Results: {len(results)} items")

            # Cleanup
            logger.debug("DEBUG: Starting cleanup...")
            self._cleanup_files(f16_model_path, model_source)
            logger.debug("DEBUG: Cleanup complete")

            self._print_completion_summary(model_source, results, output_repo)
        except KeyboardInterrupt:
            logger.error("❌ Process interrupted by user (Ctrl+C)")
            raise
        except Exception as e:
            logger.error(f"❌ Critical error in quantisation workflow: {e}")
            logger.error("Full traceback:")
            for line in traceback.format_exc().splitlines():
                logger.error(f"  {line}")
            raise
        finally:
            # Always flush pending README updates before exiting
            self.readme_limiter.flush()

        return results

    def _setup_environment(self, url: str) -> tuple[ModelSource, Path, Path | None, str]:
        """Set up the environment and prepare the model for quantisation.

        Returns:
            Tuple of (model_source, f16_model_path, imatrix_path, output_repo).
        """
        model_source = self.url_parser.parse(url)
        self._print_model_info(model_source)

        self.models_dir.mkdir(parents=True, exist_ok=True)
        f16_model_path = self.model_manager.prepare_model(model_source)

        output_repo = (
            f"{self.uploader.get_username()}/"
            f"{model_source.original_author}-{model_source.model_name}-GGUF"
        )

        imatrix_path = None
        if self.use_imatrix:
            logger.info("Checking for importance matrix (imatrix)...")
            model_dir = self.models_dir / model_source.model_name
            imatrix_path = self.imatrix_handler.find_imatrix(model_dir)

            # If no imatrix found, offer to generate or provide one
            if not imatrix_path:
                # First offer to generate
                imatrix_path = self.imatrix_generator.prompt_for_generation(
                    model_source, model_dir, f16_model_path
                )

                # If generation was skipped, offer to provide existing one
                if not imatrix_path:
                    imatrix_path = self.imatrix_handler.prompt_for_user_imatrix(model_dir)

        return model_source, f16_model_path, imatrix_path, output_repo

    def _create_initial_repository(self, model_source: ModelSource, output_repo: str) -> None:
        """Create initial repository with planned quantisations."""
        logger.info("Creating initial README with planned quantisations...")
        quantisation_types = self.get_quantisation_types()
        planned_results = {
            qt: QuantisationResult(quantisation_type=qt, success=False, status="planned")
            for qt in quantisation_types
        }
        readme_path = self.readme_generator.generate(
            model_source, planned_results, self.models_dir, output_repo
        )

        if not self.no_upload:
            logger.info("Creating repository with planned quantisations...")
            self.uploader.upload_readme(output_repo, readme_path)
        else:
            logger.info("Skipping repository creation (--no-upload specified)")

    def _execute_quantisations(
        self,
        model_source: ModelSource,
        f16_model_path: Path,
        imatrix_path: Path | None,
        output_repo: str,
    ) -> dict[QuantisationType, QuantisationResult]:
        """Execute all quantisation types with parallel uploads.

        Returns:
            dict[QuantisationType, QuantisationResult]: Quantisation results for each type.
        """
        results: dict[QuantisationType, QuantisationResult] = {}

        quantisation_types = self.get_quantisation_types()
        types_list = [qt.value for qt in quantisation_types]
        logger.info(f"Processing {len(quantisation_types)} quantisation types: {types_list}")

        # Check architecture support upfront
        architecture_unsupported = self._check_architecture_support(f16_model_path)

        if architecture_unsupported:
            logger.warning("⚠️ Architecture not supported by llama.cpp - K-quants will be skipped")
            logger.info("💡 Basic types (Q4_0, Q5_0, Q6_0, Q8_0) will still be generated")

            # Pre-mark all K-quants as skipped
            basic_types = ["Q4_0", "Q5_0", "Q6_0", "Q8_0"]
            for quant_type in quantisation_types:
                if quant_type.value not in basic_types:
                    results[quant_type] = QuantisationResult(
                        quantisation_type=quant_type,
                        success=False,
                        status="failed",
                        error_message="K-quant requires llama.cpp architecture support",
                    )

        # Track F16 in results for status display (if we converted from SafeTensors)
        if not model_source.is_gguf_repo:
            # Get F16 file size
            f16_size = "-"
            if f16_model_path.exists():
                size_bytes = f16_model_path.stat().st_size
                size_gb = size_bytes / (1024**3)
                f16_size = f"{size_gb:.1f}GB"

            # Create a lightweight stand-in object for F16 status tracking (not a
            # full QuantisationResult), since F16 is the conversion output rather
            # than a true quantisation
            f16_result = type(
                "F16Result",
                (),
                {
                    "quantisation_type": "F16",
                    "success": True,
                    "status": "planned",
                    "file_path": f16_model_path,
                    "file_size": f16_size,
                },
            )()
            results[QuantisationType.F16] = f16_result

        # Process with parallel uploads - quantise sequentially but upload in background
        upload_futures: list[Any] = []
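        # Quantisations run strictly one at a time (each pass reads the full F16
        # model), while finished files are handed to this small background pool
        # for upload; the two-worker cap is a conservative choice to avoid
        # saturating disk and bandwidth, not a hard requirement.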
        with ThreadPoolExecutor(max_workers=2, thread_name_prefix="upload") as upload_executor:
            # Start F16 upload first if we have one
            if (
                not model_source.is_gguf_repo
                and not self.no_upload
                and QuantisationType.F16 in results
            ):
                f16_result = results[QuantisationType.F16]
                if f16_result.file_path and f16_result.file_path.exists():
                    logger.info("Starting parallel upload of F16 GGUF...")
                    f16_result.status = "uploading"
                    self._update_readme_status(model_source, results, output_repo)

                    upload_future = upload_executor.submit(
                        self._upload_f16_and_cleanup,
                        output_repo,
                        f16_result.file_path,
                        model_source,
                        results,
                    )
                    upload_futures.append(upload_future)

            for i, quant_type in enumerate(quantisation_types, 1):
                # Skip if already marked as failed (e.g., K-quants for unsupported arch)
                if quant_type in results and results[quant_type].status == "failed":
                    logger.info(
                        f"Skipping {quant_type.value} - {results[quant_type].error_message}"
                    )
                    continue

                logger.info(
                    f"Processing quantisation {i}/{len(quantisation_types)}: {quant_type.value}"
                )
                logger.debug(f"DEBUG: Starting quantisation {i}/{len(quantisation_types)}")
                logger.debug(f"DEBUG: Current type: {quant_type.value}")
                logger.debug(f"DEBUG: Results so far: {len(results)} completed")

                try:
                    result = self._process_single_quantisation(
                        quant_type,
                        model_source,
                        f16_model_path,
                        imatrix_path,
                        output_repo,
                        results,
                        upload_executor,
                        upload_futures,
                    )
                    results[quant_type] = result
                    logger.debug(f"DEBUG: Quantisation {quant_type.value} completed")

                    # Check if this failed due to unsupported architecture
                    if (
                        not result.success
                        and hasattr(self.quantisation_engine.executor, "last_error")
                        and self.quantisation_engine.executor.last_error
                        == "unsupported_architecture"
                    ):
                        logger.warning(
                            "⚠️ Architecture not supported by llama.cpp - K-quants will be skipped"
                        )
                        logger.info(
                            "💡 Basic types (Q4_0, Q5_0, Q6_0, Q8_0) will still be generated"
                        )
                        architecture_unsupported = True
                        # Update the current result to also show as skipped
                        result.error_message = "Architecture not supported by llama.cpp"
                        # Update README immediately to show remaining K-quants as skipped
                        # But don't mark basic types as failed - they can still use GGML
                        basic_types = ["Q4_0", "Q5_0", "Q6_0", "Q8_0"]
                        for remaining_quant_type in quantisation_types[i:]:
                            if remaining_quant_type not in results:
                                # Only mark K-quants as failed due to architecture
                                if remaining_quant_type.value not in basic_types:
                                    results[remaining_quant_type] = QuantisationResult(
                                        quantisation_type=remaining_quant_type,
                                        success=False,
                                        status="failed",
                                        error_message="K-quant requires llama.cpp architecture support",
                                    )
                        self._update_readme_status(model_source, results, output_repo)

                    # Force cleanup between quantisations
                    gc.collect()
                    logger.debug("DEBUG: Garbage collection completed")

                except Exception as e:
                    logger.error(f"❌ Critical error processing {quant_type.value}: {e}")
                    logger.error("Exception traceback:")
                    for line in traceback.format_exc().splitlines():
                        logger.error(f"  {line}")
                    results[quant_type] = QuantisationResult(
                        quantisation_type=quant_type,
                        success=False,
                        status="failed",
                        error_message=str(e),
                    )

                    # Force cleanup after error
                    gc.collect()

        # Wait for all uploads to complete before returning
        self._wait_for_uploads(upload_futures)

        # Final README update to ensure all statuses are accurate
        if not self.no_upload and upload_futures:
            logger.info("Updating README with final status...")
            final_readme = self.readme_generator.generate(
                model_source, results, self.models_dir, output_repo
            )
            self.uploader.upload_readme(output_repo, final_readme)

        return results

    def _process_single_quantisation(
        self,
        quant_type: QuantisationType,
        model_source: ModelSource,
        f16_model_path: Path,
        imatrix_path: Path | None,
        output_repo: str,
        results: dict[QuantisationType, QuantisationResult],
        upload_executor: ThreadPoolExecutor,
        upload_futures: list,
    ) -> QuantisationResult:
        """Process a single quantisation type.

        Returns:
            QuantisationResult: Result of the quantisation attempt.
        """
        try:
            logger.info(f"Starting {quant_type.value} quantisation...")
            logger.debug(f"DEBUG: Getting config for {quant_type.value}")
            config = QUANTISATION_CONFIGS[quant_type]
            logger.debug(f"DEBUG: Config loaded: {config.name}")

            # Update status to processing
            logger.debug("DEBUG: Creating initial quantisation result")
            result = QuantisationResult(quantisation_type=quant_type, success=False)
            result.status = "processing"
            results[quant_type] = result

            logger.debug("DEBUG: Updating README status")
            self._update_readme_status(model_source, results, output_repo)

            # Perform quantisation
            logger.debug("DEBUG: Creating quantisation context")
            context = QuantisationContext(
                f16_model_path=f16_model_path,
                model_source=model_source,
                config=config,
                models_dir=self.models_dir,
                imatrix_path=imatrix_path,
            )
            logger.debug(f"DEBUG: Context created. F16 path: {f16_model_path}")
            logger.debug(f"DEBUG: imatrix path: {imatrix_path}")
            logger.debug("DEBUG: Calling quantisation engine...")
            result = self.quantisation_engine.quantise(context)
            logger.debug(f"DEBUG: Quantisation engine returned: success={result.success}")

            self._handle_quantisation_result(
                result,
                quant_type,
                model_source,
                results,
                output_repo,
                upload_executor,
                upload_futures,
            )
        except Exception as e:
            return self._handle_quantisation_error(
                e, quant_type, model_source, results, output_repo
            )
        else:
            return result

    def _process_single_quantisation_sequential(
        self,
        quant_type: QuantisationType,
        model_source: ModelSource,
        f16_model_path: Path,
        imatrix_path: Path | None,
        output_repo: str,
        results: dict[QuantisationType, QuantisationResult],
    ) -> QuantisationResult:
        """Process a single quantisation type sequentially with immediate upload.

        Returns:
            QuantisationResult: Result of the quantisation attempt.
        """
        # Force cleanup before starting new quantisation
        gc.collect()

        # Log system state before quantisation
        process = psutil.Process()
        logger.debug(f"DEBUG: === System state before {quant_type.value} ===")
        logger.debug(f"DEBUG: Process alive: {process.is_running()}")
        logger.debug(f"DEBUG: PID: {process.pid}")
        logger.debug(f"DEBUG: Memory: {process.memory_info().rss / (1024**3):.2f} GB")
        logger.debug(f"DEBUG: CPU percent: {process.cpu_percent()}%")
        logger.debug(f"DEBUG: Threads: {process.num_threads()}")
        logger.debug(f"DEBUG: Open files: {len(process.open_files())}")

        try:
            logger.info(f"Starting {quant_type.value} quantisation...")
            logger.debug(f"DEBUG: Getting config for {quant_type.value}")
            config = QUANTISATION_CONFIGS[quant_type]
            logger.debug(f"DEBUG: Config loaded: {config.name}")

            # Update status to processing
            logger.debug("DEBUG: Creating initial quantisation result")
            result = QuantisationResult(quantisation_type=quant_type, success=False)
            result.status = "processing"
            results[quant_type] = result

            logger.debug("DEBUG: Updating README status")
            self._update_readme_status(model_source, results, output_repo)

            # Perform quantisation
            logger.debug("DEBUG: Creating quantisation context")
            context = QuantisationContext(
                f16_model_path=f16_model_path,
                model_source=model_source,
                config=config,
                models_dir=self.models_dir,
                imatrix_path=imatrix_path,
            )
            logger.debug(f"DEBUG: Context created. F16 path: {f16_model_path}")
            logger.debug(f"DEBUG: imatrix path: {imatrix_path}")
            logger.debug("DEBUG: Calling quantisation engine...")
            result = self.quantisation_engine.quantise(context)
            logger.debug(f"DEBUG: Quantisation engine returned: success={result.success}")

            if result.success and result.file_path:
                # Upload immediately (if not in no-upload mode)
                if not self.no_upload:
                    logger.info(f"Uploading {quant_type.value}...")
                    try:
                        self.uploader.upload_model_file(output_repo, result.file_path)
                        logger.info(f"Upload of {quant_type.value} completed successfully")

                        # Clean up file after successful upload
                        logger.info(f"Removing {result.file_path.name} to save disk space...")
                        result.file_path.unlink()

                        result.status = "completed"
                        self._update_readme_status(model_source, results, output_repo)
                    except Exception as upload_error:
                        logger.error(f"Failed to upload {quant_type.value}: {upload_error}")
                        result.status = "failed"
                        result.error_message = str(upload_error)
                        self._update_readme_status(model_source, results, output_repo)
                        # Keep file if upload failed
                else:
                    # No upload mode - just mark as completed
                    result.status = "completed"
                    logger.info(f"Skipping upload of {quant_type.value} (--no-upload specified)")
            else:
                result.status = "failed"
                self._update_readme_status(model_source, results, output_repo)
        except Exception as e:
            logger.error(f"Error processing {quant_type.value}: {e}")
            result = QuantisationResult(quantisation_type=quant_type, success=False)
            result.status = "failed"
            result.error_message = str(e)

            try:
                self._update_readme_status(model_source, results, output_repo)
            except Exception as readme_error:
                logger.error(f"Failed to update README after error: {readme_error}")
            # Force cleanup after error
            gc.collect()
            return result
        else:
            # Force cleanup after quantisation
            gc.collect()
            return result

    def _handle_quantisation_result(
        self,
        result: QuantisationResult,
        quant_type: QuantisationType,
        model_source: ModelSource,
        results: dict[QuantisationType, QuantisationResult],
        output_repo: str,
        upload_executor: ThreadPoolExecutor,
        upload_futures: list,
    ) -> None:
        """Handle successful or failed quantisation result."""
        if result.success and result.file_path:
            quant_str = getattr(result.quantisation_type, "value", result.quantisation_type)
            logger.info(f"Starting parallel upload of {quant_str}...")
            upload_future = upload_executor.submit(
                self._upload_and_cleanup,
                output_repo,
                result.file_path,
                quant_type,
                model_source,
                results,
            )
            upload_futures.append(upload_future)
            result.file_path = None  # Mark as being uploaded
            result.status = "uploading"
        else:
            result.status = "failed"

        self._update_readme_status(model_source, results, output_repo)

    def _handle_quantisation_error(
        self,
        error: Exception,
        quant_type: QuantisationType,
        model_source: ModelSource,
        results: dict[QuantisationType, QuantisationResult],
        output_repo: str,
    ) -> QuantisationResult:
        """Handle quantisation processing error.

        Returns:
            QuantisationResult: Failed quantisation result with error information.
        """
        logger.error(f"Error processing {quant_type.value}: {error}")
        result = QuantisationResult(quantisation_type=quant_type, success=False)
        result.status = "failed"
        result.error_message = str(error)

        try:
            self._update_readme_status(model_source, results, output_repo)
        except Exception as readme_error:
            logger.error(f"Failed to update README after error: {readme_error}")

        return result

    def _update_readme_status(
        self,
        model_source: ModelSource,
        results: dict[QuantisationType, QuantisationResult],
        output_repo: str,
    ) -> None:
        """Update README with current quantisation status using rate limiting."""
        if not self.no_upload:
            # Use rate limiter to batch updates
            self.readme_limiter.request_update(
                self._do_readme_update,
                model_source,
                results,
                output_repo,
            )

    def _do_readme_update(
        self,
        model_source: ModelSource,
        results: dict[QuantisationType, QuantisationResult],
        output_repo: str,
    ) -> None:
        """Actually perform the README update (called by the rate limiter)."""
        updated_readme_path = self.readme_generator.generate(
            model_source, results, self.models_dir, output_repo
        )
        self.uploader.upload_readme(output_repo, updated_readme_path)

    def _wait_for_uploads(self, upload_futures: list) -> None:
        """Wait for all parallel uploads to complete."""
        if not upload_futures:
            return

        logger.info(f"Waiting for {len(upload_futures)} uploads to complete...")
        completed = 0
        failed = 0

        for future in upload_futures:
            try:
                future.result(timeout=300)  # 5 minute timeout per upload
                completed += 1
                logger.info(f"Upload progress: {completed}/{len(upload_futures)} completed")
            except Exception as e:
                failed += 1
                logger.warning(f"Upload error ({completed + failed}/{len(upload_futures)}): {e}")

        if failed > 0:
            logger.warning(f"Upload summary: {completed} succeeded, {failed} failed")
        else:
            logger.info(f"All {completed} uploads completed successfully")

    def _cleanup_files(self, f16_model_path: Path, model_source: ModelSource) -> None:
        """Clean up temporary files after processing."""
        if f16_model_path.exists():
            logger.info(f"Removing F16 model {f16_model_path.name} to save disk space...")
            f16_model_path.unlink()

        if not model_source.is_gguf_repo:
            self._cleanup_original_model(model_source)

    def _cleanup_original_model(self, model_source: ModelSource) -> None:
        """Clean up original safetensors/PyTorch files after successful conversion."""
        model_dir = self.models_dir / model_source.model_name

        pytorch_files = list(model_dir.glob("pytorch_model*.bin"))
        if pytorch_files:
            logger.info(f"Removing {len(pytorch_files)} PyTorch model files to save disk space...")
            for file in pytorch_files:
                file.unlink()

        logger.info("Keeping config files, tokeniser, and metadata for reference")

    def _upload_and_cleanup(
        self,
        output_repo: str,
        file_path: Path,
        quant_type: QuantisationType,
        model_source: ModelSource,
        results: dict[QuantisationType, QuantisationResult],
    ) -> None:
        """Upload file and clean up (runs in background thread)."""
        try:
            logger.info(f"[PARALLEL] Starting upload of {quant_type.value} ({file_path.name})")
            self.uploader.upload_model_file(output_repo, file_path)
            logger.info(f"[PARALLEL] Upload of {quant_type.value} completed successfully")

            logger.info(f"[PARALLEL] Removing {file_path.name} to save disk space...")
            file_path.unlink()

            results[quant_type].status = "completed"
            updated_readme_path = self.readme_generator.generate(
                model_source, results, self.models_dir, output_repo
            )
            self.uploader.upload_readme(output_repo, updated_readme_path)

            logger.info(f"[PARALLEL] {quant_type.value} upload and cleanup complete")
        except Exception as e:
            logger.error(f"[PARALLEL] Failed to upload {quant_type.value}: {e}")
            results[quant_type].status = "failed"
            results[quant_type].error_message = str(e)

            try:
                updated_readme_path = self.readme_generator.generate(
                    model_source, results, self.models_dir, output_repo
                )
                self.uploader.upload_readme(output_repo, updated_readme_path)
            except Exception as readme_error:
                logger.error(
                    f"[PARALLEL] Failed to update README after upload error: {readme_error}"
                )
            # Don't re-raise - let other uploads continue

    def _upload_f16_and_cleanup(
        self,
        output_repo: str,
        file_path: Path,
        model_source: ModelSource,
        results: dict[QuantisationType, QuantisationResult],
    ) -> None:
        """Upload F16 file and clean up (runs in background thread)."""
        try:
            logger.info(f"[PARALLEL] Starting upload of F16 GGUF ({file_path.name})")
            self.uploader.upload_model_file(output_repo, file_path)
            logger.info("[PARALLEL] Upload of F16 GGUF completed successfully")

            # Don't delete F16 yet - we still need it for quantisations
            # It will be deleted in _cleanup_files after all quantisations complete

            results[QuantisationType.F16].status = "completed"
            updated_readme_path = self.readme_generator.generate(
                model_source, results, self.models_dir, output_repo
            )
            self.uploader.upload_readme(output_repo, updated_readme_path)

            logger.info("[PARALLEL] F16 upload complete")
        except Exception as e:
            logger.error(f"[PARALLEL] Failed to upload F16: {e}")
            results[QuantisationType.F16].status = "failed"
            results[QuantisationType.F16].error_message = str(e)

            try:
                updated_readme_path = self.readme_generator.generate(
                    model_source, results, self.models_dir, output_repo
                )
                self.uploader.upload_readme(output_repo, updated_readme_path)
            except Exception as readme_error:
                logger.error(
                    f"[PARALLEL] Failed to update README after F16 upload error: {readme_error}"
                )
            # Don't re-raise - let other uploads continue

    def _print_model_info(self, model_source: ModelSource) -> None:
        """Print model information."""
        logger.info(f"Source URL: {model_source.url}")
        logger.info(f"Source model: {model_source.source_model}")
        logger.info(f"Original author: {model_source.original_author}")
        logger.info(f"Model name: {model_source.model_name}")
        logger.info(f"Your HF username: {self.uploader.get_username()}")
        logger.info(f"Working directory: {self.work_dir}")

    def _print_completion_summary(
        self,
        model_source: ModelSource,
        results: dict[QuantisationType, QuantisationResult],
        output_repo: str,
    ) -> None:
        """Print completion summary."""
        successful_results = [r for r in results.values() if r.success]

        if successful_results:
            logger.info("Complete! Your quantised models are available at:")
            logger.info(f"  https://huggingface.co/{output_repo}")
            logger.info("Model info:")
            logger.info(f"  - Source URL: {model_source.url}")
            logger.info(f"  - Original: {model_source.source_model}")
            logger.info(
                "  - Method: "
                f"{'Direct GGUF download' if model_source.is_gguf_repo else 'HF model conversion'}"
            )
            logger.info(f"  - Quantised: {output_repo}")

            for result in successful_results:
                if result.file_size:
                    # F16 is tracked with a stand-in object whose quantisation_type
                    # is already a plain string, so fall back to the raw value
                    quant_str = getattr(
                        result.quantisation_type, "value", result.quantisation_type
                    )
                    filename = (
                        f"{model_source.original_author}-{model_source.model_name}-"
                        f"{quant_str}.gguf"
                    )
                    logger.info(f"  - {quant_str}: {filename} ({result.file_size})")
        else:
            logger.error(
                "All quantisations failed - repository created with documentation "
                "but no model files"
            )
            logger.error(f"  Repository: https://huggingface.co/{output_repo}")