llm-gguf-tools/helpers/services/orchestrator.py
2025-08-09 12:58:58 +01:00

846 lines
36 KiB
Python

"""Quantisation orchestration service.
High-level orchestration of the complete quantisation workflow from model
acquisition through processing to upload. Manages parallel processing,
status tracking, and cleanup operations for efficient resource utilisation.
"""
from __future__ import annotations
import gc
import signal
import subprocess
import sys
import traceback
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass, field
from pathlib import Path
from typing import TYPE_CHECKING
import psutil
from helpers.config.quantisation_configs import (
DEFAULT_QUANTISATION_TYPES,
QUANTISATION_CONFIGS,
SUPPORTED_QUANTISATION_TYPES,
)
from helpers.logger import logger
from helpers.models.quantisation import (
ModelSource,
QuantisationContext,
QuantisationResult,
QuantisationType,
)
from helpers.services.huggingface import ReadmeGenerator
from helpers.services.imatrix_generator import IMatrixGenerator
from helpers.services.llama_cpp import IMatrixHandler
from helpers.services.quantisation import HuggingFaceUploader, ModelManager, QuantisationEngine
from helpers.utils.rate_limiter import ReadmeRateLimiter
from helpers.utils.tensor_mapping import URLParser
if TYPE_CHECKING:
from types import FrameType
from typing import Any
@dataclass(slots=True)
class QuantisationOrchestrator:
    """Orchestrates the complete quantisation workflow.

    Uses dataclass with slots for efficient memory usage and dependency injection
    for modular service interaction following SOLID principles.

    Configuration fields are supplied by the caller, service dependencies
    default to freshly constructed instances, and the remaining fields are
    derived in ``__post_init__``.
    """

    # Root working directory; defaults to "quantisation_work" under the current directory.
    work_dir: Path = field(default_factory=lambda: Path.cwd() / "quantisation_work")
    # When True, an importance matrix is looked up (or offered for generation) during setup.
    use_imatrix: bool = True
    # When True, model/README uploads and README status updates are skipped.
    no_upload: bool = False
    # Optional quantisation type names; invalid or unsupported names are skipped with a warning.
    custom_profiles: list[str] | None = None
    # Service dependencies with factory defaults
    url_parser: URLParser = field(default_factory=URLParser)
    quantisation_engine: QuantisationEngine = field(default_factory=QuantisationEngine)
    imatrix_handler: IMatrixHandler = field(default_factory=IMatrixHandler)
    imatrix_generator: IMatrixGenerator = field(default_factory=IMatrixGenerator)
    readme_generator: ReadmeGenerator = field(default_factory=ReadmeGenerator)
    uploader: HuggingFaceUploader = field(default_factory=HuggingFaceUploader)
    # Computed properties (excluded from __init__, assigned in __post_init__)
    models_dir: Path = field(init=False)
    model_manager: ModelManager = field(init=False)
    readme_limiter: ReadmeRateLimiter = field(init=False)
def __post_init__(self) -> None:
    """Derive the computed fields once the dataclass fields are populated."""
    models_root = self.work_dir / "models"
    self.models_dir = models_root
    self.model_manager = ModelManager(models_root)
    self.readme_limiter = ReadmeRateLimiter(cooldown_seconds=30.0)
    # Installed last so termination signals are logged before the process exits.
    self._setup_signal_handlers()
def _setup_signal_handlers(self) -> None:
    """Set up signal handlers to catch unexpected exits.

    Installs one handler for SIGINT and SIGTERM that logs the signal and the
    interrupted stack, then exits with status 1.

    ``signal.signal`` may only be called from the main thread and raises
    ``ValueError`` elsewhere. In that case a warning is logged and the
    existing handler is kept, so the orchestrator can still be constructed
    from a worker thread.
    """

    def signal_handler(signum: int, frame: FrameType | None) -> None:
        logger.error(f"❌ Received signal {signum} ({signal.Signals(signum).name})")
        logger.error("Stack trace at signal:")
        if frame:
            for line in traceback.format_stack(frame):
                logger.error(f" {line.strip()}")
        logger.error("Exiting due to signal")
        sys.exit(1)

    # Handle common termination signals
    for sig in (signal.SIGINT, signal.SIGTERM):
        try:
            signal.signal(sig, signal_handler)
        except ValueError as error:
            # Raised when not running in the main thread of the main interpreter.
            logger.warning(f"Could not install handler for {sig.name}: {error}")
def _check_architecture_support(self, f16_model_path: Path) -> bool:
    """Probe llama.cpp to see whether it recognises the model architecture.

    Runs ``llama-quantize`` against the F16 model with ``/dev/null`` as the
    output and a 5 second timeout, then inspects stderr for the
    "unknown model architecture" message.

    Args:
        f16_model_path: Path to the F16 GGUF model

    Returns:
        True if architecture is NOT supported (K-quants should be skipped).
        False otherwise, including when the probe itself fails or times out.
    """
    try:
        probe_command = [
            ".cache/llm-gguf-tools/binaries/llama-quantize",
            str(f16_model_path),
            "/dev/null",
            "Q4_K_M",
        ]
        probe = subprocess.run(
            probe_command,
            check=False,
            capture_output=True,
            text=True,
            timeout=5,
        )
        stderr_text = probe.stderr
        if not stderr_text:
            return False
        return "unknown model architecture" in stderr_text.lower()
    except Exception:
        # Undetermined (missing binary, timeout, ...): assume it might work.
        return False
def get_quantisation_types(self) -> list[QuantisationType]:
    """Get the quantisation types to use for this run.

    Custom profile names are matched case-insensitively after stripping
    surrounding whitespace. Invalid or unsupported names are skipped with a
    warning, and repeated names are kept only once so that no type is
    quantised and uploaded twice.

    Returns:
        List of QuantisationType enums to process. Falls back to
        DEFAULT_QUANTISATION_TYPES when no custom profile is usable.
    """
    if not self.custom_profiles:
        return DEFAULT_QUANTISATION_TYPES

    selected: list[QuantisationType] = []
    for profile_str in self.custom_profiles:
        try:
            profile = QuantisationType(profile_str.strip().upper())
        except ValueError:
            logger.warning(f"Invalid profile {profile_str}, skipping")
            continue
        if profile not in SUPPORTED_QUANTISATION_TYPES:
            logger.warning(f"Profile {profile_str} is not supported, skipping")
        elif profile in selected:
            logger.debug(f"DEBUG: Profile {profile_str} requested more than once, ignoring")
        else:
            selected.append(profile)
    return selected or DEFAULT_QUANTISATION_TYPES
def quantise(self, url: str) -> dict[QuantisationType, QuantisationResult]:
    """Run the full workflow for one model URL: setup, quantise, clean up, summarise.

    Pending README updates are flushed on every exit path.

    Returns:
        dict[QuantisationType, QuantisationResult]: Quantisation results for each type.

    Raises:
        KeyboardInterrupt: If the user interrupts the quantisation process.
    """
    logger.info("Starting Bartowski quantisation process...")
    for startup_line in (
        f"DEBUG: Input URL: {url}",
        f"DEBUG: Working directory: {self.work_dir}",
        f"DEBUG: Use imatrix: {self.use_imatrix}",
        f"DEBUG: No upload: {self.no_upload}",
        f"DEBUG: Custom profiles: {self.custom_profiles}",
    ):
        logger.debug(startup_line)

    try:
        # Stage 1: environment setup and model preparation
        logger.debug("DEBUG: Starting environment setup...")
        prepared = self._setup_environment(url)
        model_source, f16_model_path, imatrix_path, output_repo = prepared
        logger.debug(f"DEBUG: Environment setup complete. F16 model: {f16_model_path}")

        # Stage 2: repository with the planned quantisations
        logger.debug("DEBUG: Creating initial repository...")
        self._create_initial_repository(model_source, output_repo)
        logger.debug("DEBUG: Initial repository created")

        # Stage 3: run every quantisation
        logger.debug("DEBUG: Starting quantisation execution...")
        results = self._execute_quantisations(
            model_source, f16_model_path, imatrix_path, output_repo
        )
        logger.debug(f"DEBUG: Quantisation execution complete. Results: {len(results)} items")

        # Stage 4: remove temporary files and report
        logger.debug("DEBUG: Starting cleanup...")
        self._cleanup_files(f16_model_path, model_source)
        logger.debug("DEBUG: Cleanup complete")
        self._print_completion_summary(model_source, results, output_repo)
    except KeyboardInterrupt:
        logger.error("❌ Process interrupted by user (Ctrl+C)")
        raise
    except Exception as e:
        logger.error(f"❌ Critical error in quantisation workflow: {e}")
        logger.error("Full traceback:")
        for trace_line in traceback.format_exc().splitlines():
            logger.error(f" {trace_line}")
        raise
    else:
        return results
    finally:
        # Always flush pending README updates before exiting
        self.readme_limiter.flush()
def _setup_environment(self, url: str) -> tuple[ModelSource, Path, Path | None, str]:
    """Parse the URL, prepare the F16 model and resolve an optional imatrix.

    Returns:
        Tuple of (model_source, f16_model_path, imatrix_path, output_repo).
        imatrix_path is None when imatrix use is disabled or none was found,
        generated or supplied.
    """
    source = self.url_parser.parse(url)
    self._print_model_info(source)
    self.models_dir.mkdir(parents=True, exist_ok=True)
    f16_path = self.model_manager.prepare_model(source)
    repo = (
        f"{self.uploader.get_username()}/"
        f"{source.original_author}-{source.model_name}-GGUF"
    )

    if not self.use_imatrix:
        return source, f16_path, None, repo

    logger.info("Checking for importance matrix (imatrix)...")
    model_dir = self.models_dir / source.model_name
    imatrix = self.imatrix_handler.find_imatrix(model_dir)
    # Nothing on disk: first offer to generate one ...
    if not imatrix:
        imatrix = self.imatrix_generator.prompt_for_generation(source, model_dir, f16_path)
        # ... and if that was skipped, offer to supply an existing one.
        if not imatrix:
            imatrix = self.imatrix_handler.prompt_for_user_imatrix(model_dir)
    return source, f16_path, imatrix, repo
def _create_initial_repository(self, model_source: ModelSource, output_repo: str) -> None:
    """Generate a README listing every planned quantisation and upload it unless disabled."""
    logger.info("Creating initial README with planned quantisations...")
    planned_results = {}
    for planned_type in self.get_quantisation_types():
        planned_results[planned_type] = QuantisationResult(
            quantisation_type=planned_type, success=False, status="planned"
        )
    readme_path = self.readme_generator.generate(
        model_source, planned_results, self.models_dir, output_repo
    )
    if self.no_upload:
        logger.info("Skipping repository creation (--no-upload specified)")
        return
    logger.info("Creating repository with planned quantisations...")
    self.uploader.upload_readme(output_repo, readme_path)
def _execute_quantisations(
    self,
    model_source: ModelSource,
    f16_model_path: Path,
    imatrix_path: Path | None,
    output_repo: str,
) -> dict[QuantisationType, QuantisationResult]:
    """Execute all quantisation types with parallel uploads.

    Quantisation runs sequentially on the calling thread while finished files
    are uploaded by a two-worker thread pool. If llama.cpp does not recognise
    the architecture (detected by the upfront probe or reported by the engine
    mid-run), every pending non-basic type is recorded as failed and skipped.

    Args:
        model_source: Parsed description of the source model.
        f16_model_path: Path to the F16 GGUF used as quantisation input.
        imatrix_path: Optional importance matrix passed to each quantisation.
        output_repo: Destination repository identifier.

    Returns:
        dict[QuantisationType, QuantisationResult]: Quantisation results for each type.
    """
    results: dict[QuantisationType, QuantisationResult] = {}
    quantisation_types = self.get_quantisation_types()
    types_list = [qt.value for qt in quantisation_types]
    logger.info(f"Processing {len(quantisation_types)} quantisation types: {types_list}")

    # Types that are still attempted when the architecture is unsupported.
    basic_types = frozenset({"Q4_0", "Q5_0", "Q6_0", "Q8_0"})

    def mark_k_quants_failed(candidates: list[QuantisationType]) -> None:
        """Record every not-yet-tracked non-basic type in `candidates` as failed."""
        for candidate in candidates:
            if candidate in results or candidate.value in basic_types:
                continue
            results[candidate] = QuantisationResult(
                quantisation_type=candidate,
                success=False,
                status="failed",
                error_message="K-quant requires llama.cpp architecture support",
            )

    # Check architecture support upfront
    if self._check_architecture_support(f16_model_path):
        logger.warning("⚠️ Architecture not supported by llama.cpp - K-quants will be skipped")
        logger.info("💡 Basic types (Q4_0, Q5_0, Q6_0, Q8_0) will still be generated")
        # Pre-mark all K-quants as skipped
        mark_k_quants_failed(quantisation_types)

    # Track F16 in results for status display (if we converted from SafeTensors)
    if not model_source.is_gguf_repo:
        # Get F16 file size
        f16_size = "-"
        if f16_model_path.exists():
            size_bytes = f16_model_path.stat().st_size
            size_gb = size_bytes / (1024**3)
            f16_size = f"{size_gb:.1f}GB"
        # Lightweight stand-in rather than a QuantisationResult; it exposes the
        # same attribute names the rest of this class reads and writes.
        f16_result = type(
            "F16Result",
            (),
            {
                "quantisation_type": "F16",
                "success": True,
                "status": "planned",
                "file_path": f16_model_path,
                "file_size": f16_size,
                "error_message": None,
            },
        )()
        results[QuantisationType.F16] = f16_result

    # Process with parallel uploads - quantise sequentially but upload in background
    upload_futures: list[Any] = []
    with ThreadPoolExecutor(max_workers=2, thread_name_prefix="upload") as upload_executor:
        # Start F16 upload first if we have one
        if (
            not model_source.is_gguf_repo
            and not self.no_upload
            and QuantisationType.F16 in results
        ):
            f16_result = results[QuantisationType.F16]
            if f16_result.file_path and f16_result.file_path.exists():
                logger.info("Starting parallel upload of F16 GGUF...")
                f16_result.status = "uploading"
                self._update_readme_status(model_source, results, output_repo)
                upload_future = upload_executor.submit(
                    self._upload_f16_and_cleanup,
                    output_repo,
                    f16_result.file_path,
                    model_source,
                    results,
                )
                upload_futures.append(upload_future)

        for i, quant_type in enumerate(quantisation_types, 1):
            # Skip if already marked as failed (e.g., K-quants for unsupported arch)
            if quant_type in results and results[quant_type].status == "failed":
                logger.info(
                    f"Skipping {quant_type.value} - {results[quant_type].error_message}"
                )
                continue
            logger.info(
                f"Processing quantisation {i}/{len(quantisation_types)}: {quant_type.value}"
            )
            logger.debug(f"DEBUG: Starting quantisation {i}/{len(quantisation_types)}")
            logger.debug(f"DEBUG: Current type: {quant_type.value}")
            logger.debug(f"DEBUG: Results so far: {len(results)} completed")
            try:
                result = self._process_single_quantisation(
                    quant_type,
                    model_source,
                    f16_model_path,
                    imatrix_path,
                    output_repo,
                    results,
                    upload_executor,
                    upload_futures,
                )
                results[quant_type] = result
                logger.debug(f"DEBUG: Quantisation {quant_type.value} completed")
                # Check if this failed due to unsupported architecture
                if (
                    not result.success
                    and hasattr(self.quantisation_engine.executor, "last_error")
                    and self.quantisation_engine.executor.last_error
                    == "unsupported_architecture"
                ):
                    logger.warning(
                        "⚠️ Architecture not supported by llama.cpp - K-quants will be skipped"
                    )
                    logger.info(
                        "💡 Basic types (Q4_0, Q5_0, Q6_0, Q8_0) will still be generated"
                    )
                    # Update the current result to also show as skipped
                    result.error_message = "Architecture not supported by llama.cpp"
                    # `i` is 1-based, so the slice covers exactly the types after
                    # the current one; basic types are left to run.
                    mark_k_quants_failed(quantisation_types[i:])
                    # Update README immediately to show remaining K-quants as skipped
                    self._update_readme_status(model_source, results, output_repo)
                # Force cleanup between quantisations
                gc.collect()
                logger.debug("DEBUG: Garbage collection completed")
            except Exception as e:
                logger.error(f"❌ Critical error processing {quant_type.value}: {e}")
                logger.error("Exception traceback:")
                for line in traceback.format_exc().splitlines():
                    logger.error(f" {line}")
                results[quant_type] = QuantisationResult(
                    quantisation_type=quant_type,
                    success=False,
                    status="failed",
                    error_message=str(e),
                )
                # Force cleanup after error
                gc.collect()

        # Wait for all uploads to complete before returning
        self._wait_for_uploads(upload_futures)

    # Final README update to ensure all statuses are accurate
    if not self.no_upload and upload_futures:
        logger.info("Updating README with final status...")
        final_readme = self.readme_generator.generate(
            model_source, results, self.models_dir, output_repo
        )
        self.uploader.upload_readme(output_repo, final_readme)
    return results
def _process_single_quantisation(
    self,
    quant_type: QuantisationType,
    model_source: ModelSource,
    f16_model_path: Path,
    imatrix_path: Path | None,
    output_repo: str,
    results: dict[QuantisationType, QuantisationResult],
    upload_executor: ThreadPoolExecutor,
    upload_futures: list,
) -> QuantisationResult:
    """Process a single quantisation type.

    Publishes a "processing" placeholder, runs the quantisation engine, stores
    the engine's result in `results`, then hands it over for background upload
    (or failure marking).

    Args:
        quant_type: Quantisation type to produce.
        model_source: Parsed description of the source model.
        f16_model_path: Path to the F16 GGUF input.
        imatrix_path: Optional importance matrix.
        output_repo: Destination repository identifier.
        results: Shared status mapping; updated in place.
        upload_executor: Executor that runs background uploads.
        upload_futures: List that collects the submitted upload futures.

    Returns:
        QuantisationResult: Result of the quantisation attempt.
    """
    try:
        logger.info(f"Starting {quant_type.value} quantisation...")
        logger.debug(f"DEBUG: Getting config for {quant_type.value}")
        config = QUANTISATION_CONFIGS[quant_type]
        logger.debug(f"DEBUG: Config loaded: {config.name}")
        # Update status to processing
        logger.debug("DEBUG: Creating initial quantisation result")
        result = QuantisationResult(quantisation_type=quant_type, success=False)
        result.status = "processing"
        results[quant_type] = result
        logger.debug("DEBUG: Updating README status")
        self._update_readme_status(model_source, results, output_repo)
        # Perform quantisation
        logger.debug("DEBUG: Creating quantisation context")
        context = QuantisationContext(
            f16_model_path=f16_model_path,
            model_source=model_source,
            config=config,
            models_dir=self.models_dir,
            imatrix_path=imatrix_path,
        )
        logger.debug(f"DEBUG: Context created. F16 path: {f16_model_path}")
        logger.debug(f"DEBUG: imatrix path: {imatrix_path}")
        logger.debug("DEBUG: Calling quantisation engine...")
        result = self.quantisation_engine.quantise(context)
        logger.debug(f"DEBUG: Quantisation engine returned: success={result.success}")
        # Swap the placeholder for the engine's result before anything reads
        # `results` again: the README refresh and the background upload thread
        # both look the entry up by type and would otherwise see (and mutate)
        # the stale "processing" object.
        results[quant_type] = result
        self._handle_quantisation_result(
            result,
            quant_type,
            model_source,
            results,
            output_repo,
            upload_executor,
            upload_futures,
        )
    except Exception as e:
        return self._handle_quantisation_error(
            e, quant_type, model_source, results, output_repo
        )
    else:
        return result
def _process_single_quantisation_sequential(
    self,
    quant_type: QuantisationType,
    model_source: ModelSource,
    f16_model_path: Path,
    imatrix_path: Path | None,
    output_repo: str,
    results: dict[QuantisationType, QuantisationResult],
) -> QuantisationResult:
    """Process a single quantisation type sequentially with immediate upload.

    The engine's result (or the failed result built on error) is stored in
    `results` before any README refresh, so the published status reflects the
    real outcome and not the "processing" placeholder.

    Args:
        quant_type: Quantisation type to produce.
        model_source: Parsed description of the source model.
        f16_model_path: Path to the F16 GGUF input.
        imatrix_path: Optional importance matrix.
        output_repo: Destination repository identifier.
        results: Shared status mapping; updated in place.

    Returns:
        QuantisationResult: Result of the quantisation attempt.
    """
    # Force cleanup before starting new quantisation
    gc.collect()
    # Log system state before quantisation. This is diagnostics only, so a
    # psutil failure must not abort the run.
    try:
        process = psutil.Process()
        logger.debug(f"DEBUG: === System state before {quant_type.value} ===")
        logger.debug(f"DEBUG: Process alive: {process.is_running()}")
        logger.debug(f"DEBUG: PID: {process.pid}")
        logger.debug(f"DEBUG: Memory: {process.memory_info().rss / (1024**3):.2f} GB")
        logger.debug(f"DEBUG: CPU percent: {process.cpu_percent()}%")
        logger.debug(f"DEBUG: Threads: {process.num_threads()}")
        logger.debug(f"DEBUG: Open files: {len(process.open_files())}")
    except psutil.Error as diagnostics_error:
        logger.debug(f"DEBUG: Could not collect system state: {diagnostics_error}")
    try:
        logger.info(f"Starting {quant_type.value} quantisation...")
        logger.debug(f"DEBUG: Getting config for {quant_type.value}")
        config = QUANTISATION_CONFIGS[quant_type]
        logger.debug(f"DEBUG: Config loaded: {config.name}")
        # Update status to processing
        logger.debug("DEBUG: Creating initial quantisation result")
        result = QuantisationResult(quantisation_type=quant_type, success=False)
        result.status = "processing"
        results[quant_type] = result
        logger.debug("DEBUG: Updating README status")
        self._update_readme_status(model_source, results, output_repo)
        # Perform quantisation
        logger.debug("DEBUG: Creating quantisation context")
        context = QuantisationContext(
            f16_model_path=f16_model_path,
            model_source=model_source,
            config=config,
            models_dir=self.models_dir,
            imatrix_path=imatrix_path,
        )
        logger.debug(f"DEBUG: Context created. F16 path: {f16_model_path}")
        logger.debug(f"DEBUG: imatrix path: {imatrix_path}")
        logger.debug("DEBUG: Calling quantisation engine...")
        result = self.quantisation_engine.quantise(context)
        logger.debug(f"DEBUG: Quantisation engine returned: success={result.success}")
        # Replace the placeholder so later README refreshes see this result.
        results[quant_type] = result
        if result.success and result.file_path:
            # Upload immediately (if not in no-upload mode)
            if not self.no_upload:
                logger.info(f"Uploading {quant_type.value}...")
                try:
                    self.uploader.upload_model_file(output_repo, result.file_path)
                    logger.info(f"Upload of {quant_type.value} completed successfully")
                    # Clean up file after successful upload
                    logger.info(f"Removing {result.file_path.name} to save disk space...")
                    result.file_path.unlink()
                    result.status = "completed"
                    self._update_readme_status(model_source, results, output_repo)
                except Exception as upload_error:
                    logger.error(f"Failed to upload {quant_type.value}: {upload_error}")
                    result.status = "failed"
                    result.error_message = str(upload_error)
                    self._update_readme_status(model_source, results, output_repo)
                    # Keep file if upload failed
            else:
                # No upload mode - just mark as completed
                result.status = "completed"
                logger.info(f"Skipping upload of {quant_type.value} (--no-upload specified)")
        else:
            result.status = "failed"
            self._update_readme_status(model_source, results, output_repo)
    except Exception as e:
        logger.error(f"Error processing {quant_type.value}: {e}")
        result = QuantisationResult(quantisation_type=quant_type, success=False)
        result.status = "failed"
        result.error_message = str(e)
        # Publish the failure, not whatever placeholder was there before.
        results[quant_type] = result
        try:
            self._update_readme_status(model_source, results, output_repo)
        except Exception as readme_error:
            logger.error(f"Failed to update README after error: {readme_error}")
        # Force cleanup after error
        gc.collect()
        return result
    else:
        # Force cleanup after quantisation
        gc.collect()
        return result
def _handle_quantisation_result(
    self,
    result: QuantisationResult,
    quant_type: QuantisationType,
    model_source: ModelSource,
    results: dict[QuantisationType, QuantisationResult],
    output_repo: str,
    upload_executor: ThreadPoolExecutor,
    upload_futures: list,
) -> None:
    """Handle successful or failed quantisation result.

    A successful result with a file is marked "uploading" and its upload is
    submitted to `upload_executor`; anything else is marked "failed". In both
    cases `result` is stored in `results` and a README refresh is requested.

    Args:
        result: Outcome returned by the quantisation engine; mutated in place.
        quant_type: Key under which `result` is tracked.
        model_source: Parsed description of the source model.
        results: Shared status mapping; updated in place.
        output_repo: Destination repository identifier.
        upload_executor: Executor that runs the background upload.
        upload_futures: List that collects the submitted upload future.
    """
    # Register the result first: the upload worker and the README refresh both
    # look the entry up in `results` by type.
    results[quant_type] = result
    if result.success and result.file_path:
        quant_str = getattr(result.quantisation_type, "value", result.quantisation_type)
        logger.info(f"Starting parallel upload of {quant_str}...")
        upload_path = result.file_path
        # Flip the state BEFORE submitting. The worker may finish (or fail)
        # immediately, and its "completed"/"failed" status must not be
        # overwritten by a late "uploading".
        result.file_path = None  # Mark as being uploaded
        result.status = "uploading"
        try:
            upload_future = upload_executor.submit(
                self._upload_and_cleanup,
                output_repo,
                upload_path,
                quant_type,
                model_source,
                results,
            )
        except Exception:
            # Nothing was scheduled; keep the file reference for the caller.
            result.file_path = upload_path
            raise
        upload_futures.append(upload_future)
    else:
        result.status = "failed"
    self._update_readme_status(model_source, results, output_repo)
def _handle_quantisation_error(
    self,
    error: Exception,
    quant_type: QuantisationType,
    model_source: ModelSource,
    results: dict[QuantisationType, QuantisationResult],
    output_repo: str,
) -> QuantisationResult:
    """Handle quantisation processing error.

    Builds a failed result carrying the error text, stores it in `results` and
    requests a README refresh. A failure of the refresh itself is only logged.

    Args:
        error: Exception raised while processing `quant_type`.
        quant_type: Quantisation type that failed.
        model_source: Parsed description of the source model.
        results: Shared status mapping; updated in place.
        output_repo: Destination repository identifier.

    Returns:
        QuantisationResult: Failed quantisation result with error information.
    """
    logger.error(f"Error processing {quant_type.value}: {error}")
    result = QuantisationResult(quantisation_type=quant_type, success=False)
    result.status = "failed"
    result.error_message = str(error)
    # Store the failure before refreshing the README; otherwise the refresh
    # would render whatever entry (typically "processing") was there before.
    results[quant_type] = result
    try:
        self._update_readme_status(model_source, results, output_repo)
    except Exception as readme_error:
        logger.error(f"Failed to update README after error: {readme_error}")
    return result
def _update_readme_status(
    self,
    model_source: ModelSource,
    results: dict[QuantisationType, QuantisationResult],
    output_repo: str,
) -> None:
    """Request a rate-limited README refresh; does nothing in no-upload mode."""
    if self.no_upload:
        return
    # The limiter decides when `_do_readme_update` actually runs.
    refresh_args = (model_source, results, output_repo)
    self.readme_limiter.request_update(self._do_readme_update, *refresh_args)
def _do_readme_update(
    self,
    model_source: ModelSource,
    results: dict[QuantisationType, QuantisationResult],
    output_repo: str,
) -> None:
    """Regenerate the README from `results` and upload it (rate-limiter callback)."""
    readme_file = self.readme_generator.generate(
        model_source,
        results,
        self.models_dir,
        output_repo,
    )
    publisher = self.uploader
    publisher.upload_readme(output_repo, readme_file)
def _wait_for_uploads(self, upload_futures: list, timeout: float | None = 300) -> None:
    """Wait for all parallel uploads to complete.

    Args:
        upload_futures: Futures returned by the upload executor.
        timeout: Seconds to wait for each future (default 300, i.e. 5 minutes);
            None waits indefinitely.

    A future that does not finish in time is counted as failed and reported
    as a timeout. This method does not cancel it.
    """
    if not upload_futures:
        return

    # Local import: on older Python versions this is a different class from
    # the builtin TimeoutError, so catching the builtin would miss it.
    from concurrent.futures import TimeoutError as FutureTimeoutError

    total = len(upload_futures)
    logger.info(f"Waiting for {total} uploads to complete...")
    completed = 0
    failed = 0
    for future in upload_futures:
        try:
            future.result(timeout=timeout)
        except FutureTimeoutError:
            failed += 1
            # str() of a timeout error is empty, so spell the reason out.
            logger.warning(
                f"Upload error ({completed + failed}/{total}): "
                f"no result within {timeout} seconds"
            )
        except Exception as e:
            failed += 1
            logger.warning(f"Upload error ({completed + failed}/{total}): {e}")
        else:
            completed += 1
            logger.info(f"Upload progress: {completed}/{total} completed")
    if failed > 0:
        logger.warning(f"Upload summary: {completed} succeeded, {failed} failed")
    else:
        logger.info(f"All {completed} uploads completed successfully")
def _cleanup_files(self, f16_model_path: Path, model_source: ModelSource) -> None:
    """Delete the F16 model if present, then the original weights for converted models."""
    f16_present = f16_model_path.exists()
    if f16_present:
        logger.info(f"Removing F16 model {f16_model_path.name} to save disk space...")
        f16_model_path.unlink()
    # Direct GGUF downloads have no original weights to remove.
    if model_source.is_gguf_repo:
        return
    self._cleanup_original_model(model_source)
def _cleanup_original_model(self, model_source: ModelSource) -> None:
    """Delete ``pytorch_model*.bin`` files from the model directory; keep everything else."""
    model_dir = self.models_dir / model_source.model_name
    pytorch_files = [*model_dir.glob("pytorch_model*.bin")]
    if pytorch_files:
        logger.info(f"Removing {len(pytorch_files)} PyTorch model files to save disk space...")
        for weight_file in pytorch_files:
            weight_file.unlink()
    logger.info("Keeping config files, tokeniser, and metadata for reference")
def _upload_and_cleanup(
    self,
    output_repo: str,
    file_path: Path,
    quant_type: QuantisationType,
    model_source: ModelSource,
    results: dict[QuantisationType, QuantisationResult],
) -> None:
    """Upload file and clean up (runs in background thread).

    The status in `results` reflects the model upload only. A failure to
    delete the local file or to refresh the README afterwards is logged but
    does not turn a successful upload into a "failed" entry.

    Args:
        output_repo: Destination repository identifier.
        file_path: Quantised model file to upload and then delete.
        quant_type: Key of the entry in `results` to update.
        model_source: Parsed description of the source model.
        results: Shared status mapping; updated in place.
    """
    label = quant_type.value
    uploaded = False
    try:
        logger.info(f"[PARALLEL] Starting upload of {label} ({file_path.name})")
        self.uploader.upload_model_file(output_repo, file_path)
        logger.info(f"[PARALLEL] Upload of {label} completed successfully")
        uploaded = True
    except Exception as e:
        logger.error(f"[PARALLEL] Failed to upload {label}: {e}")
        results[quant_type].status = "failed"
        results[quant_type].error_message = str(e)

    if uploaded:
        logger.info(f"[PARALLEL] Removing {file_path.name} to save disk space...")
        try:
            file_path.unlink()
        except OSError as cleanup_error:
            # The upload itself succeeded; only disk space is affected.
            logger.warning(f"[PARALLEL] Could not remove {file_path.name}: {cleanup_error}")
        results[quant_type].status = "completed"

    try:
        self._do_readme_update(model_source, results, output_repo)
    except Exception as readme_error:
        if uploaded:
            logger.error(
                f"[PARALLEL] Failed to update README after upload of {label}: {readme_error}"
            )
        else:
            logger.error(
                f"[PARALLEL] Failed to update README after upload error: {readme_error}"
            )
    else:
        if uploaded:
            logger.info(f"[PARALLEL] {label} upload and cleanup complete")
    # Don't re-raise - let other uploads continue
def _upload_f16_and_cleanup(
    self,
    output_repo: str,
    file_path: Path,
    model_source: ModelSource,
    results: dict[QuantisationType, QuantisationResult],
) -> None:
    """Upload F16 file (runs in background thread).

    Despite the name, the F16 file is not deleted here: it is still the input
    for the remaining quantisations and is removed later by `_cleanup_files`.

    The F16 status in `results` reflects the model upload only; a failing
    README refresh afterwards is logged and does not mark the upload failed.

    Args:
        output_repo: Destination repository identifier.
        file_path: F16 GGUF file to upload.
        model_source: Parsed description of the source model.
        results: Shared status mapping; the F16 entry is updated in place.
    """
    uploaded = False
    try:
        logger.info(f"[PARALLEL] Starting upload of F16 GGUF ({file_path.name})")
        self.uploader.upload_model_file(output_repo, file_path)
        logger.info("[PARALLEL] Upload of F16 GGUF completed successfully")
        uploaded = True
    except Exception as e:
        logger.error(f"[PARALLEL] Failed to upload F16: {e}")
        results[QuantisationType.F16].status = "failed"
        results[QuantisationType.F16].error_message = str(e)

    if uploaded:
        # Don't delete F16 yet - we still need it for quantisations
        results[QuantisationType.F16].status = "completed"

    try:
        self._do_readme_update(model_source, results, output_repo)
    except Exception as readme_error:
        if uploaded:
            logger.error(
                f"[PARALLEL] Failed to update README after F16 upload: {readme_error}"
            )
        else:
            logger.error(
                f"[PARALLEL] Failed to update README after F16 upload error: {readme_error}"
            )
    else:
        if uploaded:
            logger.info("[PARALLEL] F16 upload complete")
    # Don't re-raise - let other uploads continue
def _print_model_info(self, model_source: ModelSource) -> None:
    """Log the source model details, the uploader username and the working directory."""

    def info_lines():
        # Lazily formatted so each value is read right before its line is logged.
        yield f"Source URL: {model_source.url}"
        yield f"Source model: {model_source.source_model}"
        yield f"Original author: {model_source.original_author}"
        yield f"Model name: {model_source.model_name}"
        yield f"Your HF username: {self.uploader.get_username()}"
        yield f"Working directory: {self.work_dir}"

    for info_line in info_lines():
        logger.info(info_line)
def _print_completion_summary(
    self,
    model_source: ModelSource,
    results: dict[QuantisationType, QuantisationResult],
    output_repo: str,
) -> None:
    """Log the final report: repository link and produced files, or an all-failed error."""
    repo_url = f"https://huggingface.co/{output_repo}"
    successful_results = [entry for entry in results.values() if entry.success]

    if not successful_results:
        logger.error(
            "All quantisations failed - repository created with documentation "
            "but no model files"
        )
        logger.error(f" Repository: {repo_url}")
        return

    logger.info("Complete! Your quantised models are available at:")
    logger.info(f" {repo_url}")
    logger.info("Model info:")
    logger.info(f" - Source URL: {model_source.url}")
    logger.info(f" - Original: {model_source.source_model}")
    if model_source.is_gguf_repo:
        method = "Direct GGUF download"
    else:
        method = "HF model conversion"
    logger.info(" - Method: " f"{method}")
    logger.info(f" - Quantised: {output_repo}")
    for entry in successful_results:
        # Entries without a recorded size are not listed.
        if not entry.file_size:
            continue
        filename = (
            f"{model_source.original_author}-{model_source.model_name}-"
            f"{entry.quantisation_type}.gguf"
        )
        logger.info(f" - {entry.quantisation_type}: {filename} ({entry.file_size})")