# llm-gguf-tools/helpers/services/orchestrator.py

"""Quantisation orchestration service.

High-level orchestration of the complete quantisation workflow from model
acquisition through processing to upload. Manages parallel processing,
status tracking, and cleanup operations for efficient resource utilisation.
"""

from __future__ import annotations

import gc
import signal
import subprocess
import sys
import traceback
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass, field
from pathlib import Path
from typing import TYPE_CHECKING

import psutil

from helpers.config.quantisation_configs import (
    DEFAULT_QUANTISATION_TYPES,
    QUANTISATION_CONFIGS,
    SUPPORTED_QUANTISATION_TYPES,
)
from helpers.logger import logger
from helpers.models.quantisation import (
    ModelSource,
    QuantisationContext,
    QuantisationResult,
    QuantisationType,
)
from helpers.services.huggingface import ReadmeGenerator
from helpers.services.imatrix_generator import IMatrixGenerator
from helpers.services.llama_cpp import IMatrixHandler
from helpers.services.quantisation import HuggingFaceUploader, ModelManager, QuantisationEngine
from helpers.utils.rate_limiter import ReadmeRateLimiter
from helpers.utils.tensor_mapping import URLParser

if TYPE_CHECKING:
    from types import FrameType
    from typing import Any


@dataclass(slots=True)
class QuantisationOrchestrator:
    """Orchestrates the complete quantisation workflow.

    Uses dataclass with slots for efficient memory usage and dependency injection
    for modular service interaction following SOLID principles.

    The first group of fields are user-facing options, the second group are
    injectable service objects (each built by its default factory when not
    supplied), and the last group is excluded from ``__init__`` and assigned
    in ``__post_init__``.
    """

    # Root working directory; defaults to "<current working dir>/quantisation_work".
    work_dir: Path = field(default_factory=lambda: Path.cwd() / "quantisation_work")
    # When True, environment setup looks for (or offers to create) an imatrix file.
    use_imatrix: bool = True
    # When True, repository creation, README uploads and model uploads are skipped.
    no_upload: bool = False
    # Optional quantisation type names; parsed by get_quantisation_types().
    custom_profiles: list[str] | None = None

    # Service dependencies with factory defaults
    url_parser: URLParser = field(default_factory=URLParser)
    quantisation_engine: QuantisationEngine = field(default_factory=QuantisationEngine)
    imatrix_handler: IMatrixHandler = field(default_factory=IMatrixHandler)
    imatrix_generator: IMatrixGenerator = field(default_factory=IMatrixGenerator)
    readme_generator: ReadmeGenerator = field(default_factory=ReadmeGenerator)
    uploader: HuggingFaceUploader = field(default_factory=HuggingFaceUploader)

    # Derived fields (init=False): populated in __post_init__ from work_dir.
    models_dir: Path = field(init=False)
    model_manager: ModelManager = field(init=False)
    readme_limiter: ReadmeRateLimiter = field(init=False)

    def __post_init__(self) -> None:
        """Populate the derived fields and install the exit-tracking signal handlers.

        The models directory lives under ``work_dir``; the model manager is
        bound to that directory and README updates are throttled with a
        30 second cooldown.
        """
        models_root = self.work_dir / "models"

        self.models_dir = models_root
        self.model_manager = ModelManager(models_root)
        self.readme_limiter = ReadmeRateLimiter(cooldown_seconds=30.0)

        # Installed last so the handlers are registered on a fully built instance.
        self._setup_signal_handlers()

    def _setup_signal_handlers(self) -> None:
        """Install SIGINT/SIGTERM handlers that log the interrupted stack and exit.

        The handler logs the signal name and the stack at the point of
        interruption, then terminates the process via ``sys.exit(1)``.

        ``signal.signal()`` may only be called from the main thread; when the
        orchestrator is constructed on another thread the resulting
        ``ValueError`` is logged and the existing handlers are left in place
        rather than aborting construction.
        """

        def signal_handler(signum: int, frame: FrameType | None) -> None:
            logger.error(f"❌ Received signal {signum} ({signal.Signals(signum).name})")
            logger.error("Stack trace at signal:")
            if frame:
                for line in traceback.format_stack(frame):
                    logger.error(f"  {line.strip()}")
            logger.error("Exiting due to signal")
            sys.exit(1)

        # Handle common termination signals
        for sig in (signal.SIGINT, signal.SIGTERM):
            try:
                signal.signal(sig, signal_handler)
            except ValueError as e:
                # Raised when not running on the main thread of the main interpreter.
                logger.warning(f"Could not install handler for {sig.name}: {e}")

    def _check_architecture_support(self, f16_model_path: Path) -> bool:
        """Check if the model architecture is supported by llama.cpp.

        Args:
            f16_model_path: Path to the F16 GGUF model

        Returns:
            True if architecture is NOT supported (K-quants should be skipped)
        """
        try:
            # Try a simple quantization with llama.cpp to check support
            result = subprocess.run(
                [
                    ".cache/llm-gguf-tools/binaries/llama-quantize",
                    str(f16_model_path),
                    "/dev/null",
                    "Q4_K_M",
                ],
                check=False,
                capture_output=True,
                text=True,
                timeout=5,
            )

            # Check if it failed due to unknown architecture
            return bool(result.stderr and "unknown model architecture" in result.stderr.lower())
        except Exception:
            # If we can't determine, assume it might work
            return False

    def get_quantisation_types(self) -> list[QuantisationType]:
        """Resolve which quantisation types this run should produce.

        Custom profile names are upper-cased and converted to
        ``QuantisationType``; names that are not valid enum values or are not
        in ``SUPPORTED_QUANTISATION_TYPES`` are skipped with a warning.

        Returns:
            The accepted custom types, or ``DEFAULT_QUANTISATION_TYPES`` when
            no custom profiles were given or none of them was accepted.
        """
        requested = self.custom_profiles
        if not requested:
            return DEFAULT_QUANTISATION_TYPES

        accepted = []
        for name in requested:
            try:
                candidate = QuantisationType(name.upper())
            except ValueError:
                logger.warning(f"Invalid profile {name}, skipping")
                continue

            if candidate not in SUPPORTED_QUANTISATION_TYPES:
                logger.warning(f"Profile {name} is not supported, skipping")
                continue

            accepted.append(candidate)

        return accepted if accepted else DEFAULT_QUANTISATION_TYPES

    def quantise(self, url: str) -> dict[QuantisationType, QuantisationResult]:
        """Run the whole workflow for one model URL: setup, quantise, clean up, summarise.

        Pending README updates held by the rate limiter are flushed on every
        exit path, including failures.

        Args:
            url: Model URL handed to the URL parser.

        Returns:
            dict[QuantisationType, QuantisationResult]: Quantisation results for each type.

        Raises:
            KeyboardInterrupt: If the user interrupts the quantisation process.
            Exception: Any other workflow error is logged with its traceback
                and re-raised unchanged.
        """
        logger.info("Starting Bartowski quantisation process...")
        startup_details = (
            ("Input URL", url),
            ("Working directory", self.work_dir),
            ("Use imatrix", self.use_imatrix),
            ("No upload", self.no_upload),
            ("Custom profiles", self.custom_profiles),
        )
        for label, value in startup_details:
            logger.debug(f"DEBUG: {label}: {value}")

        try:
            # Stage 1: resolve the model and everything needed to process it.
            logger.debug("DEBUG: Starting environment setup...")
            prepared = self._setup_environment(url)
            model_source, f16_model_path, imatrix_path, output_repo = prepared
            logger.debug(f"DEBUG: Environment setup complete. F16 model: {f16_model_path}")

            # Stage 2: publish the plan.
            logger.debug("DEBUG: Creating initial repository...")
            self._create_initial_repository(model_source, output_repo)
            logger.debug("DEBUG: Initial repository created")

            # Stage 3: do the actual work.
            logger.debug("DEBUG: Starting quantisation execution...")
            results = self._execute_quantisations(
                model_source, f16_model_path, imatrix_path, output_repo
            )
            logger.debug(f"DEBUG: Quantisation execution complete. Results: {len(results)} items")

            # Stage 4: reclaim disk space and report.
            logger.debug("DEBUG: Starting cleanup...")
            self._cleanup_files(f16_model_path, model_source)
            logger.debug("DEBUG: Cleanup complete")

            self._print_completion_summary(model_source, results, output_repo)
        except KeyboardInterrupt:
            logger.error("❌ Process interrupted by user (Ctrl+C)")
            raise
        except Exception as e:
            logger.error(f"❌ Critical error in quantisation workflow: {e}")
            logger.error("Full traceback:")
            traceback_lines = traceback.format_exc().splitlines()
            for traceback_line in traceback_lines:
                logger.error(f"  {traceback_line}")
            raise
        finally:
            # Batched README updates must not be lost, whatever the outcome.
            self.readme_limiter.flush()

        return results

    def _setup_environment(self, url: str) -> tuple[ModelSource, Path, Path | None, str]:
        """Parse the URL, prepare the F16 model and resolve the optional imatrix.

        The output repository is named
        ``<uploader username>/<original author>-<model name>-GGUF``. When
        imatrix use is enabled, an imatrix is looked up in the model
        directory; if none is found the user is offered generation first and
        then the chance to supply an existing file.

        Args:
            url: Model URL handed to the URL parser.

        Returns:
            Tuple of (model_source, f16_model_path, imatrix_path, output_repo);
            ``imatrix_path`` is None when imatrix use is disabled.
        """
        model_source = self.url_parser.parse(url)
        self._print_model_info(model_source)

        self.models_dir.mkdir(parents=True, exist_ok=True)
        f16_model_path = self.model_manager.prepare_model(model_source)

        username = self.uploader.get_username()
        output_repo = f"{username}/{model_source.original_author}-{model_source.model_name}-GGUF"

        if not self.use_imatrix:
            return model_source, f16_model_path, None, output_repo

        logger.info("Checking for importance matrix (imatrix)...")
        model_dir = self.models_dir / model_source.model_name

        # Short-circuit chain: existing file, then generation, then user-supplied.
        imatrix_path = (
            self.imatrix_handler.find_imatrix(model_dir)
            or self.imatrix_generator.prompt_for_generation(
                model_source, model_dir, f16_model_path
            )
            or self.imatrix_handler.prompt_for_user_imatrix(model_dir)
        )

        return model_source, f16_model_path, imatrix_path, output_repo

    def _create_initial_repository(self, model_source: ModelSource, output_repo: str) -> None:
        """Generate a README listing every planned quantisation and publish it.

        Each selected quantisation type is represented by an unsuccessful
        result with status "planned". The README is always generated; it is
        only uploaded when uploads are enabled.

        Args:
            model_source: Parsed description of the source model.
            output_repo: Target repository identifier.
        """
        logger.info("Creating initial README with planned quantisations...")

        planned_results = {}
        for quant_type in self.get_quantisation_types():
            planned_results[quant_type] = QuantisationResult(
                quantisation_type=quant_type, success=False, status="planned"
            )

        readme_path = self.readme_generator.generate(
            model_source, planned_results, self.models_dir, output_repo
        )

        if self.no_upload:
            logger.info("Skipping repository creation (--no-upload specified)")
            return

        logger.info("Creating repository with planned quantisations...")
        self.uploader.upload_readme(output_repo, readme_path)

    @staticmethod
    def _mark_k_quants_skipped(
        candidates: list[QuantisationType],
        results: dict[QuantisationType, QuantisationResult],
    ) -> None:
        """Record a failed result for each untracked, non-basic type in ``candidates``."""
        basic_types = ("Q4_0", "Q5_0", "Q6_0", "Q8_0")
        for quant_type in candidates:
            if quant_type in results or quant_type.value in basic_types:
                continue
            results[quant_type] = QuantisationResult(
                quantisation_type=quant_type,
                success=False,
                status="failed",
                error_message="K-quant requires llama.cpp architecture support",
            )

    def _execute_quantisations(
        self,
        model_source: ModelSource,
        f16_model_path: Path,
        imatrix_path: Path | None,
        output_repo: str,
    ) -> dict[QuantisationType, QuantisationResult]:
        """Execute all quantisation types with parallel uploads.

        Quantisations run one after another on the calling thread while
        finished files are uploaded by a two-worker thread pool. If llama.cpp
        does not recognise the architecture (detected by an upfront probe or
        reported by the engine's executor after a failed run), every
        non-basic type that has no result yet is recorded as failed and
        skipped. An exception while processing one type is logged and
        recorded as a failed result; the remaining types still run.

        Args:
            model_source: Parsed description of the source model.
            f16_model_path: Path to the F16 GGUF used as quantisation input.
            imatrix_path: Optional importance matrix passed to the engine.
            output_repo: Target repository identifier.

        Returns:
            dict[QuantisationType, QuantisationResult]: Quantisation results for each type.
        """
        results: dict[QuantisationType, QuantisationResult] = {}

        quantisation_types = self.get_quantisation_types()
        types_list = [qt.value for qt in quantisation_types]
        logger.info(f"Processing {len(quantisation_types)} quantisation types: {types_list}")

        # Check architecture support upfront so K-quants can be skipped without
        # attempting (and failing) each one.
        if self._check_architecture_support(f16_model_path):
            logger.warning("⚠️ Architecture not supported by llama.cpp - K-quants will be skipped")
            logger.info("💡 Basic types (Q4_0, Q5_0, Q6_0, Q8_0) will still be generated")
            self._mark_k_quants_skipped(quantisation_types, results)

        # Track F16 in results for status display (if we converted from SafeTensors)
        if not model_source.is_gguf_repo:
            # Get F16 file size
            f16_size = "-"
            if f16_model_path.exists():
                size_bytes = f16_model_path.stat().st_size
                size_gb = size_bytes / (1024**3)
                f16_size = f"{size_gb:.1f}GB"

            # A lightweight ad hoc object (not a QuantisationResult) is used
            # for F16 tracking and stored under the QuantisationType.F16 key.
            # NOTE(review): the original comment said F16 is not in the enum,
            # yet QuantisationType.F16 is referenced here - confirm.
            f16_result = type(
                "F16Result",
                (),
                {
                    "quantisation_type": "F16",
                    "success": True,
                    "status": "planned",
                    "file_path": f16_model_path,
                    "file_size": f16_size,
                },
            )()
            results[QuantisationType.F16] = f16_result

        # Process with parallel uploads - quantise sequentially but upload in background
        upload_futures: list[Any] = []

        with ThreadPoolExecutor(max_workers=2, thread_name_prefix="upload") as upload_executor:
            # Start F16 upload first if we have one
            if (
                not model_source.is_gguf_repo
                and not self.no_upload
                and QuantisationType.F16 in results
            ):
                f16_result = results[QuantisationType.F16]
                if f16_result.file_path and f16_result.file_path.exists():
                    logger.info("Starting parallel upload of F16 GGUF...")
                    f16_result.status = "uploading"
                    self._update_readme_status(model_source, results, output_repo)

                    upload_future = upload_executor.submit(
                        self._upload_f16_and_cleanup,
                        output_repo,
                        f16_result.file_path,
                        model_source,
                        results,
                    )
                    upload_futures.append(upload_future)

            for i, quant_type in enumerate(quantisation_types, 1):
                # Skip if already marked as failed (e.g., K-quants for unsupported arch)
                if quant_type in results and results[quant_type].status == "failed":
                    logger.info(
                        f"Skipping {quant_type.value} - {results[quant_type].error_message}"
                    )
                    continue

                logger.info(
                    f"Processing quantisation {i}/{len(quantisation_types)}: {quant_type.value}"
                )
                logger.debug(f"DEBUG: Starting quantisation {i}/{len(quantisation_types)}")
                logger.debug(f"DEBUG: Current type: {quant_type.value}")
                logger.debug(f"DEBUG: Results so far: {len(results)} completed")

                try:
                    result = self._process_single_quantisation(
                        quant_type,
                        model_source,
                        f16_model_path,
                        imatrix_path,
                        output_repo,
                        results,
                        upload_executor,
                        upload_futures,
                    )
                    results[quant_type] = result
                    logger.debug(f"DEBUG: Quantisation {quant_type.value} completed")

                    # Check if this failed due to unsupported architecture
                    if (
                        not result.success
                        and hasattr(self.quantisation_engine.executor, "last_error")
                        and self.quantisation_engine.executor.last_error
                        == "unsupported_architecture"
                    ):
                        logger.warning(
                            "⚠️  Architecture not supported by llama.cpp - K-quants will be skipped"
                        )
                        logger.info(
                            "💡 Basic types (Q4_0, Q5_0, Q6_0, Q8_0) will still be generated"
                        )
                        # Update the current result to also show as skipped
                        result.error_message = "Architecture not supported by llama.cpp"
                        # Mark the remaining K-quants (i is 1-based, so [i:] is
                        # everything after the current type). Basic types are
                        # left alone - they can still use GGML.
                        self._mark_k_quants_skipped(quantisation_types[i:], results)
                        self._update_readme_status(model_source, results, output_repo)

                    # Force cleanup between quantisations
                    gc.collect()
                    logger.debug("DEBUG: Garbage collection completed")

                except Exception as e:
                    logger.error(f"❌ Critical error processing {quant_type.value}: {e}")
                    logger.error("Exception traceback:")
                    for line in traceback.format_exc().splitlines():
                        logger.error(f"  {line}")
                    results[quant_type] = QuantisationResult(
                        quantisation_type=quant_type,
                        success=False,
                        status="failed",
                        error_message=str(e),
                    )

                    # Force cleanup after error
                    gc.collect()

            # Wait for all uploads to complete before returning
            self._wait_for_uploads(upload_futures)

            # Final README update to ensure all statuses are accurate
            if not self.no_upload and upload_futures:
                logger.info("Updating README with final status...")
                final_readme = self.readme_generator.generate(
                    model_source, results, self.models_dir, output_repo
                )
                self.uploader.upload_readme(output_repo, final_readme)

        return results

    def _process_single_quantisation(
        self,
        quant_type: QuantisationType,
        model_source: ModelSource,
        f16_model_path: Path,
        imatrix_path: Path | None,
        output_repo: str,
        results: dict[QuantisationType, QuantisationResult],
        upload_executor: ThreadPoolExecutor,
        upload_futures: list,
    ) -> QuantisationResult:
        """Process a single quantisation type.

        A "processing" placeholder is published in ``results`` while the
        engine runs. The engine's own result then replaces the placeholder
        *before* the result is handled, so the README refresh and the
        background upload worker (which updates ``results[quant_type]``) both
        act on the real result object rather than on the stale placeholder.

        Args:
            quant_type: Quantisation type to produce.
            model_source: Parsed description of the source model.
            f16_model_path: Path to the F16 GGUF used as input.
            imatrix_path: Optional importance matrix passed to the engine.
            output_repo: Target repository identifier.
            results: Shared status map; mutated in place.
            upload_executor: Pool used for background uploads.
            upload_futures: List that receives the submitted upload future.

        Returns:
            QuantisationResult: Result of the quantisation attempt. Any
            exception is converted into a failed result.
        """
        try:
            logger.info(f"Starting {quant_type.value} quantisation...")
            logger.debug(f"DEBUG: Getting config for {quant_type.value}")
            config = QUANTISATION_CONFIGS[quant_type]
            logger.debug(f"DEBUG: Config loaded: {config.name}")

            # Update status to processing
            logger.debug("DEBUG: Creating initial quantisation result")
            result = QuantisationResult(quantisation_type=quant_type, success=False)
            result.status = "processing"
            results[quant_type] = result

            logger.debug("DEBUG: Updating README status")
            self._update_readme_status(model_source, results, output_repo)

            # Perform quantisation
            logger.debug("DEBUG: Creating quantisation context")
            context = QuantisationContext(
                f16_model_path=f16_model_path,
                model_source=model_source,
                config=config,
                models_dir=self.models_dir,
                imatrix_path=imatrix_path,
            )
            logger.debug(f"DEBUG: Context created. F16 path: {f16_model_path}")
            logger.debug(f"DEBUG: imatrix path: {imatrix_path}")
            logger.debug("DEBUG: Calling quantisation engine...")
            result = self.quantisation_engine.quantise(context)
            logger.debug(f"DEBUG: Quantisation engine returned: success={result.success}")

            # Replace the placeholder so status changes made from here on
            # (including by the upload thread) land on the returned object.
            results[quant_type] = result

            self._handle_quantisation_result(
                result,
                quant_type,
                model_source,
                results,
                output_repo,
                upload_executor,
                upload_futures,
            )
        except Exception as e:
            return self._handle_quantisation_error(
                e, quant_type, model_source, results, output_repo
            )
        else:
            return result

    def _process_single_quantisation_sequential(
        self,
        quant_type: QuantisationType,
        model_source: ModelSource,
        f16_model_path: Path,
        imatrix_path: Path | None,
        output_repo: str,
        results: dict[QuantisationType, QuantisationResult],
    ) -> QuantisationResult:
        """Process a single quantisation type sequentially with immediate upload.

        The engine's result (or the failed result built after an exception)
        is stored in ``results`` before any README refresh, so the published
        status reflects the actual outcome instead of the "processing"
        placeholder. The process diagnostics logged up front are best-effort
        and never abort the quantisation.

        Args:
            quant_type: Quantisation type to produce.
            model_source: Parsed description of the source model.
            f16_model_path: Path to the F16 GGUF used as input.
            imatrix_path: Optional importance matrix passed to the engine.
            output_repo: Target repository identifier.
            results: Shared status map; mutated in place.

        Returns:
            QuantisationResult: Result of the quantisation attempt.
        """
        # Force cleanup before starting new quantisation
        gc.collect()

        # Log system state before quantisation. psutil calls can fail (e.g.
        # access denied when listing open files); that must not stop the run.
        try:
            process = psutil.Process()
            logger.debug(f"DEBUG: === System state before {quant_type.value} ===")
            logger.debug(f"DEBUG: Process alive: {process.is_running()}")
            logger.debug(f"DEBUG: PID: {process.pid}")
            logger.debug(f"DEBUG: Memory: {process.memory_info().rss / (1024**3):.2f} GB")
            logger.debug(f"DEBUG: CPU percent: {process.cpu_percent()}%")
            logger.debug(f"DEBUG: Threads: {process.num_threads()}")
            logger.debug(f"DEBUG: Open files: {len(process.open_files())}")
        except (psutil.Error, OSError) as diagnostics_error:
            logger.debug(f"DEBUG: Could not collect system state: {diagnostics_error}")

        try:
            logger.info(f"Starting {quant_type.value} quantisation...")
            logger.debug(f"DEBUG: Getting config for {quant_type.value}")
            config = QUANTISATION_CONFIGS[quant_type]
            logger.debug(f"DEBUG: Config loaded: {config.name}")

            # Update status to processing
            logger.debug("DEBUG: Creating initial quantisation result")
            result = QuantisationResult(quantisation_type=quant_type, success=False)
            result.status = "processing"
            results[quant_type] = result

            logger.debug("DEBUG: Updating README status")
            self._update_readme_status(model_source, results, output_repo)

            # Perform quantisation
            logger.debug("DEBUG: Creating quantisation context")
            context = QuantisationContext(
                f16_model_path=f16_model_path,
                model_source=model_source,
                config=config,
                models_dir=self.models_dir,
                imatrix_path=imatrix_path,
            )
            logger.debug(f"DEBUG: Context created. F16 path: {f16_model_path}")
            logger.debug(f"DEBUG: imatrix path: {imatrix_path}")
            logger.debug("DEBUG: Calling quantisation engine...")
            result = self.quantisation_engine.quantise(context)
            logger.debug(f"DEBUG: Quantisation engine returned: success={result.success}")

            # Replace the placeholder so the README updates below report the
            # status of the object that is actually being modified.
            results[quant_type] = result

            if result.success and result.file_path:
                # Upload immediately (if not in no-upload mode)
                if not self.no_upload:
                    logger.info(f"Uploading {quant_type.value}...")
                    try:
                        self.uploader.upload_model_file(output_repo, result.file_path)
                        logger.info(f"Upload of {quant_type.value} completed successfully")

                        # Clean up file after successful upload
                        logger.info(f"Removing {result.file_path.name} to save disk space...")
                        result.file_path.unlink()

                        result.status = "completed"
                        self._update_readme_status(model_source, results, output_repo)
                    except Exception as upload_error:
                        logger.error(f"Failed to upload {quant_type.value}: {upload_error}")
                        result.status = "failed"
                        result.error_message = str(upload_error)
                        self._update_readme_status(model_source, results, output_repo)
                        # Keep file if upload failed
                else:
                    # No upload mode - just mark as completed
                    result.status = "completed"
                    logger.info(f"Skipping upload of {quant_type.value} (--no-upload specified)")
            else:
                result.status = "failed"
                self._update_readme_status(model_source, results, output_repo)
        except Exception as e:
            logger.error(f"Error processing {quant_type.value}: {e}")
            result = QuantisationResult(quantisation_type=quant_type, success=False)
            result.status = "failed"
            result.error_message = str(e)
            # Publish the failure before refreshing the README.
            results[quant_type] = result

            try:
                self._update_readme_status(model_source, results, output_repo)
            except Exception as readme_error:
                logger.error(f"Failed to update README after error: {readme_error}")
            # Force cleanup after error
            gc.collect()
            return result
        else:
            # Force cleanup after quantisation
            gc.collect()
            return result

    def _handle_quantisation_result(
        self,
        result: QuantisationResult,
        quant_type: QuantisationType,
        model_source: ModelSource,
        results: dict[QuantisationType, QuantisationResult],
        output_repo: str,
        upload_executor: ThreadPoolExecutor,
        upload_futures: list,
    ) -> None:
        """Handle successful or failed quantisation result.

        A successful result with a file is queued for background upload and
        marked "uploading"; anything else is marked "failed". The README
        status is refreshed in both cases.

        ``result`` is registered in ``results`` first, and the "uploading"
        status is set *before* the upload is submitted: the upload worker
        writes its terminal status to ``results[quant_type]``, so it must see
        this object and must not have its status overwritten afterwards.

        Args:
            result: Result returned by the quantisation engine.
            quant_type: Quantisation type the result belongs to.
            model_source: Parsed description of the source model.
            results: Shared status map; mutated in place.
            output_repo: Target repository identifier.
            upload_executor: Pool used for background uploads.
            upload_futures: List that receives the submitted upload future.
        """
        results[quant_type] = result

        if result.success and result.file_path:
            quant_str = getattr(result.quantisation_type, "value", result.quantisation_type)
            logger.info(f"Starting parallel upload of {quant_str}...")
            result.status = "uploading"
            upload_future = upload_executor.submit(
                self._upload_and_cleanup,
                output_repo,
                result.file_path,
                quant_type,
                model_source,
                results,
            )
            upload_futures.append(upload_future)
            result.file_path = None  # Mark as being uploaded
        else:
            result.status = "failed"

        self._update_readme_status(model_source, results, output_repo)

    def _handle_quantisation_error(
        self,
        error: Exception,
        quant_type: QuantisationType,
        model_source: ModelSource,
        results: dict[QuantisationType, QuantisationResult],
        output_repo: str,
    ) -> QuantisationResult:
        """Handle quantisation processing error.

        Builds a failed result carrying the error text, records it in
        ``results`` so the README refresh shows the failure rather than a
        stale entry, and then attempts the refresh. A failure of the refresh
        itself is only logged.

        Args:
            error: Exception raised while processing the quantisation.
            quant_type: Quantisation type that failed.
            model_source: Parsed description of the source model.
            results: Shared status map; mutated in place.
            output_repo: Target repository identifier.

        Returns:
            QuantisationResult: Failed quantisation result with error information.
        """
        logger.error(f"Error processing {quant_type.value}: {error}")
        result = QuantisationResult(quantisation_type=quant_type, success=False)
        result.status = "failed"
        result.error_message = str(error)
        results[quant_type] = result

        try:
            self._update_readme_status(model_source, results, output_repo)
        except Exception as readme_error:
            logger.error(f"Failed to update README after error: {readme_error}")

        return result

    def _update_readme_status(
        self,
        model_source: ModelSource,
        results: dict[QuantisationType, QuantisationResult],
        output_repo: str,
    ) -> None:
        """Update README with current quantisation status using rate limiting."""
        if not self.no_upload:
            # Use rate limiter to batch updates
            self.readme_limiter.request_update(
                self._do_readme_update,
                model_source,
                results,
                output_repo,
            )

    def _do_readme_update(
        self,
        model_source: ModelSource,
        results: dict[QuantisationType, QuantisationResult],
        output_repo: str,
    ) -> None:
        """Regenerate the README from ``results`` and upload it (rate-limiter callback).

        Args:
            model_source: Parsed description of the source model.
            results: Current status map to render.
            output_repo: Target repository identifier.
        """
        self.uploader.upload_readme(
            output_repo,
            self.readme_generator.generate(model_source, results, self.models_dir, output_repo),
        )

    def _wait_for_uploads(self, upload_futures: list, timeout: float | None = 300) -> None:
        """Wait for all parallel uploads to complete.

        Each future is awaited in submission order. A wait that times out or
        raises is counted as failed and logged; a timeout gets an explicit
        message because the timeout exception carries no text of its own.
        Timing out does not cancel the upload.

        Args:
            upload_futures: Futures returned by the upload executor.
            timeout: Seconds to wait per upload (default 5 minutes);
                ``None`` waits indefinitely.
        """
        # Local import: on Python < 3.11 this is a distinct class from the builtin.
        from concurrent.futures import TimeoutError as FutureTimeoutError

        if not upload_futures:
            return

        total = len(upload_futures)
        logger.info(f"Waiting for {total} uploads to complete...")
        completed = 0
        failed = 0

        for future in upload_futures:
            try:
                future.result(timeout=timeout)
            except FutureTimeoutError:
                failed += 1
                logger.warning(
                    f"Upload error ({completed + failed}/{total}): "
                    f"timed out after {timeout}s (upload may still be running)"
                )
            except Exception as e:
                failed += 1
                logger.warning(f"Upload error ({completed + failed}/{total}): {e}")
            else:
                completed += 1
                logger.info(f"Upload progress: {completed}/{total} completed")

        if failed > 0:
            logger.warning(f"Upload summary: {completed} succeeded, {failed} failed")
        else:
            logger.info(f"All {completed} uploads completed successfully")

    def _cleanup_files(self, f16_model_path: Path, model_source: ModelSource) -> None:
        """Delete the F16 model and, for converted models, the original weight files.

        Args:
            f16_model_path: F16 GGUF to remove if it still exists.
            model_source: Parsed description of the source model; the original
                weights are only cleaned up when it is not a GGUF repository.
        """
        f16_present = f16_model_path.exists()
        if f16_present:
            logger.info(f"Removing F16 model {f16_model_path.name} to save disk space...")
            f16_model_path.unlink()

        if model_source.is_gguf_repo:
            return

        self._cleanup_original_model(model_source)

    def _cleanup_original_model(self, model_source: ModelSource) -> None:
        """Remove the original PyTorch weight files from the model directory.

        Only files matching ``pytorch_model*.bin`` are deleted; everything
        else in the directory is left in place.

        Args:
            model_source: Parsed description of the source model; its
                ``model_name`` selects the directory under ``models_dir``.
        """
        model_dir = self.models_dir / model_source.model_name
        stale_weights = [*model_dir.glob("pytorch_model*.bin")]

        if stale_weights:
            logger.info(f"Removing {len(stale_weights)} PyTorch model files to save disk space...")
            for weight_file in stale_weights:
                weight_file.unlink()

        logger.info("Keeping config files, tokeniser, and metadata for reference")

    def _upload_and_cleanup(
        self,
        output_repo: str,
        file_path: Path,
        quant_type: QuantisationType,
        model_source: ModelSource,
        results: dict[QuantisationType, QuantisationResult],
    ) -> None:
        """Upload file and clean up (runs in background thread).

        Only a failure of the model upload itself marks the quantisation as
        "failed". Once the upload has succeeded, a problem deleting the local
        file or refreshing the README is logged but does not turn the
        completed upload into a failure. Exceptions are never re-raised so
        other uploads can continue.

        Args:
            output_repo: Target repository identifier.
            file_path: Quantised model file to upload and then delete.
            quant_type: Quantisation type whose entry in ``results`` is updated.
            model_source: Parsed description of the source model.
            results: Shared status map; the entry for ``quant_type`` is mutated.
        """
        label = quant_type.value

        try:
            logger.info(f"[PARALLEL] Starting upload of {label} ({file_path.name})")
            self.uploader.upload_model_file(output_repo, file_path)
            logger.info(f"[PARALLEL] Upload of {label} completed successfully")
        except Exception as e:
            logger.error(f"[PARALLEL] Failed to upload {label}: {e}")
            results[quant_type].status = "failed"
            results[quant_type].error_message = str(e)
            uploaded = False
        else:
            uploaded = True
            try:
                logger.info(f"[PARALLEL] Removing {file_path.name} to save disk space...")
                file_path.unlink()
            except OSError as cleanup_error:
                # The upload succeeded; a leftover local file is not a failure.
                logger.warning(f"[PARALLEL] Could not remove {file_path.name}: {cleanup_error}")
            results[quant_type].status = "completed"

        try:
            updated_readme_path = self.readme_generator.generate(
                model_source, results, self.models_dir, output_repo
            )
            self.uploader.upload_readme(output_repo, updated_readme_path)
        except Exception as readme_error:
            if uploaded:
                logger.error(
                    f"[PARALLEL] Failed to update README after upload of {label}: {readme_error}"
                )
            else:
                logger.error(
                    f"[PARALLEL] Failed to update README after upload error: {readme_error}"
                )
        # Don't re-raise - let other uploads continue

        if uploaded:
            logger.info(f"[PARALLEL] {label} upload and cleanup complete")

    def _upload_f16_and_cleanup(
        self,
        output_repo: str,
        file_path: Path,
        model_source: ModelSource,
        results: dict[QuantisationType, QuantisationResult],
    ) -> None:
        """Upload the F16 GGUF file and refresh the README.

        Runs in a background thread. Failures are recorded on the
        ``QuantisationType.F16`` entry in ``results`` and logged rather than
        raised, so that other uploads can continue.

        The local F16 file is not deleted here. The README refresh is handled
        separately from the upload itself: a README failure is logged but never
        changes the status of an F16 file that was uploaded successfully.

        Args:
            output_repo: Repository identifier passed to the uploader.
            file_path: Local F16 GGUF file to upload.
            model_source: Source model details passed to the README generator.
            results: Per-type results; the F16 entry has its status (and, on
                failure, its error message) set here.
        """
        try:
            logger.info(f"[PARALLEL] Starting upload of F16 GGUF ({file_path.name})")
            self.uploader.upload_model_file(output_repo, file_path)
            logger.info("[PARALLEL] Upload of F16 GGUF completed successfully")

            # Don't delete F16 yet - we still need it for quantisations
            # It will be deleted in _cleanup_files after all quantisations complete
        except Exception as e:
            logger.error(f"[PARALLEL] Failed to upload F16: {e}")
            results[QuantisationType.F16].status = "failed"
            results[QuantisationType.F16].error_message = str(e)
            uploaded = False
        else:
            results[QuantisationType.F16].status = "completed"
            uploaded = True

        # Refresh the README with whichever status was recorded above. This is
        # deliberately outside the upload ``try`` so that a README problem
        # cannot overwrite a "completed" status with "failed".
        try:
            updated_readme_path = self.readme_generator.generate(
                model_source, results, self.models_dir, output_repo
            )
            self.uploader.upload_readme(output_repo, updated_readme_path)
        except Exception as readme_error:
            if uploaded:
                logger.error(
                    f"[PARALLEL] Failed to update README after F16 upload: {readme_error}"
                )
            else:
                logger.error(
                    f"[PARALLEL] Failed to update README after F16 upload error: {readme_error}"
                )
            # Don't re-raise - let other uploads continue

        if uploaded:
            logger.info("[PARALLEL] F16 upload complete")

    def _print_model_info(self, model_source: ModelSource) -> None:
        """Log the key details of the model about to be processed.

        Emits, at info level, the source URL, source model, original author and
        model name from ``model_source``, followed by the uploader's username
        and this orchestrator's working directory.

        Args:
            model_source: Source model details to report.
        """
        emit = logger.info

        # Details taken from the model source.
        emit(f"Source URL: {model_source.url}")
        emit(f"Source model: {model_source.source_model}")
        emit(f"Original author: {model_source.original_author}")
        emit(f"Model name: {model_source.model_name}")

        # Details taken from the orchestrator itself.
        emit(f"Your HF username: {self.uploader.get_username()}")
        emit(f"Working directory: {self.work_dir}")

    def _print_completion_summary(
        self,
        model_source: ModelSource,
        results: dict[QuantisationType, QuantisationResult],
        output_repo: str,
    ) -> None:
        """Log a final summary of the quantisation run.

        When at least one result reports success, logs the repository URL, the
        source details and one line per successful result that has a file
        size. When no result succeeded, logs an error pointing at the
        repository instead.

        Args:
            model_source: Source model details included in the summary.
            results: Per-type results; only entries with a truthy ``success``
                are listed.
            output_repo: Repository identifier used to build the Hugging Face URL.
        """
        succeeded = [outcome for outcome in results.values() if outcome.success]

        if not succeeded:
            logger.error(
                "All quantisations failed - repository created with documentation "
                "but no model files"
            )
            logger.error(f"   Repository: https://huggingface.co/{output_repo}")
            return

        logger.info("Complete! Your quantised models are available at:")
        logger.info(f"   https://huggingface.co/{output_repo}")
        logger.info("Model info:")
        logger.info(f"   - Source URL: {model_source.url}")
        logger.info(f"   - Original: {model_source.source_model}")

        if model_source.is_gguf_repo:
            method = "Direct GGUF download"
        else:
            method = "HF model conversion"
        logger.info(f"   - Method: {method}")
        logger.info(f"   - Quantised: {output_repo}")

        for outcome in succeeded:
            # Results without a recorded file size are not listed.
            if not outcome.file_size:
                continue

            author_and_model = f"{model_source.original_author}-{model_source.model_name}"
            gguf_name = f"{author_and_model}-{outcome.quantisation_type}.gguf"
            logger.info(f"   - {outcome.quantisation_type}: {gguf_name} ({outcome.file_size})")