llm-gguf-tools/helpers/quantisation/executor.py

"""Quantisation execution management.
Handles the execution of quantisation operations including parallel
uploads, status tracking, and error handling.
"""
from __future__ import annotations
import gc
import traceback
from concurrent.futures import ThreadPoolExecutor
from typing import TYPE_CHECKING, Any
from helpers.config.quantisation_configs import QUANTISATION_CONFIGS
from helpers.logger import logger
from helpers.models.quantisation import (
QuantisationContext,
QuantisationResult,
QuantisationType,
)
from helpers.quantisation.progress import ProgressReporter
from helpers.utils.rate_limiter import ReadmeRateLimiter
if TYPE_CHECKING:
from pathlib import Path
from helpers.filesystem import FileCleanup
from helpers.huggingface import HuggingFaceUploader
from helpers.models.quantisation import ModelSource
from helpers.quantisation.engine import QuantisationEngine
from helpers.readme import ReadmeGenerator
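

# A minimal usage sketch (hypothetical wiring; the constructor arguments for
# each service live in their respective helpers modules):
#
#     executor = QuantisationExecutor(
#         quantisation_engine=engine,
#         uploader=uploader,
#         readme_generator=readme_generator,
#         file_cleanup=file_cleanup,
#     )
#     results = executor.execute_quantisations(
#         model_source=source,
#         f16_model_path=f16_path,
#         imatrix_path=None,
#         output_repo="user/model-GGUF",
#         quantisation_types=list(QUANTISATION_CONFIGS),
#         models_dir=models_dir,
#     )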
class QuantisationExecutor:
    """Executes quantisation operations with parallel upload support.

    Manages the execution of multiple quantisations with background
    uploads, status tracking, and proper error handling.
    """

    def __init__(
        self,
        quantisation_engine: QuantisationEngine,
        uploader: HuggingFaceUploader,
        readme_generator: ReadmeGenerator,
        file_cleanup: FileCleanup,
        no_upload: bool = False,
    ) -> None:
        """Initialise quantisation executor.

        Sets up the quantisation executor with all required service dependencies
        for performing quantisations, uploading results, generating documentation,
        and cleaning up temporary files. Configures upload behaviour based on settings.
        """
        self.quantisation_engine = quantisation_engine
        self.uploader = uploader
        self.readme_generator = readme_generator
        self.file_cleanup = file_cleanup
        self.no_upload = no_upload
        self.readme_limiter = ReadmeRateLimiter(cooldown_seconds=30.0)
        self.progress_reporter = ProgressReporter()

    def execute_quantisations(
        self,
        model_source: ModelSource,
        f16_model_path: Path,
        imatrix_path: Path | None,
        output_repo: str,
        quantisation_types: list[QuantisationType],
        models_dir: Path,
    ) -> dict[QuantisationType, QuantisationResult]:
        """Execute all quantisation types with parallel uploads.

        Orchestrates the complete quantisation workflow including F16 processing,
        multiple quantisation type execution, parallel upload management, and
        README generation. Handles all aspects of the quantisation pipeline
        from initial setup through final documentation.

        Returns:
            Dictionary of quantisation results by type.
        """
        results: dict[QuantisationType, QuantisationResult] = {}

        # Track F16 in results if we converted from SafeTensors
        if not model_source.is_gguf_repo:
            results[QuantisationType.F16] = self._create_f16_result(f16_model_path)

        # Process with parallel uploads
        upload_futures: list[Any] = []
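        # Two upload workers let a finished artefact upload in the background
        # while the main thread carries on with the next quantisation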
        with ThreadPoolExecutor(max_workers=2, thread_name_prefix="upload") as upload_executor:
            # Start F16 upload if applicable
            if (
                not model_source.is_gguf_repo
                and not self.no_upload
                and QuantisationType.F16 in results
            ):
                self._start_f16_upload(
                    results,
                    model_source,
                    output_repo,
                    f16_model_path,
                    upload_executor,
                    upload_futures,
                )

            # Process each quantisation
            for i, quant_type in enumerate(quantisation_types, 1):
                # Skip if already marked as failed
                if quant_type in results and results[quant_type].status == "failed":
                    logger.info(
                        f"Skipping {quant_type.value} - {results[quant_type].error_message}"
                    )
                    continue

                self.progress_reporter.print_quantisation_start(
                    i, len(quantisation_types), quant_type.value
                )

                try:
                    result = self._process_single_quantisation(
                        quant_type,
                        model_source,
                        f16_model_path,
                        imatrix_path,
                        output_repo,
                        results,
                        models_dir,
                        upload_executor,
                        upload_futures,
                    )
                    results[quant_type] = result
                    # Force cleanup between quantisations
                    gc.collect()
                except Exception as e:
                    logger.error(f"❌ Critical error processing {quant_type.value}: {e}")
                    logger.error("Exception traceback:")
                    for line in traceback.format_exc().splitlines():
                        logger.error(f" {line}")
                    results[quant_type] = QuantisationResult(
                        quantisation_type=quant_type,
                        success=False,
                        status="failed",
                        error_message=str(e),
                    )
                    # Force cleanup after error
                    gc.collect()

            # Wait for all uploads to complete
            self._wait_for_uploads(upload_futures)

        # Final README update
        if not self.no_upload and upload_futures:
            self._final_readme_update(model_source, results, models_dir, output_repo)

        return results

    def _process_single_quantisation(
        self,
        quant_type: QuantisationType,
        model_source: ModelSource,
        f16_model_path: Path,
        imatrix_path: Path | None,
        output_repo: str,
        results: dict[QuantisationType, QuantisationResult],
        models_dir: Path,
        upload_executor: ThreadPoolExecutor,
        upload_futures: list,
    ) -> QuantisationResult:
        """Process a single quantisation type.

        Returns:
            QuantisationResult for the processed type.
        """
        try:
            logger.info(f"Starting {quant_type.value} quantisation...")
            config = QUANTISATION_CONFIGS[quant_type]

            # Create initial result and update status
            result = QuantisationResult(quantisation_type=quant_type, success=False)
            result.status = "processing"
            results[quant_type] = result
            self._update_readme_status(model_source, results, models_dir, output_repo)

            # Perform quantisation
            context = QuantisationContext(
                f16_model_path=f16_model_path,
                model_source=model_source,
                config=config,
                models_dir=models_dir,
                imatrix_path=imatrix_path,
            )
            result = self.quantisation_engine.quantise(context)
            # Point the shared results dict at the fresh result object so the
            # README updates below reflect the real outcome rather than the
            # stale "processing" placeholder
            results[quant_type] = result

            # Handle result
            if result.success and result.file_path:
                self._start_parallel_upload(
                    result,
                    quant_type,
                    output_repo,
                    model_source,
                    results,
                    models_dir,
                    upload_executor,
                    upload_futures,
                )
            else:
                result.status = "failed"
                self._update_readme_status(model_source, results, models_dir, output_repo)
        except Exception as e:
            logger.error(f"Error processing {quant_type.value}: {e}")
            result = QuantisationResult(quantisation_type=quant_type, success=False)
            result.status = "failed"
            result.error_message = str(e)
            results[quant_type] = result
            try:
                self._update_readme_status(model_source, results, models_dir, output_repo)
            except Exception as readme_error:
                logger.error(f"Failed to update README after error: {readme_error}")

        return result

    def _start_parallel_upload(
        self,
        result: QuantisationResult,
        quant_type: QuantisationType,
        output_repo: str,
        model_source: ModelSource,
        results: dict[QuantisationType, QuantisationResult],
        models_dir: Path,
        upload_executor: ThreadPoolExecutor,
        upload_futures: list,
    ) -> None:
        """Start parallel upload of quantisation result."""
        if self.no_upload or not result.file_path:
            return
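
        # Tolerate both enum and plain-string quantisation types when logging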
        quant_str = getattr(result.quantisation_type, "value", result.quantisation_type)
        logger.info(f"Starting parallel upload of {quant_str}...")
        upload_future = upload_executor.submit(
            self._upload_and_cleanup,
            output_repo,
            result.file_path,
            quant_type,
            model_source,
            results,
            models_dir,
        )
        upload_futures.append(upload_future)
        result.file_path = None  # Mark as being uploaded
        result.status = "uploading"
        self._update_readme_status(model_source, results, models_dir, output_repo)

    def _upload_and_cleanup(
        self,
        output_repo: str,
        file_path: Path,
        quant_type: QuantisationType,
        model_source: ModelSource,
        results: dict[QuantisationType, QuantisationResult],
        models_dir: Path,
    ) -> None:
        """Upload file and clean up (runs in background thread)."""
        try:
            logger.info(f"[PARALLEL] Starting upload of {quant_type.value} ({file_path.name})")
            self.uploader.upload_model_file(output_repo, file_path)
            logger.info(f"[PARALLEL] Upload of {quant_type.value} completed successfully")

            self.file_cleanup.cleanup_quantisation_file(file_path)
            results[quant_type].status = "completed"

            updated_readme_path = self.readme_generator.generate(
                model_source, results, models_dir, output_repo
            )
            self.uploader.upload_readme(output_repo, updated_readme_path)
            logger.info(f"[PARALLEL] {quant_type.value} upload and cleanup complete")
        except Exception as e:
            logger.error(f"[PARALLEL] Failed to upload {quant_type.value}: {e}")
            results[quant_type].status = "failed"
            results[quant_type].error_message = str(e)
            try:
                updated_readme_path = self.readme_generator.generate(
                    model_source, results, models_dir, output_repo
                )
                self.uploader.upload_readme(output_repo, updated_readme_path)
            except Exception as readme_error:
                logger.error(
                    f"[PARALLEL] Failed to update README after upload error: {readme_error}"
                )

    def _start_f16_upload(
        self,
        results: dict[QuantisationType, QuantisationResult],
        model_source: ModelSource,
        output_repo: str,
        f16_model_path: Path,
        upload_executor: ThreadPoolExecutor,
        upload_futures: list,
    ) -> None:
        """Start F16 upload in background."""
        f16_result = results[QuantisationType.F16]
        if f16_result.file_path and f16_result.file_path.exists():
            logger.info("Starting parallel upload of F16 GGUF...")
            f16_result.status = "uploading"
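            # f16_model_path is assumed to sit at models_dir/<model>/<file>.gguf,
            # so parent.parent recovers models_dir for the README generator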
            self._update_readme_status(
                model_source, results, f16_model_path.parent.parent, output_repo
            )
            upload_future = upload_executor.submit(
                self._upload_f16_and_cleanup,
                output_repo,
                f16_result.file_path,
                model_source,
                results,
                f16_model_path.parent.parent,
            )
            upload_futures.append(upload_future)

    def _upload_f16_and_cleanup(
        self,
        output_repo: str,
        file_path: Path,
        model_source: ModelSource,
        results: dict[QuantisationType, QuantisationResult],
        models_dir: Path,
    ) -> None:
        """Upload F16 file and update status (runs in background thread)."""
        try:
            logger.info(f"[PARALLEL] Starting upload of F16 GGUF ({file_path.name})")
            self.uploader.upload_model_file(output_repo, file_path)
            logger.info("[PARALLEL] Upload of F16 GGUF completed successfully")

            # Don't delete F16 yet - still needed for quantisations
            results[QuantisationType.F16].status = "completed"

            updated_readme_path = self.readme_generator.generate(
                model_source, results, models_dir, output_repo
            )
            self.uploader.upload_readme(output_repo, updated_readme_path)
            logger.info("[PARALLEL] F16 upload complete")
        except Exception as e:
            logger.error(f"[PARALLEL] Failed to upload F16: {e}")
            results[QuantisationType.F16].status = "failed"
            results[QuantisationType.F16].error_message = str(e)
            try:
                updated_readme_path = self.readme_generator.generate(
                    model_source, results, models_dir, output_repo
                )
                self.uploader.upload_readme(output_repo, updated_readme_path)
            except Exception as readme_error:
                logger.error(
                    f"[PARALLEL] Failed to update README after F16 upload error: {readme_error}"
                )

    def _create_f16_result(self, f16_model_path: Path) -> QuantisationResult:
        """Create a result object for F16 tracking.

        Returns:
            QuantisationResult object for F16 tracking.
        """
        f16_size = "-"
        if f16_model_path.exists():
            size_bytes = f16_model_path.stat().st_size
            size_gb = size_bytes / (1024**3)
            f16_size = f"{size_gb:.1f}GB"

        # Create a simple result object for F16 tracking
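        # (a stand-in duck-types the attributes downstream consumers read -
        # status, file_path, file_size - which is assumed to be all the README
        # generator and upload paths need)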
        return type(
            "F16Result",
            (),
            {
                "quantisation_type": "F16",
                "success": True,
                "status": "planned",
                "file_path": f16_model_path,
                "file_size": f16_size,
            },
        )()

    def _wait_for_uploads(self, upload_futures: list) -> None:
        """Wait for all parallel uploads to complete."""
        if not upload_futures:
            return

        logger.info(f"Waiting for {len(upload_futures)} uploads to complete...")
        completed = 0
        failed = 0
        for future in upload_futures:
            try:
                future.result(timeout=300)  # 5 minute timeout per upload
                completed += 1
                logger.info(f"Upload progress: {completed}/{len(upload_futures)} completed")
            except Exception as e:
                failed += 1
                logger.warning(f"Upload error ({completed + failed}/{len(upload_futures)}): {e}")

        self.progress_reporter.print_upload_summary(completed, failed)

    def _update_readme_status(
        self,
        model_source: ModelSource,
        results: dict[QuantisationType, QuantisationResult],
        models_dir: Path,
        output_repo: str,
    ) -> None:
        """Update README with current quantisation status using rate limiting."""
        if not self.no_upload:
            # Use rate limiter to batch updates
            self.readme_limiter.request_update(
                self._do_readme_update,
                model_source,
                results,
                models_dir,
                output_repo,
            )

    def _do_readme_update(
        self,
        model_source: ModelSource,
        results: dict[QuantisationType, QuantisationResult],
        models_dir: Path,
        output_repo: str,
    ) -> None:
        """Actually perform the README update (called by rate limiter)."""
        updated_readme_path = self.readme_generator.generate(
            model_source, results, models_dir, output_repo
        )
        self.uploader.upload_readme(output_repo, updated_readme_path)

    def _final_readme_update(
        self,
        model_source: ModelSource,
        results: dict[QuantisationType, QuantisationResult],
        models_dir: Path,
        output_repo: str,
    ) -> None:
        """Perform final README update after all operations."""
        logger.info("Updating README with final status...")
        final_readme = self.readme_generator.generate(
            model_source, results, models_dir, output_repo
        )
        self.uploader.upload_readme(output_repo, final_readme)