"""Quantisation orchestration service. High-level orchestration of the complete quantisation workflow from model acquisition through processing to upload. Manages parallel processing, status tracking, and cleanup operations for efficient resource utilisation. """ from __future__ import annotations import gc import signal import sys import traceback from concurrent.futures import ThreadPoolExecutor from dataclasses import dataclass, field from pathlib import Path from typing import TYPE_CHECKING import psutil from helpers.config.quantisation_configs import ( DEFAULT_QUANTISATION_TYPES, QUANTISATION_CONFIGS, SUPPORTED_QUANTISATION_TYPES, ) from helpers.logger import logger from helpers.models.quantisation import ( ModelSource, QuantisationContext, QuantisationResult, QuantisationType, ) from helpers.services.huggingface import ReadmeGenerator from helpers.services.llama_cpp import IMatrixManager from helpers.services.quantisation import HuggingFaceUploader, ModelManager, QuantisationEngine from helpers.utils.tensor_mapping import URLParser if TYPE_CHECKING: from types import FrameType @dataclass(slots=True) class QuantisationOrchestrator: """Orchestrates the complete quantisation workflow. Uses dataclass with slots for efficient memory usage and dependency injection for modular service interaction following SOLID principles. """ work_dir: Path = field(default_factory=lambda: Path.cwd() / "quantisation_work") use_imatrix: bool = True no_upload: bool = False custom_profiles: list[str] | None = None # Service dependencies with factory defaults url_parser: URLParser = field(default_factory=URLParser) quantisation_engine: QuantisationEngine = field(default_factory=QuantisationEngine) imatrix_manager: IMatrixManager = field(default_factory=IMatrixManager) readme_generator: ReadmeGenerator = field(default_factory=ReadmeGenerator) uploader: HuggingFaceUploader = field(default_factory=HuggingFaceUploader) # Computed properties models_dir: Path = field(init=False) model_manager: ModelManager = field(init=False) def __post_init__(self) -> None: """Initialise computed properties after dataclass construction.""" self.models_dir = self.work_dir / "models" self.model_manager = ModelManager(self.models_dir) # Set up signal handlers for graceful exit tracking self._setup_signal_handlers() def _setup_signal_handlers(self) -> None: """Set up signal handlers to catch unexpected exits.""" def signal_handler(signum: int, frame: FrameType | None) -> None: logger.error(f"❌ Received signal {signum} ({signal.Signals(signum).name})") logger.error("Stack trace at signal:") if frame: for line in traceback.format_stack(frame): logger.error(f" {line.strip()}") logger.error("Exiting due to signal") sys.exit(1) # Handle common termination signals for sig in [signal.SIGINT, signal.SIGTERM]: signal.signal(sig, signal_handler) def get_quantisation_types(self) -> list[QuantisationType]: """Get the quantisation types to use for this run. Returns: List of QuantisationType enums to process. 
""" if self.custom_profiles: # Parse custom profiles from strings to QuantisationType result = [] for profile_str in self.custom_profiles: try: profile = QuantisationType(profile_str.upper()) if profile in SUPPORTED_QUANTISATION_TYPES: result.append(profile) else: logger.warning(f"Profile {profile_str} is not supported, skipping") except ValueError: logger.warning(f"Invalid profile {profile_str}, skipping") return result or DEFAULT_QUANTISATION_TYPES return DEFAULT_QUANTISATION_TYPES def quantise(self, url: str) -> dict[QuantisationType, QuantisationResult]: """Main quantisation workflow orchestrating model processing from URL to upload. Returns: dict[QuantisationType, QuantisationResult]: Quantisation results for each type. Raises: KeyboardInterrupt: If the user interrupts the quantisation process. """ logger.info("Starting Bartowski quantisation process...") logger.debug(f"DEBUG: Input URL: {url}") logger.debug(f"DEBUG: Working directory: {self.work_dir}") logger.debug(f"DEBUG: Use imatrix: {self.use_imatrix}") logger.debug(f"DEBUG: No upload: {self.no_upload}") logger.debug(f"DEBUG: Custom profiles: {self.custom_profiles}") try: # Setup and preparation logger.debug("DEBUG: Starting environment setup...") model_source, f16_model_path, imatrix_path, output_repo = self._setup_environment(url) logger.debug(f"DEBUG: Environment setup complete. F16 model: {f16_model_path}") # Create initial repository logger.debug("DEBUG: Creating initial repository...") self._create_initial_repository(model_source, output_repo) logger.debug("DEBUG: Initial repository created") # Execute all quantisations logger.debug("DEBUG: Starting quantisation execution...") results = self._execute_quantisations( model_source, f16_model_path, imatrix_path, output_repo ) logger.debug(f"DEBUG: Quantisation execution complete. Results: {len(results)} items") # Cleanup logger.debug("DEBUG: Starting cleanup...") self._cleanup_files(f16_model_path, model_source) logger.debug("DEBUG: Cleanup complete") self._print_completion_summary(model_source, results, output_repo) except KeyboardInterrupt: logger.error("❌ Process interrupted by user (Ctrl+C)") raise except Exception as e: logger.error(f"❌ Critical error in quantisation workflow: {e}") logger.error("Full traceback:") for line in traceback.format_exc().splitlines(): logger.error(f" {line}") raise else: return results def _setup_environment(self, url: str) -> tuple[ModelSource, Path, Path | None, str]: """Setup environment and prepare model for quantisation. Returns: Tuple of (model_source, f16_model_path, imatrix_path, output_repo). 
""" model_source = self.url_parser.parse(url) self._print_model_info(model_source) self.models_dir.mkdir(parents=True, exist_ok=True) f16_model_path = self.model_manager.prepare_model(model_source) imatrix_path = None if self.use_imatrix: logger.info("Checking for importance matrix (imatrix)...") imatrix_path = self.imatrix_manager.find_imatrix( self.models_dir / model_source.model_name ) output_repo = ( f"{self.uploader.get_username()}/" f"{model_source.original_author}-{model_source.model_name}-GGUF" ) return model_source, f16_model_path, imatrix_path, output_repo def _create_initial_repository(self, model_source: ModelSource, output_repo: str) -> None: """Create initial repository with planned quantisations.""" logger.info("Creating initial README with planned quantisations...") quantisation_types = self.get_quantisation_types() planned_results = { qt: QuantisationResult(quantisation_type=qt, success=False, status="planned") for qt in quantisation_types } readme_path = self.readme_generator.generate( model_source, planned_results, self.models_dir, output_repo ) if not self.no_upload: logger.info("Creating repository with planned quantisations...") self.uploader.upload_readme(output_repo, readme_path) else: logger.info("Skipping repository creation (--no-upload specified)") def _execute_quantisations( self, model_source: ModelSource, f16_model_path: Path, imatrix_path: Path | None, output_repo: str, ) -> dict[QuantisationType, QuantisationResult]: """Execute all quantisation types with parallel uploads. Returns: dict[QuantisationType, QuantisationResult]: Quantisation results for each type. """ results: dict[QuantisationType, QuantisationResult] = {} quantisation_types = self.get_quantisation_types() types_list = [qt.value for qt in quantisation_types] logger.info(f"Processing {len(quantisation_types)} quantisation types: {types_list}") # Process with parallel uploads - quantise sequentially but upload in background upload_futures = [] with ThreadPoolExecutor(max_workers=2, thread_name_prefix="upload") as upload_executor: for i, quant_type in enumerate(quantisation_types, 1): logger.info( f"Processing quantisation {i}/{len(quantisation_types)}: {quant_type.value}" ) logger.debug(f"DEBUG: Starting quantisation {i}/{len(quantisation_types)}") logger.debug(f"DEBUG: Current type: {quant_type.value}") logger.debug(f"DEBUG: Results so far: {len(results)} completed") try: result = self._process_single_quantisation( quant_type, model_source, f16_model_path, imatrix_path, output_repo, results, upload_executor, upload_futures, ) results[quant_type] = result logger.debug(f"DEBUG: Quantisation {quant_type.value} completed") # Force cleanup between quantisations gc.collect() logger.debug("DEBUG: Garbage collection completed") except Exception as e: logger.error(f"❌ Critical error processing {quant_type.value}: {e}") logger.error("Exception traceback:") for line in traceback.format_exc().splitlines(): logger.error(f" {line}") results[quant_type] = QuantisationResult( quantisation_type=quant_type, success=False, status="failed", error_message=str(e), ) # Force cleanup after error gc.collect() # Wait for all uploads to complete before returning self._wait_for_uploads(upload_futures) return results def _process_single_quantisation( self, quant_type: QuantisationType, model_source: ModelSource, f16_model_path: Path, imatrix_path: Path | None, output_repo: str, results: dict[QuantisationType, QuantisationResult], upload_executor: ThreadPoolExecutor, upload_futures: list, ) -> QuantisationResult: """Process 
        """Process a single quantisation type.

        Returns:
            QuantisationResult: Result of the quantisation attempt.
        """
        try:
            logger.info(f"Starting {quant_type.value} quantisation...")
            logger.debug(f"DEBUG: Getting config for {quant_type.value}")
            config = QUANTISATION_CONFIGS[quant_type]
            logger.debug(f"DEBUG: Config loaded: {config.name}")

            # Update status to processing
            logger.debug("DEBUG: Creating initial quantisation result")
            result = QuantisationResult(quantisation_type=quant_type, success=False)
            result.status = "processing"
            results[quant_type] = result

            logger.debug("DEBUG: Updating README status")
            self._update_readme_status(model_source, results, output_repo)

            # Perform quantisation
            logger.debug("DEBUG: Creating quantisation context")
            context = QuantisationContext(
                f16_model_path=f16_model_path,
                model_source=model_source,
                config=config,
                models_dir=self.models_dir,
                imatrix_path=imatrix_path,
            )
            logger.debug(f"DEBUG: Context created. F16 path: {f16_model_path}")
            logger.debug(f"DEBUG: imatrix path: {imatrix_path}")

            logger.debug("DEBUG: Calling quantisation engine...")
            result = self.quantisation_engine.quantise(context)
            logger.debug(f"DEBUG: Quantisation engine returned: success={result.success}")

            self._handle_quantisation_result(
                result,
                quant_type,
                model_source,
                results,
                output_repo,
                upload_executor,
                upload_futures,
            )
        except Exception as e:
            return self._handle_quantisation_error(
                e, quant_type, model_source, results, output_repo
            )
        else:
            return result

    def _process_single_quantisation_sequential(
        self,
        quant_type: QuantisationType,
        model_source: ModelSource,
        f16_model_path: Path,
        imatrix_path: Path | None,
        output_repo: str,
        results: dict[QuantisationType, QuantisationResult],
    ) -> QuantisationResult:
        """Process a single quantisation type sequentially with immediate upload.

        Returns:
            QuantisationResult: Result of the quantisation attempt.
        """
        # Force cleanup before starting new quantisation
        gc.collect()

        # Log system state before quantisation
        process = psutil.Process()
        logger.debug(f"DEBUG: === System state before {quant_type.value} ===")
        logger.debug(f"DEBUG: Process alive: {process.is_running()}")
        logger.debug(f"DEBUG: PID: {process.pid}")
        logger.debug(f"DEBUG: Memory: {process.memory_info().rss / (1024**3):.2f} GB")
        logger.debug(f"DEBUG: CPU percent: {process.cpu_percent()}%")
        logger.debug(f"DEBUG: Threads: {process.num_threads()}")
        logger.debug(f"DEBUG: Open files: {len(process.open_files())}")

        try:
            logger.info(f"Starting {quant_type.value} quantisation...")
            logger.debug(f"DEBUG: Getting config for {quant_type.value}")
            config = QUANTISATION_CONFIGS[quant_type]
            logger.debug(f"DEBUG: Config loaded: {config.name}")

            # Update status to processing
            logger.debug("DEBUG: Creating initial quantisation result")
            result = QuantisationResult(quantisation_type=quant_type, success=False)
            result.status = "processing"
            results[quant_type] = result

            logger.debug("DEBUG: Updating README status")
            self._update_readme_status(model_source, results, output_repo)

            # Perform quantisation
            logger.debug("DEBUG: Creating quantisation context")
            context = QuantisationContext(
                f16_model_path=f16_model_path,
                model_source=model_source,
                config=config,
                models_dir=self.models_dir,
                imatrix_path=imatrix_path,
            )
            logger.debug(f"DEBUG: Context created. F16 path: {f16_model_path}")
            logger.debug(f"DEBUG: imatrix path: {imatrix_path}")

            logger.debug("DEBUG: Calling quantisation engine...")
            result = self.quantisation_engine.quantise(context)
            logger.debug(f"DEBUG: Quantisation engine returned: success={result.success}")

            if result.success and result.file_path:
                # Upload immediately (if not in no-upload mode)
                if not self.no_upload:
                    logger.info(f"Uploading {quant_type.value}...")
                    try:
                        self.uploader.upload_model_file(output_repo, result.file_path)
                        logger.info(f"Upload of {quant_type.value} completed successfully")

                        # Clean up file after successful upload
                        logger.info(f"Removing {result.file_path.name} to save disk space...")
                        result.file_path.unlink()

                        result.status = "completed"
                        self._update_readme_status(model_source, results, output_repo)
                    except Exception as upload_error:
                        logger.error(f"Failed to upload {quant_type.value}: {upload_error}")
                        result.status = "failed"
                        result.error_message = str(upload_error)
                        self._update_readme_status(model_source, results, output_repo)
                        # Keep file if upload failed
                else:
                    # No upload mode - just mark as completed
                    result.status = "completed"
                    logger.info(f"Skipping upload of {quant_type.value} (--no-upload specified)")
            else:
                result.status = "failed"
                self._update_readme_status(model_source, results, output_repo)
        except Exception as e:
            logger.error(f"Error processing {quant_type.value}: {e}")
            result = QuantisationResult(quantisation_type=quant_type, success=False)
            result.status = "failed"
            result.error_message = str(e)

            try:
                self._update_readme_status(model_source, results, output_repo)
            except Exception as readme_error:
                logger.error(f"Failed to update README after error: {readme_error}")

            # Force cleanup after error
            gc.collect()

            return result
        else:
            # Force cleanup after quantisation
            gc.collect()

            return result

    def _handle_quantisation_result(
        self,
        result: QuantisationResult,
        quant_type: QuantisationType,
        model_source: ModelSource,
        results: dict[QuantisationType, QuantisationResult],
        output_repo: str,
        upload_executor: ThreadPoolExecutor,
        upload_futures: list[Future[None]],
    ) -> None:
        """Handle successful or failed quantisation result."""
        if result.success and result.file_path:
            quant_str = getattr(result.quantisation_type, "value", result.quantisation_type)
            logger.info(f"Starting parallel upload of {quant_str}...")
            upload_future = upload_executor.submit(
                self._upload_and_cleanup,
                output_repo,
                result.file_path,
                quant_type,
                model_source,
                results,
            )
            upload_futures.append(upload_future)
            result.file_path = None  # Mark as being uploaded
            result.status = "uploading"
        else:
            result.status = "failed"

        self._update_readme_status(model_source, results, output_repo)

    def _handle_quantisation_error(
        self,
        error: Exception,
        quant_type: QuantisationType,
        model_source: ModelSource,
        results: dict[QuantisationType, QuantisationResult],
        output_repo: str,
    ) -> QuantisationResult:
        """Handle quantisation processing error.

        Returns:
            QuantisationResult: Failed quantisation result with error information.
""" logger.error(f"Error processing {quant_type.value}: {error}") result = QuantisationResult(quantisation_type=quant_type, success=False) result.status = "failed" result.error_message = str(error) try: self._update_readme_status(model_source, results, output_repo) except Exception as readme_error: logger.error(f"Failed to update README after error: {readme_error}") return result def _update_readme_status( self, model_source: ModelSource, results: dict[QuantisationType, QuantisationResult], output_repo: str, ) -> None: """Update README with current quantisation status.""" if not self.no_upload: updated_readme_path = self.readme_generator.generate( model_source, results, self.models_dir, output_repo ) self.uploader.upload_readme(output_repo, updated_readme_path) def _wait_for_uploads(self, upload_futures: list) -> None: """Wait for all parallel uploads to complete.""" logger.info("Waiting for any remaining uploads to complete...") for future in upload_futures: try: future.result(timeout=300) # 5 minute timeout per upload except Exception as e: logger.warning(f"Upload error: {e}") def _cleanup_files(self, f16_model_path: Path, model_source: ModelSource) -> None: """Clean up temporary files after processing.""" if f16_model_path.exists(): logger.info(f"Removing F16 model {f16_model_path.name} to save disk space...") f16_model_path.unlink() if not model_source.is_gguf_repo: self._cleanup_original_model(model_source) def _cleanup_original_model(self, model_source: ModelSource) -> None: """Clean up original safetensors/PyTorch files after successful conversion.""" model_dir = self.models_dir / model_source.model_name pytorch_files = list(model_dir.glob("pytorch_model*.bin")) if pytorch_files: logger.info(f"Removing {len(pytorch_files)} PyTorch model files to save disk space...") for file in pytorch_files: file.unlink() logger.info("Keeping config files, tokeniser, and metadata for reference") def _upload_and_cleanup( self, output_repo: str, file_path: Path, quant_type: QuantisationType, model_source: ModelSource, results: dict[QuantisationType, QuantisationResult], ) -> None: """Upload file and clean up (runs in background thread).""" try: logger.info(f"[PARALLEL] Starting upload of {quant_type.value} ({file_path.name})") self.uploader.upload_model_file(output_repo, file_path) logger.info(f"[PARALLEL] Upload of {quant_type.value} completed successfully") logger.info(f"[PARALLEL] Removing {file_path.name} to save disk space...") file_path.unlink() results[quant_type].status = "completed" updated_readme_path = self.readme_generator.generate( model_source, results, self.models_dir, output_repo ) self.uploader.upload_readme(output_repo, updated_readme_path) logger.info(f"[PARALLEL] {quant_type} upload and cleanup complete") except Exception as e: logger.error(f"[PARALLEL] Failed to upload {quant_type}: {e}") results[quant_type].status = "failed" results[quant_type].error_message = str(e) try: updated_readme_path = self.readme_generator.generate( model_source, results, self.models_dir, output_repo ) self.uploader.upload_readme(output_repo, updated_readme_path) except Exception as readme_error: logger.error( f"[PARALLEL] Failed to update README after upload error: {readme_error}" ) # Don't re-raise - let other uploads continue def _print_model_info(self, model_source: ModelSource) -> None: """Print model information.""" logger.info(f"Source URL: {model_source.url}") logger.info(f"Source model: {model_source.source_model}") logger.info(f"Original author: {model_source.original_author}") 
logger.info(f"Model name: {model_source.model_name}") logger.info(f"Your HF username: {self.uploader.get_username()}") logger.info(f"Working directory: {self.work_dir}") def _print_completion_summary( self, model_source: ModelSource, results: dict[QuantisationType, QuantisationResult], output_repo: str, ) -> None: """Print completion summary.""" successful_results = [r for r in results.values() if r.success] if successful_results: logger.info("Complete! Your quantised models are available at:") logger.info(f" https://huggingface.co/{output_repo}") logger.info("Model info:") logger.info(f" - Source URL: {model_source.url}") logger.info(f" - Original: {model_source.source_model}") logger.info( " - Method: " f"{'Direct GGUF download' if model_source.is_gguf_repo else 'HF model conversion'}" ) logger.info(f" - Quantised: {output_repo}") for result in successful_results: if result.file_size: filename = ( f"{model_source.original_author}-{model_source.model_name}-" f"{result.quantisation_type}.gguf" ) logger.info(f" - {result.quantisation_type}: {filename} ({result.file_size})") else: logger.error( "All quantisations failed - repository created with documentation " "but no model files" ) logger.error(f" Repository: https://huggingface.co/{output_repo}")