"""Quantisation orchestration service.

High-level orchestration of the complete quantisation workflow from model
acquisition through processing to upload. Manages parallel processing,
status tracking, and cleanup operations for efficient resource utilisation.
"""

from __future__ import annotations

from concurrent.futures import Future, ThreadPoolExecutor
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any

from helpers.config.quantisation_configs import QUANTISATION_CONFIGS, SUPPORTED_QUANTISATION_TYPES
from helpers.logger import logger
from helpers.models.quantisation import (
    ModelSource,
    QuantisationContext,
    QuantisationResult,
    QuantisationType,
)
from helpers.services.huggingface import ReadmeGenerator
from helpers.services.llama_cpp import EnvironmentManager, IMatrixGenerator
from helpers.services.quantisation import HuggingFaceUploader, ModelManager, QuantisationEngine
from helpers.utils.tensor_mapping import URLParser


@dataclass(slots=True)
class QuantisationOrchestrator:
    """Orchestrates the complete quantisation workflow.

    Uses a dataclass with slots and injectable service dependencies: every
    collaborator (URL parser, quantisation engine, imatrix generator, README
    generator, uploader) is a field with a factory default and can be replaced
    at construction time.

    Attributes set by the caller:
        work_dir: Root working directory; models live in ``work_dir / "models"``.
        use_imatrix: When true, an importance matrix is generated before quantising.
        imatrix_base: Passed to the quantisation context as ``base_quant``.
        no_upload: When true, nothing is uploaded (README or model files) and
            quantised files are left on disk.
    """

    work_dir: Path = field(default_factory=lambda: Path.cwd() / "quantisation_work")
    use_imatrix: bool = True
    imatrix_base: str = "Q4_K_M"
    no_upload: bool = False

    # Service dependencies with factory defaults
    url_parser: URLParser = field(default_factory=URLParser)
    quantisation_engine: QuantisationEngine = field(default_factory=QuantisationEngine)
    imatrix_generator: IMatrixGenerator = field(default_factory=IMatrixGenerator)
    readme_generator: ReadmeGenerator = field(default_factory=ReadmeGenerator)
    uploader: HuggingFaceUploader = field(default_factory=HuggingFaceUploader)

    # Computed properties (populated in __post_init__, not constructor arguments)
    models_dir: Path = field(init=False)
    environment_manager: EnvironmentManager = field(init=False)
    model_manager: ModelManager = field(init=False)

    def __post_init__(self) -> None:
        """Initialise computed properties after dataclass construction."""
        self.models_dir = self.work_dir / "models"
        self.environment_manager = EnvironmentManager(self.work_dir)
        self.model_manager = ModelManager(self.models_dir, self.environment_manager)

    def quantise(self, url: str) -> dict[QuantisationType, QuantisationResult]:
        """Main quantisation workflow orchestrating model processing from URL to upload.

        Runs, in order: environment/model setup, initial repository creation,
        every supported quantisation, temporary-file cleanup, and the final
        summary log.

        Args:
            url: Source model URL, handed to the configured URL parser.

        Returns:
            dict[QuantisationType, QuantisationResult]: Quantisation results for each type.
        """
        logger.info("Starting Bartowski quantisation process...")

        # Setup and preparation
        model_source, llama_env, f16_model_path, imatrix_path, output_repo = (
            self._setup_environment(url)
        )

        # Create initial repository
        self._create_initial_repository(model_source, output_repo)

        # Execute all quantisations
        results = self._execute_quantisations(
            model_source, llama_env, f16_model_path, imatrix_path, output_repo
        )

        # Cleanup
        self._cleanup_files(f16_model_path, model_source)

        self._print_completion_summary(model_source, results, output_repo)
        return results

    def _setup_environment(self, url: str) -> tuple[ModelSource, Any, Path, Path | None, str]:
        """Setup environment and prepare model for quantisation.

        Args:
            url: Source model URL to parse.

        Returns:
            Tuple of (model_source, llama_env, f16_model_path, imatrix_path, output_repo).
            ``imatrix_path`` is ``None`` when ``use_imatrix`` is false.
        """
        model_source = self.url_parser.parse(url)
        self._print_model_info(model_source)

        self.models_dir.mkdir(parents=True, exist_ok=True)
        llama_env = self.environment_manager.setup()

        f16_model_path = self.model_manager.prepare_model(model_source, llama_env)

        imatrix_path = None
        if self.use_imatrix:
            logger.info("Generating importance matrix (imatrix)...")
            imatrix_path = self.imatrix_generator.generate_imatrix(
                f16_model_path, llama_env, self.models_dir / model_source.model_name
            )

        # Target repository: <uploader username>/<original author>-<model name>-GGUF
        output_repo = (
            f"{self.uploader.get_username()}/"
            f"{model_source.original_author}-{model_source.model_name}-GGUF"
        )

        return model_source, llama_env, f16_model_path, imatrix_path, output_repo

    def _create_initial_repository(self, model_source: ModelSource, output_repo: str) -> None:
        """Create initial repository with planned quantisations.

        A README listing every supported quantisation type as ``"planned"`` is
        always generated; it is only uploaded when ``no_upload`` is false.
        """
        logger.info("Creating initial README with planned quantisations...")
        planned_results = {
            qt: QuantisationResult(quantisation_type=qt, success=False, status="planned")
            for qt in SUPPORTED_QUANTISATION_TYPES
        }
        readme_path = self.readme_generator.generate(
            model_source, planned_results, self.models_dir, output_repo
        )

        if not self.no_upload:
            logger.info("Creating repository with planned quantisations...")
            self.uploader.upload_readme(output_repo, readme_path)
        else:
            logger.info("Skipping repository creation (--no-upload specified)")

    def _execute_quantisations(
        self,
        model_source: ModelSource,
        llama_env: Any,
        f16_model_path: Path,
        imatrix_path: Path | None,
        output_repo: str,
    ) -> dict[QuantisationType, QuantisationResult]:
        """Execute all quantisation types with parallel uploads.

        Quantisations run sequentially on the calling thread; uploads are handed
        to a single-worker thread pool so the next quantisation can start while
        the previous file is uploading.

        Returns:
            dict[QuantisationType, QuantisationResult]: Quantisation results for each type.
        """
        results: dict[QuantisationType, QuantisationResult] = {}
        upload_futures: list[Future[None]] = []

        with ThreadPoolExecutor(max_workers=1, thread_name_prefix="uploader") as upload_executor:
            for quant_type in SUPPORTED_QUANTISATION_TYPES:
                result = self._process_single_quantisation(
                    quant_type,
                    model_source,
                    llama_env,
                    f16_model_path,
                    imatrix_path,
                    output_repo,
                    results,
                    upload_executor,
                    upload_futures,
                )
                results[quant_type] = result

            self._wait_for_uploads(upload_futures)

        return results

    def _process_single_quantisation(
        self,
        quant_type: QuantisationType,
        model_source: ModelSource,
        llama_env: Any,
        f16_model_path: Path,
        imatrix_path: Path | None,
        output_repo: str,
        results: dict[QuantisationType, QuantisationResult],
        upload_executor: ThreadPoolExecutor,
        upload_futures: list[Future[None]],
    ) -> QuantisationResult:
        """Process a single quantisation type.

        Any exception raised while quantising or dispatching the upload is
        converted into a failed result rather than propagated.

        Returns:
            QuantisationResult: Result of the quantisation attempt.
        """
        try:
            logger.info(f"Starting {quant_type.value} quantisation...")
            config = QUANTISATION_CONFIGS[quant_type]

            # Update status to processing
            result = QuantisationResult(quantisation_type=quant_type, success=False)
            result.status = "processing"
            results[quant_type] = result

            self._update_readme_status(model_source, results, output_repo)

            # Perform quantisation
            context = QuantisationContext(
                f16_model_path=f16_model_path,
                model_source=model_source,
                config=config,
                llama_env=llama_env,
                models_dir=self.models_dir,
                imatrix_path=imatrix_path,
                base_quant=self.imatrix_base,
            )
            result = self.quantisation_engine.quantise(context)

            self._handle_quantisation_result(
                result,
                quant_type,
                model_source,
                results,
                output_repo,
                upload_executor,
                upload_futures,
            )
        except Exception as e:
            return self._handle_quantisation_error(
                e, quant_type, model_source, results, output_repo
            )
        else:
            return result

    def _handle_quantisation_result(
        self,
        result: QuantisationResult,
        quant_type: QuantisationType,
        model_source: ModelSource,
        results: dict[QuantisationType, QuantisationResult],
        output_repo: str,
        upload_executor: ThreadPoolExecutor,
        upload_futures: list[Future[None]],
    ) -> None:
        """Handle successful or failed quantisation result.

        Stores ``result`` in ``results``, sets its status, and (unless
        ``no_upload`` is set) schedules the background upload of the produced
        file. The README is refreshed afterwards.
        """
        # Publish the engine's result before anything reads `results`; otherwise the
        # README refresh below and the upload worker would see the stale
        # "processing" placeholder instead of this object.
        results[quant_type] = result

        if result.success and result.file_path:
            quant_str = getattr(result.quantisation_type, "value", result.quantisation_type)
            if self.no_upload:
                # Honour --no-upload for model files too: keep the file on disk.
                logger.info(f"Skipping upload of {quant_str} (--no-upload specified)")
                result.status = "completed"
            else:
                logger.info(f"Starting parallel upload of {quant_str}...")
                file_path = result.file_path
                # Update the result *before* submitting so the worker thread's
                # "completed"/"failed" status can never be overwritten from here.
                result.file_path = None  # Mark as being uploaded
                result.status = "uploading"
                upload_future = upload_executor.submit(
                    self._upload_and_cleanup,
                    output_repo,
                    file_path,
                    quant_type,
                    model_source,
                    results,
                )
                upload_futures.append(upload_future)
        else:
            result.status = "failed"

        self._update_readme_status(model_source, results, output_repo)

    def _handle_quantisation_error(
        self,
        error: Exception,
        quant_type: QuantisationType,
        model_source: ModelSource,
        results: dict[QuantisationType, QuantisationResult],
        output_repo: str,
    ) -> QuantisationResult:
        """Handle quantisation processing error.

        Returns:
            QuantisationResult: Failed quantisation result with error information.
        """
        logger.error(f"Error processing {quant_type.value}: {error}")
        result = QuantisationResult(quantisation_type=quant_type, success=False)
        result.status = "failed"
        result.error_message = str(error)

        # Record the failure before regenerating the README so it does not keep
        # showing this type as "processing".
        results[quant_type] = result

        try:
            self._update_readme_status(model_source, results, output_repo)
        except Exception as readme_error:
            # Best effort: a README failure must not hide the original error.
            logger.error(f"Failed to update README after error: {readme_error}")

        return result

    def _update_readme_status(
        self,
        model_source: ModelSource,
        results: dict[QuantisationType, QuantisationResult],
        output_repo: str,
    ) -> None:
        """Update README with current quantisation status.

        Does nothing when ``no_upload`` is set.
        """
        if not self.no_upload:
            updated_readme_path = self.readme_generator.generate(
                model_source, results, self.models_dir, output_repo
            )
            self.uploader.upload_readme(output_repo, updated_readme_path)

    def _wait_for_uploads(self, upload_futures: list[Future[None]]) -> None:
        """Wait for all parallel uploads to complete.

        Upload failures (including timeouts) are logged as warnings and do not
        abort the wait for the remaining futures.
        """
        logger.info("Waiting for any remaining uploads to complete...")
        for future in upload_futures:
            try:
                future.result(timeout=300)  # 5 minute timeout per upload
            except Exception as e:
                logger.warning(f"Upload error: {e}")

    def _cleanup_files(self, f16_model_path: Path, model_source: ModelSource) -> None:
        """Clean up temporary files after processing.

        Deletes the F16 model file if it exists and, for sources that are not
        GGUF repositories, the original model weight files.
        """
        if f16_model_path.exists():
            logger.info(f"Removing F16 model {f16_model_path.name} to save disk space...")
            f16_model_path.unlink()

        if not model_source.is_gguf_repo:
            self._cleanup_original_model(model_source)

    def _cleanup_original_model(self, model_source: ModelSource) -> None:
        """Remove original PyTorch weight files after conversion.

        Only files matching ``pytorch_model*.bin`` directly inside the model's
        directory are deleted; everything else in that directory is left alone.
        """
        model_dir = self.models_dir / model_source.model_name

        pytorch_files = list(model_dir.glob("pytorch_model*.bin"))
        if pytorch_files:
            logger.info(f"Removing {len(pytorch_files)} PyTorch model files to save disk space...")
            for file in pytorch_files:
                file.unlink()

        logger.info("Keeping config files, tokeniser, and metadata for reference")

    def _upload_and_cleanup(
        self,
        output_repo: str,
        file_path: Path,
        quant_type: QuantisationType,
        model_source: ModelSource,
        results: dict[QuantisationType, QuantisationResult],
    ) -> None:
        """Upload file and clean up (runs in background thread).

        On success the local file is deleted, the result is marked
        ``"completed"`` and the README is refreshed. On failure the result is
        marked ``"failed"`` with the error message, the README is refreshed on a
        best-effort basis, and the original exception is re-raised so it
        surfaces through the future.

        Raises:
            Exception: Whatever the upload, file removal or README refresh raised.
        """
        try:
            logger.info(f"[PARALLEL] Uploading {quant_type}...")
            self.uploader.upload_model_file(output_repo, file_path)

            logger.info(f"[PARALLEL] Removing {file_path.name} to save disk space...")
            file_path.unlink()

            results[quant_type].status = "completed"
            self._update_readme_status(model_source, results, output_repo)

            logger.info(f"[PARALLEL] {quant_type} upload and cleanup complete")
        except Exception as e:
            logger.error(f"[PARALLEL] Failed to upload {quant_type}: {e}")
            results[quant_type].status = "failed"
            results[quant_type].error_message = str(e)

            try:
                self._update_readme_status(model_source, results, output_repo)
            except Exception as readme_error:
                # Keep the original upload error as the one that propagates.
                logger.error(f"Failed to update README after error: {readme_error}")
            raise

    def _print_model_info(self, model_source: ModelSource) -> None:
        """Print model information."""
        logger.info(f"Source URL: {model_source.url}")
        logger.info(f"Source model: {model_source.source_model}")
        logger.info(f"Original author: {model_source.original_author}")
        logger.info(f"Model name: {model_source.model_name}")
        logger.info(f"Your HF username: {self.uploader.get_username()}")
        logger.info(f"Working directory: {self.work_dir}")

    def _print_completion_summary(
        self,
        model_source: ModelSource,
        results: dict[QuantisationType, QuantisationResult],
        output_repo: str,
    ) -> None:
        """Print completion summary.

        Logs the repository and per-file details when at least one result has
        ``success`` set; otherwise logs that every quantisation failed.
        """
        successful_results = [r for r in results.values() if r.success]

        if successful_results:
            logger.info("Complete! Your quantised models are available at:")
            logger.info(f" https://huggingface.co/{output_repo}")
            logger.info("Model info:")
            logger.info(f" - Source URL: {model_source.url}")
            logger.info(f" - Original: {model_source.source_model}")
            logger.info(
                " - Method: "
                f"{'Direct GGUF download' if model_source.is_gguf_repo else 'HF model conversion'}"
            )
            logger.info(f" - Quantised: {output_repo}")

            for result in successful_results:
                # Only results that report a file size get a per-file line.
                if result.file_size:
                    filename = (
                        f"{model_source.original_author}-{model_source.model_name}-"
                        f"{result.quantisation_type}.gguf"
                    )
                    logger.info(f" - {result.quantisation_type}: {filename} ({result.file_size})")
        else:
            logger.error(
                "All quantisations failed - repository created with documentation "
                "but no model files"
            )
            logger.error(f" Repository: https://huggingface.co/{output_repo}")