"""Quantisation operations service. Provides modular quantisation engine, model management, and upload capabilities for GGUF model processing. Consolidates quantisation logic from various tools into reusable components following SOLID principles. """ from __future__ import annotations import shutil import subprocess from typing import TYPE_CHECKING from helpers.logger import logger from helpers.models.quantisation import ( ModelSource, QuantisationContext, QuantisationResult, QuantisationType, ) from helpers.services.filesystem import FilesystemService if TYPE_CHECKING: from pathlib import Path from helpers.models.quantisation import LlamaCppEnvironment from helpers.services.llama_cpp import EnvironmentManager class QuantisationEngine: """Handles the actual quantisation process with configurable methods. Provides flexible quantisation execution supporting multiple tensor precision configurations, importance matrices, and fallback strategies. Encapsulates llama-quantize binary interactions with real-time output. """ def __init__(self) -> None: """Initialise quantisation engine.""" self.fs = FilesystemService() def quantise(self, context: QuantisationContext) -> QuantisationResult: """Perform quantisation using the specified configuration. Executes quantisation with primary and fallback methods, handling tensor-specific precision overrides and importance matrix guidance. Returns: QuantisationResult with success status and file information. """ logger.info( f"⚙️ Creating {context.config.name} quantisation ({context.config.description})..." ) output_path = context.get_output_path() logger.info(f"🎯 Attempting {context.config.name} quantisation...") logger.info(f"📝 Source: {context.f16_model_path}") logger.info(f"📝 Target: {output_path}") # Try primary method if self._try_quantisation_method( context, output_path, context.config.tensor_types, "method 1" ): return self._create_success_result(context.config.name, output_path, "method 1") # Try fallback methods for i, fallback_method in enumerate(context.config.fallback_methods, 2): method_name = f"method {i}" if self._try_quantisation_method(context, output_path, fallback_method, method_name): return self._create_success_result(context.config.name, output_path, method_name) logger.error("All %s quantisation methods failed", context.config.name) return QuantisationResult( quantisation_type=QuantisationType(context.config.name), success=False, error_message="All quantisation methods failed", ) def _try_quantisation_method( self, context: QuantisationContext, output_path: Path, tensor_config: dict[str, str], method_name: str, ) -> bool: """Try a specific quantisation method with real-time output. Builds and executes llama-quantize command with appropriate parameters, streaming output for progress monitoring. Returns: True if quantisation successful, False otherwise. """ logger.info(f"🔍 Trying {method_name}...") cmd = self._build_quantisation_command(context, output_path, tensor_config) return self._execute_quantisation_command(cmd, method_name) def _build_quantisation_command( self, context: QuantisationContext, output_path: Path, tensor_config: dict[str, str] ) -> list[str]: """Build quantisation command with all required parameters. Returns: List of command arguments. 
""" cmd = [str(context.llama_env.quantise_binary)] # Add imatrix if available if context.imatrix_path and context.imatrix_path.exists(): cmd.extend(["--imatrix", str(context.imatrix_path)]) logger.info(f"🧮 Using imatrix: {context.imatrix_path.name}") # Add tensor type arguments self._add_tensor_type_arguments(cmd, tensor_config) cmd.extend([str(context.f16_model_path), str(output_path), context.base_quant]) return cmd def _add_tensor_type_arguments(self, cmd: list[str], tensor_config: dict[str, str]) -> None: """Add tensor type arguments to command.""" if not tensor_config: return for tensor_name, quant_type in tensor_config.items(): if tensor_name.startswith(("token-embedding-type", "output-tensor-type")): cmd.extend([f"--{tensor_name}", quant_type]) else: cmd.extend(["--tensor-type", f"{tensor_name}={quant_type}"]) def _execute_quantisation_command(self, cmd: list[str], method_name: str) -> bool: """Execute quantisation command with real-time output. Returns: True if quantisation successful, False otherwise. """ logger.info(f"💻 Running: {' '.join(cmd)}") logger.info("⏳ Quantisation in progress... (this may take several minutes)") try: process = subprocess.Popen( cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True, bufsize=1, ) self._stream_quantisation_output(process) return_code = process.poll() if return_code == 0: logger.info(f"✅ {method_name} quantisation successful!") return True except Exception as e: logger.info(f"❌ {method_name} failed with exception: {e}") return False else: logger.info(f"❌ {method_name} failed with return code {return_code}") return False def _stream_quantisation_output(self, process: subprocess.Popen) -> None: """Stream quantisation output in real-time.""" while True: if process.stdout is not None: output = process.stdout.readline() else: break if not output and process.poll() is not None: break if output: logger.info(f"📊 {output.strip()}") def _create_success_result( self, quant_type: str, output_path: Path, method_used: str ) -> QuantisationResult: """Create successful quantisation result with file metadata. Returns: QuantisationResult with file path and size information. """ file_size = self.fs.get_file_size(output_path) return QuantisationResult( quantisation_type=QuantisationType(quant_type), success=True, file_path=output_path, file_size=file_size, method_used=method_used, ) class ModelManager: """Handles model downloading and preparation for quantisation. Manages both GGUF repository downloads and HuggingFace model conversions, providing unified interface for model acquisition and preparation. """ def __init__(self, models_dir: Path, environment_manager: EnvironmentManager) -> None: """Initialise model manager with storage and environment configuration. Sets up model storage directory and links to environment manager for conversion script access and llama.cpp tool discovery. """ self.models_dir = models_dir self.environment_manager = environment_manager self.fs = FilesystemService() def prepare_model(self, model_source: ModelSource, llama_env: LlamaCppEnvironment) -> Path: """Prepare model for quantisation and return F16 model path. Handles both GGUF repository downloads and regular HuggingFace model conversion workflows with automatic format detection. Returns: Path to F16 GGUF model ready for quantisation. 
""" model_dir = self.models_dir / model_source.model_name if model_source.is_gguf_repo: return self._handle_gguf_repo(model_source, model_dir) return self._handle_regular_repo(model_source, model_dir, llama_env) def _handle_gguf_repo(self, model_source: ModelSource, model_dir: Path) -> Path: """Handle GGUF repository download with pattern matching. Downloads GGUF files matching specified patterns, prioritising multi-part files and F16 variants. Returns: Path to downloaded or existing GGUF file. """ logger.info(f"⬇️ Downloading GGUF file from repository: {model_source.source_model}") logger.info(f"🔍 Looking for file pattern: *{model_source.gguf_file_pattern}*") f16_model = model_dir / f"{model_source.model_name}-f16.gguf" if f16_model.exists(): logger.info(f"✅ Found existing F16 file: {f16_model.name}") return f16_model # Check for existing GGUF files model_dir.mkdir(parents=True, exist_ok=True) existing_gguf = self.fs.find_gguf_files(model_dir) if existing_gguf: logger.info(f"✅ Found existing GGUF file: {existing_gguf[0].name}") return existing_gguf[0] # Download with patterns downloaded_file = self._download_gguf_with_patterns( model_source.source_model, model_source.gguf_file_pattern, model_dir ) if downloaded_file: # Handle multi-part files if "00001-of-" in downloaded_file.name: return downloaded_file if "-00002-of-" in downloaded_file.name or "-00003-of-" in downloaded_file.name: base_name = downloaded_file.name.replace("-00002-of-", "-00001-of-").replace( "-00003-of-", "-00001-of-" ) first_part = downloaded_file.parent / base_name if first_part.exists(): logger.info(f"🔄 Using first part: {first_part.name}") return first_part # Rename single file to standard name downloaded_file.rename(f16_model) return f16_model # Fallback to regular conversion logger.info("💡 Falling back to downloading full repository and converting...") return self._handle_regular_repo( ModelSource(**{**model_source.dict(), "is_gguf_repo": False}), model_dir, None, ) def _download_gguf_with_patterns( self, source_model: str, pattern: str | None, model_dir: Path ) -> Path | None: """Download GGUF file using various pattern strategies. Tries multiple pattern variations to find and download appropriate GGUF files, handling timeouts and temporary directories. Returns: Path to downloaded file, or None if all patterns fail. """ if pattern: patterns = [ f"*{pattern}*", f"*{pattern.lower()}*", f"*{pattern.upper()}*", "*f16*", "*F16*", "*fp16*", ] else: patterns = ["*f16*", "*F16*", "*fp16*"] temp_dir = model_dir / "gguf_temp" for search_pattern in patterns: logger.info(f"🔍 Trying pattern: {search_pattern}") temp_dir.mkdir(exist_ok=True) try: subprocess.run( [ "timeout", "300", "huggingface-cli", "download", source_model, "--include", search_pattern, "--local-dir", str(temp_dir), ], check=True, capture_output=True, ) # Find downloaded GGUF files gguf_files = self.fs.find_gguf_files(temp_dir, pattern) if gguf_files: found_file = gguf_files[0] logger.info(f"✅ Found GGUF file: {found_file.name}") # Move to parent directory final_path = model_dir / found_file.name shutil.move(str(found_file), str(final_path)) shutil.rmtree(temp_dir) return final_path except subprocess.CalledProcessError: logger.info(f"⚠️ Pattern {search_pattern} failed or timed out") continue finally: if temp_dir.exists(): shutil.rmtree(temp_dir, ignore_errors=True) return None def _handle_regular_repo( self, model_source: ModelSource, model_dir: Path, llama_env: LlamaCppEnvironment | None, ) -> Path: """Handle regular HuggingFace repository conversion. 

    def _handle_regular_repo(
        self,
        model_source: ModelSource,
        model_dir: Path,
        llama_env: LlamaCppEnvironment | None,
    ) -> Path:
        """Handle regular HuggingFace repository conversion.

        Downloads full model repository and converts to F16 GGUF format
        using llama.cpp conversion scripts.

        Returns:
            Path to converted F16 GGUF model.
        """
        logger.info(f"⬇️ Downloading source model: {model_source.source_model}")

        if not model_dir.exists():
            subprocess.run(
                [
                    "huggingface-cli",
                    "download",
                    model_source.source_model,
                    "--local-dir",
                    str(model_dir),
                ],
                check=True,
            )
        else:
            logger.info("✅ Model already downloaded")

        logger.info("🔄 Converting to GGUF F16 format...")
        f16_model = model_dir / f"{model_source.model_name}-f16.gguf"

        if not f16_model.exists():
            if not llama_env:
                llama_env = self.environment_manager.setup()

            # Ensure conversion script is available
            if llama_env.use_repo or not self.environment_manager.llama_cpp_dir.exists():
                logger.info("Getting conversion script from llama.cpp repository...")
                llama_env = self.environment_manager.setup_repository()

            subprocess.run(
                [
                    *llama_env.convert_script.split(),
                    str(model_dir),
                    "--outtype",
                    "f16",
                    "--outfile",
                    str(f16_model),
                ],
                check=True,
            )
        else:
            logger.info("✅ F16 model already exists")

        return f16_model


class HuggingFaceUploader:
    """Handles uploading models and documentation to HuggingFace.

    Provides methods for repository creation, file uploads, and README
    updates with proper error handling and retry logic.
    """

    @staticmethod
    def get_username() -> str:
        """Get authenticated HuggingFace username.

        Returns:
            HuggingFace username from CLI authentication.

        Raises:
            RuntimeError: If not authenticated.
        """
        try:
            result = subprocess.run(
                ["huggingface-cli", "whoami"],
                capture_output=True,
                text=True,
                check=True,
            )
            return result.stdout.strip()
        except (subprocess.CalledProcessError, FileNotFoundError) as err:
            msg = "Please log in to HuggingFace first: huggingface-cli login"
            raise RuntimeError(msg) from err

    def upload_readme(self, output_repo: str, readme_path: Path) -> None:
        """Upload or update README file to repository.

        Creates repository if needed, handles existing repository updates.
        """
        logger.info("Uploading README...")
        try:
            subprocess.run(
                [
                    "huggingface-cli",
                    "upload",
                    output_repo,
                    str(readme_path),
                    "README.md",
                    "--create",
                ],
                check=True,
                capture_output=True,
            )
            logger.info("README uploaded")
        except subprocess.CalledProcessError:
            # Repository exists, update without --create
            subprocess.run(
                [
                    "huggingface-cli",
                    "upload",
                    output_repo,
                    str(readme_path),
                    "README.md",
                ],
                check=True,
            )
            logger.info("README updated")

    def upload_model_file(self, output_repo: str, model_path: Path) -> None:
        """Upload model file to repository.

        Uploads GGUF model file to specified repository path.
        """
        logger.info(f"Uploading {model_path.name}...")
        subprocess.run(
            [
                "huggingface-cli",
                "upload",
                output_repo,
                str(model_path),
                model_path.name,
            ],
            check=True,
        )
        logger.info(f"{model_path.name} uploaded")
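

# ---------------------------------------------------------------------------
# Minimal wiring sketch. Not part of the service API: it only shows how the
# three services above compose. The ModelSource, LlamaCppEnvironment,
# EnvironmentManager and QuantisationContext instances are assumed to be
# constructed elsewhere; the function name and parameter names below are
# illustrative assumptions, not a confirmed entry point.
# ---------------------------------------------------------------------------
def _example_quantise_and_upload(
    models_dir: Path,
    environment_manager: EnvironmentManager,
    model_source: ModelSource,
    llama_env: LlamaCppEnvironment,
    context: QuantisationContext,
    output_repo: str,
) -> None:
    """Sketch of a single download -> quantise -> upload pass (illustrative)."""
    manager = ModelManager(models_dir, environment_manager)
    manager.prepare_model(model_source, llama_env)  # download/convert to F16 GGUF

    result = QuantisationEngine().quantise(context)  # context references the F16 model
    if result.success and result.file_path is not None:
        HuggingFaceUploader().upload_model_file(output_repo, result.file_path)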