llm-gguf-tools/helpers/services/orchestrator.py

"""Quantisation orchestration service.
High-level orchestration of the complete quantisation workflow from model
acquisition through processing to upload. Manages parallel processing,
status tracking, and cleanup operations for efficient resource utilisation.
"""
from __future__ import annotations

import gc
import signal
import sys
import traceback
from concurrent.futures import Future, ThreadPoolExecutor
from dataclasses import dataclass, field
from pathlib import Path
from typing import TYPE_CHECKING

import psutil

from helpers.config.quantisation_configs import (
    DEFAULT_QUANTISATION_TYPES,
    QUANTISATION_CONFIGS,
    SUPPORTED_QUANTISATION_TYPES,
)
from helpers.logger import logger
from helpers.models.quantisation import (
    ModelSource,
    QuantisationContext,
    QuantisationResult,
    QuantisationType,
)
from helpers.services.huggingface import ReadmeGenerator
from helpers.services.llama_cpp import IMatrixManager
from helpers.services.quantisation import HuggingFaceUploader, ModelManager, QuantisationEngine
from helpers.utils.tensor_mapping import URLParser

if TYPE_CHECKING:
    from types import FrameType


@dataclass(slots=True)
class QuantisationOrchestrator:
    """Orchestrates the complete quantisation workflow.

    Uses dataclass with slots for efficient memory usage and dependency injection
    for modular service interaction following SOLID principles.
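
    Example:
        A minimal usage sketch, assuming Hugging Face credentials are already
        configured for the uploader and the URL points at a real model repo
        (the repo name here is a placeholder):

            orchestrator = QuantisationOrchestrator(no_upload=True)
            results = orchestrator.quantise("https://huggingface.co/some-org/some-model")
            for quant_type, result in results.items():
                print(quant_type.value, result.status)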
"""

    work_dir: Path = field(default_factory=lambda: Path.cwd() / "quantisation_work")
    use_imatrix: bool = True
    no_upload: bool = False
    custom_profiles: list[str] | None = None

    # Service dependencies with factory defaults
    url_parser: URLParser = field(default_factory=URLParser)
    quantisation_engine: QuantisationEngine = field(default_factory=QuantisationEngine)
    imatrix_manager: IMatrixManager = field(default_factory=IMatrixManager)
    readme_generator: ReadmeGenerator = field(default_factory=ReadmeGenerator)
    uploader: HuggingFaceUploader = field(default_factory=HuggingFaceUploader)

    # Fields computed in __post_init__
    models_dir: Path = field(init=False)
    model_manager: ModelManager = field(init=False)

    def __post_init__(self) -> None:
        """Initialise computed fields after dataclass construction."""
        self.models_dir = self.work_dir / "models"
        self.model_manager = ModelManager(self.models_dir)

        # Set up signal handlers for graceful exit tracking
        self._setup_signal_handlers()

    def _setup_signal_handlers(self) -> None:
        """Set up signal handlers to catch unexpected exits."""

        def signal_handler(signum: int, frame: FrameType | None) -> None:
            logger.error(f"❌ Received signal {signum} ({signal.Signals(signum).name})")
            logger.error("Stack trace at signal:")
            if frame:
                for line in traceback.format_stack(frame):
                    logger.error(f"  {line.strip()}")
            logger.error("Exiting due to signal")
            sys.exit(1)

        # Handle common termination signals
        for sig in [signal.SIGINT, signal.SIGTERM]:
            signal.signal(sig, signal_handler)

    def get_quantisation_types(self) -> list[QuantisationType]:
        """Get the quantisation types to use for this run.

        Custom profiles are validated against the supported set; if none are
        given, or none of them are valid, the default profile list is used.

        Returns:
            List of QuantisationType enums to process.
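
        Example:
            With ``custom_profiles=["q4_k_m", "bogus"]`` this would return
            ``[QuantisationType.Q4_K_M]`` (assuming Q4_K_M is among the
            supported types), logging a warning for the unrecognised name.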
"""
if self.custom_profiles:
# Parse custom profiles from strings to QuantisationType
result = []
for profile_str in self.custom_profiles:
try:
profile = QuantisationType(profile_str.upper())
if profile in SUPPORTED_QUANTISATION_TYPES:
result.append(profile)
else:
logger.warning(f"Profile {profile_str} is not supported, skipping")
except ValueError:
logger.warning(f"Invalid profile {profile_str}, skipping")
return result or DEFAULT_QUANTISATION_TYPES
return DEFAULT_QUANTISATION_TYPES

    def quantise(self, url: str) -> dict[QuantisationType, QuantisationResult]:
        """Main quantisation workflow orchestrating model processing from URL to upload.

        Returns:
            dict[QuantisationType, QuantisationResult]: Quantisation results for each type.

        Raises:
            KeyboardInterrupt: If the user interrupts the quantisation process.
        """
        logger.info("Starting Bartowski quantisation process...")
        logger.debug(f"DEBUG: Input URL: {url}")
        logger.debug(f"DEBUG: Working directory: {self.work_dir}")
        logger.debug(f"DEBUG: Use imatrix: {self.use_imatrix}")
        logger.debug(f"DEBUG: No upload: {self.no_upload}")
        logger.debug(f"DEBUG: Custom profiles: {self.custom_profiles}")

        try:
            # Setup and preparation
            logger.debug("DEBUG: Starting environment setup...")
            model_source, f16_model_path, imatrix_path, output_repo = self._setup_environment(url)
            logger.debug(f"DEBUG: Environment setup complete. F16 model: {f16_model_path}")

            # Create initial repository
            logger.debug("DEBUG: Creating initial repository...")
            self._create_initial_repository(model_source, output_repo)
            logger.debug("DEBUG: Initial repository created")

            # Execute all quantisations
            logger.debug("DEBUG: Starting quantisation execution...")
            results = self._execute_quantisations(
                model_source, f16_model_path, imatrix_path, output_repo
            )
            logger.debug(f"DEBUG: Quantisation execution complete. Results: {len(results)} items")

            # Cleanup
            logger.debug("DEBUG: Starting cleanup...")
            self._cleanup_files(f16_model_path, model_source)
            logger.debug("DEBUG: Cleanup complete")

            self._print_completion_summary(model_source, results, output_repo)
        except KeyboardInterrupt:
            logger.error("❌ Process interrupted by user (Ctrl+C)")
            raise
        except Exception as e:
            logger.error(f"❌ Critical error in quantisation workflow: {e}")
            logger.error("Full traceback:")
            for line in traceback.format_exc().splitlines():
                logger.error(f"  {line}")
            raise
        else:
            return results

    def _setup_environment(self, url: str) -> tuple[ModelSource, Path, Path | None, str]:
        """Set up the environment and prepare the model for quantisation.

        Returns:
            Tuple of (model_source, f16_model_path, imatrix_path, output_repo).
        """
        model_source = self.url_parser.parse(url)
        self._print_model_info(model_source)

        self.models_dir.mkdir(parents=True, exist_ok=True)
        f16_model_path = self.model_manager.prepare_model(model_source)

        imatrix_path = None
        if self.use_imatrix:
            logger.info("Checking for importance matrix (imatrix)...")
            imatrix_path = self.imatrix_manager.find_imatrix(
                self.models_dir / model_source.model_name
            )
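
        # The target repository name follows the <username>/<author>-<model>-GGUF
        # pattern used for GGUF quantisation repositories.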
        output_repo = (
            f"{self.uploader.get_username()}/"
            f"{model_source.original_author}-{model_source.model_name}-GGUF"
        )

        return model_source, f16_model_path, imatrix_path, output_repo

    def _create_initial_repository(self, model_source: ModelSource, output_repo: str) -> None:
        """Create initial repository with planned quantisations."""
        logger.info("Creating initial README with planned quantisations...")
        quantisation_types = self.get_quantisation_types()
        planned_results = {
            qt: QuantisationResult(quantisation_type=qt, success=False, status="planned")
            for qt in quantisation_types
        }
        readme_path = self.readme_generator.generate(
            model_source, planned_results, self.models_dir, output_repo
        )

        if not self.no_upload:
            logger.info("Creating repository with planned quantisations...")
            self.uploader.upload_readme(output_repo, readme_path)
        else:
            logger.info("Skipping repository creation (--no-upload specified)")

    def _execute_quantisations(
        self,
        model_source: ModelSource,
        f16_model_path: Path,
        imatrix_path: Path | None,
        output_repo: str,
    ) -> dict[QuantisationType, QuantisationResult]:
        """Execute all quantisation types with parallel uploads.

        Returns:
            dict[QuantisationType, QuantisationResult]: Quantisation results for each type.
        """
        results: dict[QuantisationType, QuantisationResult] = {}

        quantisation_types = self.get_quantisation_types()
        types_list = [qt.value for qt in quantisation_types]
        logger.info(f"Processing {len(quantisation_types)} quantisation types: {types_list}")

        # Process with parallel uploads - quantise sequentially but upload in background
        upload_futures: list[Future[None]] = []
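        # A small pool of two workers lets the next quantisation start while the
        # previous file is still uploading, without queueing up several large
        # model files in flight at once.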
        with ThreadPoolExecutor(max_workers=2, thread_name_prefix="upload") as upload_executor:
            for i, quant_type in enumerate(quantisation_types, 1):
                logger.info(
                    f"Processing quantisation {i}/{len(quantisation_types)}: {quant_type.value}"
                )
                logger.debug(f"DEBUG: Starting quantisation {i}/{len(quantisation_types)}")
                logger.debug(f"DEBUG: Current type: {quant_type.value}")
                logger.debug(f"DEBUG: Results so far: {len(results)} completed")
                try:
                    result = self._process_single_quantisation(
                        quant_type,
                        model_source,
                        f16_model_path,
                        imatrix_path,
                        output_repo,
                        results,
                        upload_executor,
                        upload_futures,
                    )
                    results[quant_type] = result
                    logger.debug(f"DEBUG: Quantisation {quant_type.value} completed")

                    # Force cleanup between quantisations
                    gc.collect()
                    logger.debug("DEBUG: Garbage collection completed")
                except Exception as e:
                    logger.error(f"❌ Critical error processing {quant_type.value}: {e}")
                    logger.error("Exception traceback:")
                    for line in traceback.format_exc().splitlines():
                        logger.error(f"  {line}")
                    results[quant_type] = QuantisationResult(
                        quantisation_type=quant_type,
                        success=False,
                        status="failed",
                        error_message=str(e),
                    )
                    # Force cleanup after error
                    gc.collect()

            # Wait for all uploads to complete before returning
            self._wait_for_uploads(upload_futures)

        return results

    def _process_single_quantisation(
        self,
        quant_type: QuantisationType,
        model_source: ModelSource,
        f16_model_path: Path,
        imatrix_path: Path | None,
        output_repo: str,
        results: dict[QuantisationType, QuantisationResult],
        upload_executor: ThreadPoolExecutor,
        upload_futures: list[Future[None]],
    ) -> QuantisationResult:
        """Process a single quantisation type.

        Successful results are handed to the upload executor so the next
        quantisation can start while the file uploads in the background.

        Returns:
            QuantisationResult: Result of the quantisation attempt.
        """
        try:
            logger.info(f"Starting {quant_type.value} quantisation...")
            logger.debug(f"DEBUG: Getting config for {quant_type.value}")
            config = QUANTISATION_CONFIGS[quant_type]
            logger.debug(f"DEBUG: Config loaded: {config.name}")

            # Update status to processing
            logger.debug("DEBUG: Creating initial quantisation result")
            result = QuantisationResult(quantisation_type=quant_type, success=False)
            result.status = "processing"
            results[quant_type] = result
            logger.debug("DEBUG: Updating README status")
            self._update_readme_status(model_source, results, output_repo)

            # Perform quantisation
            logger.debug("DEBUG: Creating quantisation context")
            context = QuantisationContext(
                f16_model_path=f16_model_path,
                model_source=model_source,
                config=config,
                models_dir=self.models_dir,
                imatrix_path=imatrix_path,
            )
            logger.debug(f"DEBUG: Context created. F16 path: {f16_model_path}")
            logger.debug(f"DEBUG: imatrix path: {imatrix_path}")
            logger.debug("DEBUG: Calling quantisation engine...")
            result = self.quantisation_engine.quantise(context)
            logger.debug(f"DEBUG: Quantisation engine returned: success={result.success}")

            self._handle_quantisation_result(
                result,
                quant_type,
                model_source,
                results,
                output_repo,
                upload_executor,
                upload_futures,
            )
        except Exception as e:
            return self._handle_quantisation_error(
                e, quant_type, model_source, results, output_repo
            )
        else:
            return result

    def _process_single_quantisation_sequential(
        self,
        quant_type: QuantisationType,
        model_source: ModelSource,
        f16_model_path: Path,
        imatrix_path: Path | None,
        output_repo: str,
        results: dict[QuantisationType, QuantisationResult],
    ) -> QuantisationResult:
        """Process a single quantisation type sequentially with immediate upload.

        Unlike _process_single_quantisation, this blocks until the upload
        finishes before returning, trading throughput for simplicity.

        Returns:
            QuantisationResult: Result of the quantisation attempt.
        """
        # Force cleanup before starting new quantisation
        gc.collect()

        # Log system state before quantisation
        process = psutil.Process()
        logger.debug(f"DEBUG: === System state before {quant_type.value} ===")
        logger.debug(f"DEBUG: Process alive: {process.is_running()}")
        logger.debug(f"DEBUG: PID: {process.pid}")
        logger.debug(f"DEBUG: Memory: {process.memory_info().rss / (1024**3):.2f} GB")
        logger.debug(f"DEBUG: CPU percent: {process.cpu_percent()}%")
        logger.debug(f"DEBUG: Threads: {process.num_threads()}")
        logger.debug(f"DEBUG: Open files: {len(process.open_files())}")
        try:
            logger.info(f"Starting {quant_type.value} quantisation...")
            logger.debug(f"DEBUG: Getting config for {quant_type.value}")
            config = QUANTISATION_CONFIGS[quant_type]
            logger.debug(f"DEBUG: Config loaded: {config.name}")

            # Update status to processing
            logger.debug("DEBUG: Creating initial quantisation result")
            result = QuantisationResult(quantisation_type=quant_type, success=False)
            result.status = "processing"
            results[quant_type] = result
            logger.debug("DEBUG: Updating README status")
            self._update_readme_status(model_source, results, output_repo)

            # Perform quantisation
            logger.debug("DEBUG: Creating quantisation context")
            context = QuantisationContext(
                f16_model_path=f16_model_path,
                model_source=model_source,
                config=config,
                models_dir=self.models_dir,
                imatrix_path=imatrix_path,
            )
            logger.debug(f"DEBUG: Context created. F16 path: {f16_model_path}")
            logger.debug(f"DEBUG: imatrix path: {imatrix_path}")
            logger.debug("DEBUG: Calling quantisation engine...")
            result = self.quantisation_engine.quantise(context)
            logger.debug(f"DEBUG: Quantisation engine returned: success={result.success}")

            if result.success and result.file_path:
                # Upload immediately (if not in no-upload mode)
                if not self.no_upload:
                    logger.info(f"Uploading {quant_type.value}...")
                    try:
                        self.uploader.upload_model_file(output_repo, result.file_path)
                        logger.info(f"Upload of {quant_type.value} completed successfully")

                        # Clean up file after successful upload
                        logger.info(f"Removing {result.file_path.name} to save disk space...")
                        result.file_path.unlink()
                        result.status = "completed"
                        self._update_readme_status(model_source, results, output_repo)
                    except Exception as upload_error:
                        logger.error(f"Failed to upload {quant_type.value}: {upload_error}")
                        result.status = "failed"
                        result.error_message = str(upload_error)
                        self._update_readme_status(model_source, results, output_repo)
                        # Keep file if upload failed
                else:
                    # No upload mode - just mark as completed
                    result.status = "completed"
                    logger.info(f"Skipping upload of {quant_type.value} (--no-upload specified)")
            else:
                result.status = "failed"
                self._update_readme_status(model_source, results, output_repo)
        except Exception as e:
            logger.error(f"Error processing {quant_type.value}: {e}")
            result = QuantisationResult(quantisation_type=quant_type, success=False)
            result.status = "failed"
            result.error_message = str(e)
            try:
                self._update_readme_status(model_source, results, output_repo)
            except Exception as readme_error:
                logger.error(f"Failed to update README after error: {readme_error}")
            # Force cleanup after error
            gc.collect()
            return result
        else:
            # Force cleanup after quantisation
            gc.collect()
            return result

    def _handle_quantisation_result(
        self,
        result: QuantisationResult,
        quant_type: QuantisationType,
        model_source: ModelSource,
        results: dict[QuantisationType, QuantisationResult],
        output_repo: str,
        upload_executor: ThreadPoolExecutor,
        upload_futures: list[Future[None]],
    ) -> None:
        """Handle successful or failed quantisation result."""
        if result.success and result.file_path:
            quant_str = getattr(result.quantisation_type, "value", result.quantisation_type)
            logger.info(f"Starting parallel upload of {quant_str}...")
            upload_future = upload_executor.submit(
                self._upload_and_cleanup,
                output_repo,
                result.file_path,
                quant_type,
                model_source,
                results,
            )
            upload_futures.append(upload_future)
            result.file_path = None  # Mark as being uploaded
            result.status = "uploading"
        else:
            result.status = "failed"

        self._update_readme_status(model_source, results, output_repo)

    def _handle_quantisation_error(
        self,
        error: Exception,
        quant_type: QuantisationType,
        model_source: ModelSource,
        results: dict[QuantisationType, QuantisationResult],
        output_repo: str,
    ) -> QuantisationResult:
        """Handle quantisation processing error.

        Returns:
            QuantisationResult: Failed quantisation result with error information.
        """
        logger.error(f"Error processing {quant_type.value}: {error}")
        result = QuantisationResult(quantisation_type=quant_type, success=False)
        result.status = "failed"
        result.error_message = str(error)
        try:
            self._update_readme_status(model_source, results, output_repo)
        except Exception as readme_error:
            logger.error(f"Failed to update README after error: {readme_error}")
        return result

    def _update_readme_status(
        self,
        model_source: ModelSource,
        results: dict[QuantisationType, QuantisationResult],
        output_repo: str,
    ) -> None:
        """Update README with current quantisation status."""
        if not self.no_upload:
            updated_readme_path = self.readme_generator.generate(
                model_source, results, self.models_dir, output_repo
            )
            self.uploader.upload_readme(output_repo, updated_readme_path)

    def _wait_for_uploads(self, upload_futures: list[Future[None]]) -> None:
        """Wait for all parallel uploads to complete."""
        logger.info("Waiting for any remaining uploads to complete...")
        for future in upload_futures:
            try:
                future.result(timeout=300)  # 5 minute timeout per upload
            except Exception as e:
                logger.warning(f"Upload error: {e}")

    def _cleanup_files(self, f16_model_path: Path, model_source: ModelSource) -> None:
        """Clean up temporary files after processing."""
        if f16_model_path.exists():
            logger.info(f"Removing F16 model {f16_model_path.name} to save disk space...")
            f16_model_path.unlink()

        if not model_source.is_gguf_repo:
            self._cleanup_original_model(model_source)

    def _cleanup_original_model(self, model_source: ModelSource) -> None:
        """Clean up original PyTorch weight files after successful conversion.

        Only pytorch_model*.bin files are removed; config files, tokeniser
        files, and metadata are kept for reference.
        """
        model_dir = self.models_dir / model_source.model_name

        pytorch_files = list(model_dir.glob("pytorch_model*.bin"))
        if pytorch_files:
            logger.info(f"Removing {len(pytorch_files)} PyTorch model files to save disk space...")
            for file in pytorch_files:
                file.unlink()

        logger.info("Keeping config files, tokeniser, and metadata for reference")

    def _upload_and_cleanup(
        self,
        output_repo: str,
        file_path: Path,
        quant_type: QuantisationType,
        model_source: ModelSource,
        results: dict[QuantisationType, QuantisationResult],
    ) -> None:
        """Upload file and clean up (runs in background thread)."""
        try:
            logger.info(f"[PARALLEL] Starting upload of {quant_type.value} ({file_path.name})")
            self.uploader.upload_model_file(output_repo, file_path)
            logger.info(f"[PARALLEL] Upload of {quant_type.value} completed successfully")

            logger.info(f"[PARALLEL] Removing {file_path.name} to save disk space...")
            file_path.unlink()

            results[quant_type].status = "completed"
            updated_readme_path = self.readme_generator.generate(
                model_source, results, self.models_dir, output_repo
            )
            self.uploader.upload_readme(output_repo, updated_readme_path)

            logger.info(f"[PARALLEL] {quant_type.value} upload and cleanup complete")
        except Exception as e:
            logger.error(f"[PARALLEL] Failed to upload {quant_type.value}: {e}")
            results[quant_type].status = "failed"
            results[quant_type].error_message = str(e)
            try:
                updated_readme_path = self.readme_generator.generate(
                    model_source, results, self.models_dir, output_repo
                )
                self.uploader.upload_readme(output_repo, updated_readme_path)
            except Exception as readme_error:
                logger.error(
                    f"[PARALLEL] Failed to update README after upload error: {readme_error}"
                )
            # Don't re-raise - let other uploads continue

    def _print_model_info(self, model_source: ModelSource) -> None:
        """Print model information."""
        logger.info(f"Source URL: {model_source.url}")
        logger.info(f"Source model: {model_source.source_model}")
        logger.info(f"Original author: {model_source.original_author}")
        logger.info(f"Model name: {model_source.model_name}")
        logger.info(f"Your HF username: {self.uploader.get_username()}")
        logger.info(f"Working directory: {self.work_dir}")

    def _print_completion_summary(
        self,
        model_source: ModelSource,
        results: dict[QuantisationType, QuantisationResult],
        output_repo: str,
    ) -> None:
        """Print completion summary."""
        successful_results = [r for r in results.values() if r.success]
        if successful_results:
            logger.info("Complete! Your quantised models are available at:")
            logger.info(f"  https://huggingface.co/{output_repo}")
            logger.info("Model info:")
            logger.info(f"  - Source URL: {model_source.url}")
            logger.info(f"  - Original: {model_source.source_model}")
            logger.info(
                "  - Method: "
                f"{'Direct GGUF download' if model_source.is_gguf_repo else 'HF model conversion'}"
            )
            logger.info(f"  - Quantised: {output_repo}")
            for result in successful_results:
                if result.file_size:
                    filename = (
                        f"{model_source.original_author}-{model_source.model_name}-"
                        f"{result.quantisation_type}.gguf"
                    )
                    logger.info(f"  - {result.quantisation_type}: {filename} ({result.file_size})")
        else:
            logger.error(
                "All quantisations failed - repository created with documentation "
                "but no model files"
            )
            logger.error(f"  Repository: https://huggingface.co/{output_repo}")