397 lines
15 KiB
Python
397 lines
15 KiB
Python
"""Quantisation orchestration service.
|
|
|
|
High-level orchestration of the complete quantisation workflow from model
|
|
acquisition through processing to upload. Manages parallel processing,
|
|
status tracking, and cleanup operations for efficient resource utilisation.
|
|
"""
|
|
|
|
from __future__ import annotations

from concurrent.futures import Future, ThreadPoolExecutor
from concurrent.futures import TimeoutError as FuturesTimeoutError
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any

from helpers.config.quantisation_configs import QUANTISATION_CONFIGS, SUPPORTED_QUANTISATION_TYPES
from helpers.logger import logger
from helpers.models.quantisation import (
    ModelSource,
    QuantisationContext,
    QuantisationResult,
    QuantisationType,
)
from helpers.services.huggingface import ReadmeGenerator
from helpers.services.llama_cpp import EnvironmentManager, IMatrixGenerator
from helpers.services.quantisation import HuggingFaceUploader, ModelManager, QuantisationEngine
from helpers.utils.tensor_mapping import URLParser
|
|
|
|
|
|
@dataclass(slots=True)
class QuantisationOrchestrator:
    """Orchestrates the complete quantisation workflow.

    Uses dataclass with slots for efficient memory usage and dependency injection
    for modular service interaction following SOLID principles.

    ``quantise`` is the single public entry point. It parses the model URL,
    prepares the environment and the F16 model, optionally generates an
    importance matrix, publishes a README listing the planned quantisations,
    runs every type in ``SUPPORTED_QUANTISATION_TYPES`` sequentially while
    finished files are uploaded on one background thread, and finally removes
    intermediate files.

    NOTE(review): the ``results`` dict is shared between the main thread and
    the upload thread without locking; README generation on one thread may
    overlap with a status change on the other.
    """

    # Root directory for all working files; models live in ``work_dir / "models"``.
    work_dir: Path = field(default_factory=lambda: Path.cwd() / "quantisation_work")
    # When true, an importance matrix is generated before quantising.
    use_imatrix: bool = True
    # Forwarded to every QuantisationContext as ``base_quant``.
    imatrix_base: str = "Q4_K_M"
    # When true, neither README nor model files are uploaded.
    no_upload: bool = False

    # Service dependencies with factory defaults
    url_parser: URLParser = field(default_factory=URLParser)
    quantisation_engine: QuantisationEngine = field(default_factory=QuantisationEngine)
    imatrix_generator: IMatrixGenerator = field(default_factory=IMatrixGenerator)
    readme_generator: ReadmeGenerator = field(default_factory=ReadmeGenerator)
    uploader: HuggingFaceUploader = field(default_factory=HuggingFaceUploader)

    # Computed properties (populated in __post_init__, not accepted by __init__)
    models_dir: Path = field(init=False)
    environment_manager: EnvironmentManager = field(init=False)
    model_manager: ModelManager = field(init=False)

    def __post_init__(self) -> None:
        """Initialise computed properties after dataclass construction."""
        self.models_dir = self.work_dir / "models"
        self.environment_manager = EnvironmentManager(self.work_dir)
        self.model_manager = ModelManager(self.models_dir, self.environment_manager)

    @staticmethod
    def _quant_label(quant_type: QuantisationType | str) -> str:
        """Return the short display name of a quantisation type.

        Uses the ``value`` attribute when present (enum members) and falls back
        to the object itself, so filenames and log lines read the same no matter
        how the type formats itself inside an f-string.

        Returns:
            str: Display name of the quantisation type.
        """
        return str(getattr(quant_type, "value", quant_type))

    def quantise(self, url: str) -> dict[QuantisationType, QuantisationResult]:
        """Main quantisation workflow orchestrating model processing from URL to upload.

        Args:
            url: Model URL, parsed by the injected ``url_parser``.

        Returns:
            dict[QuantisationType, QuantisationResult]: Quantisation results for each type.
        """
        logger.info("Starting Bartowski quantisation process...")

        # Setup and preparation
        model_source, llama_env, f16_model_path, imatrix_path, output_repo = (
            self._setup_environment(url)
        )

        # Create initial repository
        self._create_initial_repository(model_source, output_repo)

        # Execute all quantisations
        results = self._execute_quantisations(
            model_source, llama_env, f16_model_path, imatrix_path, output_repo
        )

        # Cleanup
        self._cleanup_files(f16_model_path, model_source)

        self._print_completion_summary(model_source, results, output_repo)
        return results

    def _setup_environment(self, url: str) -> tuple[ModelSource, Any, Path, Path | None, str]:
        """Setup environment and prepare model for quantisation.

        Args:
            url: Model URL to parse into a ``ModelSource``.

        Returns:
            Tuple of (model_source, llama_env, f16_model_path, imatrix_path, output_repo).
            ``imatrix_path`` is ``None`` when ``use_imatrix`` is false.
        """
        model_source = self.url_parser.parse(url)
        self._print_model_info(model_source)

        self.models_dir.mkdir(parents=True, exist_ok=True)
        llama_env = self.environment_manager.setup()

        f16_model_path = self.model_manager.prepare_model(model_source, llama_env)

        imatrix_path = None
        if self.use_imatrix:
            logger.info("Generating importance matrix (imatrix)...")
            imatrix_path = self.imatrix_generator.generate_imatrix(
                f16_model_path, llama_env, self.models_dir / model_source.model_name
            )

        # Target repository: <uploader username>/<original author>-<model name>-GGUF
        output_repo = (
            f"{self.uploader.get_username()}/"
            f"{model_source.original_author}-{model_source.model_name}-GGUF"
        )

        return model_source, llama_env, f16_model_path, imatrix_path, output_repo

    def _create_initial_repository(self, model_source: ModelSource, output_repo: str) -> None:
        """Create initial repository with planned quantisations.

        The README is always generated locally; it is uploaded only when
        ``no_upload`` is false.
        """
        logger.info("Creating initial README with planned quantisations...")
        planned_results = {
            qt: QuantisationResult(quantisation_type=qt, success=False, status="planned")
            for qt in SUPPORTED_QUANTISATION_TYPES
        }
        readme_path = self.readme_generator.generate(
            model_source, planned_results, self.models_dir, output_repo
        )

        if not self.no_upload:
            logger.info("Creating repository with planned quantisations...")
            self.uploader.upload_readme(output_repo, readme_path)
        else:
            logger.info("Skipping repository creation (--no-upload specified)")

    def _execute_quantisations(
        self,
        model_source: ModelSource,
        llama_env: Any,
        f16_model_path: Path,
        imatrix_path: Path | None,
        output_repo: str,
    ) -> dict[QuantisationType, QuantisationResult]:
        """Execute all quantisation types with parallel uploads.

        Quantisations run one after another on the calling thread; each finished
        file is handed to a single-worker executor so its upload overlaps with
        the next quantisation.

        Returns:
            dict[QuantisationType, QuantisationResult]: Quantisation results for each type.
        """
        results: dict[QuantisationType, QuantisationResult] = {}
        upload_futures: list[Future[None]] = []

        with ThreadPoolExecutor(max_workers=1, thread_name_prefix="uploader") as upload_executor:
            for quant_type in SUPPORTED_QUANTISATION_TYPES:
                result = self._process_single_quantisation(
                    quant_type,
                    model_source,
                    llama_env,
                    f16_model_path,
                    imatrix_path,
                    output_repo,
                    results,
                    upload_executor,
                    upload_futures,
                )
                results[quant_type] = result

            self._wait_for_uploads(upload_futures)

        return results

    def _process_single_quantisation(
        self,
        quant_type: QuantisationType,
        model_source: ModelSource,
        llama_env: Any,
        f16_model_path: Path,
        imatrix_path: Path | None,
        output_repo: str,
        results: dict[QuantisationType, QuantisationResult],
        upload_executor: ThreadPoolExecutor,
        upload_futures: list[Future[None]],
    ) -> QuantisationResult:
        """Process a single quantisation type.

        Registers a "processing" placeholder, runs the engine, then hands the
        outcome to ``_handle_quantisation_result``. Any exception is converted
        into a failed result by ``_handle_quantisation_error``.

        Returns:
            QuantisationResult: Result of the quantisation attempt.
        """
        try:
            logger.info(f"Starting {quant_type.value} quantisation...")
            config = QUANTISATION_CONFIGS[quant_type]

            # Update status to processing
            result = QuantisationResult(quantisation_type=quant_type, success=False)
            result.status = "processing"
            results[quant_type] = result

            self._update_readme_status(model_source, results, output_repo)

            # Perform quantisation
            context = QuantisationContext(
                f16_model_path=f16_model_path,
                model_source=model_source,
                config=config,
                llama_env=llama_env,
                models_dir=self.models_dir,
                imatrix_path=imatrix_path,
                base_quant=self.imatrix_base,
            )
            result = self.quantisation_engine.quantise(context)
            # Replace the placeholder immediately so README rendering and the
            # upload thread operate on the engine's result object.
            results[quant_type] = result

            self._handle_quantisation_result(
                result,
                quant_type,
                model_source,
                results,
                output_repo,
                upload_executor,
                upload_futures,
            )
        except Exception as e:
            return self._handle_quantisation_error(
                e, quant_type, model_source, results, output_repo
            )
        else:
            return result

    def _handle_quantisation_result(
        self,
        result: QuantisationResult,
        quant_type: QuantisationType,
        model_source: ModelSource,
        results: dict[QuantisationType, QuantisationResult],
        output_repo: str,
        upload_executor: ThreadPoolExecutor,
        upload_futures: list[Future[None]],
    ) -> None:
        """Handle successful or failed quantisation result.

        A successful result is queued for background upload, or simply marked
        "completed" with its file kept on disk when ``no_upload`` is set. An
        unsuccessful result, or one without a file, is marked "failed". The
        README is then refreshed on a best-effort basis.
        """
        # Ensure the shared mapping holds this exact object: the upload thread
        # updates ``results[quant_type]`` and must not touch a stale placeholder.
        results[quant_type] = result

        if not (result.success and result.file_path):
            result.status = "failed"
        elif self.no_upload:
            quant_str = self._quant_label(result.quantisation_type)
            logger.info(f"Skipping upload of {quant_str} (--no-upload specified)")
            result.status = "completed"
        else:
            quant_str = self._quant_label(result.quantisation_type)
            logger.info(f"Starting parallel upload of {quant_str}...")
            file_path = result.file_path
            # Set the in-flight state BEFORE submitting, so the upload thread's
            # "completed"/"failed" status can never be overwritten afterwards.
            result.file_path = None  # Mark as being uploaded
            result.status = "uploading"
            upload_future = upload_executor.submit(
                self._upload_and_cleanup,
                output_repo,
                file_path,
                quant_type,
                model_source,
                results,
            )
            upload_futures.append(upload_future)

        # Best-effort: a README refresh failure must not turn a finished (and
        # possibly already uploading) quantisation into a failure.
        self._update_readme_status_safely(
            model_source, results, output_repo, "after quantisation"
        )

    def _handle_quantisation_error(
        self,
        error: Exception,
        quant_type: QuantisationType,
        model_source: ModelSource,
        results: dict[QuantisationType, QuantisationResult],
        output_repo: str,
    ) -> QuantisationResult:
        """Handle quantisation processing error.

        Records a failed result for ``quant_type`` in ``results`` and refreshes
        the README on a best-effort basis.

        Returns:
            QuantisationResult: Failed quantisation result with error information.
        """
        logger.error(f"Error processing {quant_type.value}: {error}")
        result = QuantisationResult(quantisation_type=quant_type, success=False)
        result.status = "failed"
        result.error_message = str(error)
        # Store before rendering, otherwise the README still shows "processing".
        results[quant_type] = result

        self._update_readme_status_safely(model_source, results, output_repo, "after error")

        return result

    def _update_readme_status(
        self,
        model_source: ModelSource,
        results: dict[QuantisationType, QuantisationResult],
        output_repo: str,
    ) -> None:
        """Update README with current quantisation status.

        Does nothing when ``no_upload`` is set. Exceptions from generation or
        upload propagate to the caller.
        """
        if not self.no_upload:
            updated_readme_path = self.readme_generator.generate(
                model_source, results, self.models_dir, output_repo
            )
            self.uploader.upload_readme(output_repo, updated_readme_path)

    def _update_readme_status_safely(
        self,
        model_source: ModelSource,
        results: dict[QuantisationType, QuantisationResult],
        output_repo: str,
        context: str,
    ) -> None:
        """Refresh the README, logging instead of raising on failure.

        Args:
            model_source: Source model the README describes.
            results: Current per-type results to render.
            output_repo: Repository the README is uploaded to.
            context: Short phrase inserted into the log message, e.g. "after error".
        """
        try:
            self._update_readme_status(model_source, results, output_repo)
        except Exception as readme_error:
            logger.error(f"Failed to update README {context}: {readme_error}")

    def _wait_for_uploads(self, upload_futures: list[Future[None]]) -> None:
        """Wait for all parallel uploads to complete.

        Each future is awaited for at most five minutes. Failures and timeouts
        are logged as warnings and never raised. A timed-out upload is not
        cancelled; the executor context in ``_execute_quantisations`` still
        waits for it when it exits.
        """
        timeout_seconds = 300  # 5 minute timeout per upload
        logger.info("Waiting for any remaining uploads to complete...")
        for future in upload_futures:
            try:
                future.result(timeout=timeout_seconds)
            except Exception as e:
                # A wait timeout carries no message. ``done()`` tells it apart
                # from a timeout error raised inside the upload itself.
                if isinstance(e, FuturesTimeoutError) and not future.done():
                    logger.warning(
                        f"Upload error: upload still running after {timeout_seconds}s"
                    )
                else:
                    logger.warning(f"Upload error: {e}")

    def _cleanup_files(self, f16_model_path: Path, model_source: ModelSource) -> None:
        """Clean up temporary files after processing.

        Removes the F16 model if it exists and, for sources that are not GGUF
        repositories, the original PyTorch weight files.
        """
        if f16_model_path.exists():
            logger.info(f"Removing F16 model {f16_model_path.name} to save disk space...")
            f16_model_path.unlink()

        if not model_source.is_gguf_repo:
            self._cleanup_original_model(model_source)

    def _cleanup_original_model(self, model_source: ModelSource) -> None:
        """Clean up original PyTorch weight files after processing.

        Only files matching ``pytorch_model*.bin`` in the model directory are
        deleted; everything else in that directory is left in place.
        """
        model_dir = self.models_dir / model_source.model_name

        pytorch_files = list(model_dir.glob("pytorch_model*.bin"))
        if pytorch_files:
            logger.info(f"Removing {len(pytorch_files)} PyTorch model files to save disk space...")
            for file in pytorch_files:
                file.unlink()

        logger.info("Keeping config files, tokeniser, and metadata for reference")

    def _upload_and_cleanup(
        self,
        output_repo: str,
        file_path: Path,
        quant_type: QuantisationType,
        model_source: ModelSource,
        results: dict[QuantisationType, QuantisationResult],
    ) -> None:
        """Upload file and clean up (runs in background thread).

        On success the local file is deleted, the result is marked "completed"
        and the README is refreshed. If the upload or deletion fails, the result
        is marked "failed" with the error message, the README is refreshed on a
        best-effort basis, and the original exception is re-raised so it
        surfaces through the future.

        Raises:
            Exception: Whatever the upload, file removal, or the success-path
                README refresh raised.
        """
        quant_str = self._quant_label(quant_type)
        try:
            logger.info(f"[PARALLEL] Uploading {quant_str}...")
            self.uploader.upload_model_file(output_repo, file_path)

            logger.info(f"[PARALLEL] Removing {file_path.name} to save disk space...")
            file_path.unlink()
        except Exception as e:
            logger.error(f"[PARALLEL] Failed to upload {quant_str}: {e}")
            results[quant_type].status = "failed"
            results[quant_type].error_message = str(e)

            # Guarded so a README failure cannot mask the upload error below.
            self._update_readme_status_safely(model_source, results, output_repo, "after error")
            raise

        # Outside the try block: a README failure from here on must not relabel
        # an already uploaded model as "failed".
        results[quant_type].status = "completed"
        self._update_readme_status(model_source, results, output_repo)

        logger.info(f"[PARALLEL] {quant_str} upload and cleanup complete")

    def _print_model_info(self, model_source: ModelSource) -> None:
        """Print model information."""
        logger.info(f"Source URL: {model_source.url}")
        logger.info(f"Source model: {model_source.source_model}")
        logger.info(f"Original author: {model_source.original_author}")
        logger.info(f"Model name: {model_source.model_name}")
        logger.info(f"Your HF username: {self.uploader.get_username()}")
        logger.info(f"Working directory: {self.work_dir}")

    def _print_completion_summary(
        self,
        model_source: ModelSource,
        results: dict[QuantisationType, QuantisationResult],
        output_repo: str,
    ) -> None:
        """Print completion summary.

        Lists the repository and each successful quantisation that reports a
        file size, or logs an error when no quantisation succeeded.
        """
        successful_results = [r for r in results.values() if r.success]

        if successful_results:
            logger.info("Complete! Your quantised models are available at:")
            logger.info(f" https://huggingface.co/{output_repo}")
            logger.info("Model info:")
            logger.info(f" - Source URL: {model_source.url}")
            logger.info(f" - Original: {model_source.source_model}")
            logger.info(
                " - Method: "
                f"{'Direct GGUF download' if model_source.is_gguf_repo else 'HF model conversion'}"
            )
            logger.info(f" - Quantised: {output_repo}")

            for result in successful_results:
                if result.file_size:
                    # Use the short name so the printed filename does not depend
                    # on how the enum renders inside an f-string.
                    quant_str = self._quant_label(result.quantisation_type)
                    filename = (
                        f"{model_source.original_author}-{model_source.model_name}-"
                        f"{quant_str}.gguf"
                    )
                    logger.info(f" - {quant_str}: {filename} ({result.file_size})")
        else:
            logger.error(
                "All quantisations failed - repository created with documentation "
                "but no model files"
            )
            logger.error(f" Repository: https://huggingface.co/{output_repo}")
|