llm-gguf-tools/helpers/quantisation/executor.py

"""Quantisation execution management.
Handles the execution of quantisation operations including parallel
uploads, status tracking, and error handling.
"""
from __future__ import annotations
import gc
import traceback
from concurrent.futures import ThreadPoolExecutor
from typing import TYPE_CHECKING, Any
from helpers.config.quantisation_configs import QUANTISATION_CONFIGS
from helpers.logger import logger
from helpers.models.quantisation import (
QuantisationContext,
QuantisationResult,
QuantisationType,
)
from helpers.quantisation.progress import ProgressReporter
from helpers.utils.rate_limiter import ReadmeRateLimiter
if TYPE_CHECKING:
from pathlib import Path
from helpers.filesystem import FileCleanup
from helpers.huggingface import HuggingFaceUploader
from helpers.models.quantisation import ModelSource
from helpers.quantisation.engine import QuantisationEngine
from helpers.readme import ReadmeGenerator
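

# A minimal usage sketch (hypothetical wiring; the constructor arguments for
# each service live in their respective helpers modules):
#
#     executor = QuantisationExecutor(
#         quantisation_engine=engine,
#         uploader=uploader,
#         readme_generator=readme_generator,
#         file_cleanup=file_cleanup,
#     )
#     results = executor.execute_quantisations(
#         model_source=source,
#         f16_model_path=f16_path,
#         imatrix_path=None,
#         output_repo="user/model-GGUF",
#         quantisation_types=list(QUANTISATION_CONFIGS),
#         models_dir=models_dir,
#     )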
class QuantisationExecutor:
    """Executes quantisation operations with parallel upload support.

    Manages the execution of multiple quantisations with background
    uploads, status tracking, and proper error handling.
    """

    def __init__(
        self,
        quantisation_engine: QuantisationEngine,
        uploader: HuggingFaceUploader,
        readme_generator: ReadmeGenerator,
        file_cleanup: FileCleanup,
        no_upload: bool = False,
    ) -> None:
        """Initialise quantisation executor.

        Sets up the quantisation executor with all required service dependencies
        for performing quantisations, uploading results, generating documentation,
        and cleaning up temporary files. Configures upload behaviour based on settings.
        """
        self.quantisation_engine = quantisation_engine
        self.uploader = uploader
        self.readme_generator = readme_generator
        self.file_cleanup = file_cleanup
        self.no_upload = no_upload
        self.readme_limiter = ReadmeRateLimiter(cooldown_seconds=30.0)
        self.progress_reporter = ProgressReporter()

    def execute_quantisations(
        self,
        model_source: ModelSource,
        f16_model_path: Path,
        imatrix_path: Path | None,
        output_repo: str,
        quantisation_types: list[QuantisationType],
        models_dir: Path,
    ) -> dict[QuantisationType, QuantisationResult]:
        """Execute all quantisation types with parallel uploads.

        Orchestrates the complete quantisation workflow including F16 processing,
        multiple quantisation type execution, parallel upload management, and
        README generation. Handles all aspects of the quantisation pipeline
        from initial setup through final documentation.

        Returns:
            Dictionary of quantisation results by type.
        """
        results: dict[QuantisationType, QuantisationResult] = {}

        # Track F16 in results if we converted from SafeTensors
        if not model_source.is_gguf_repo:
            results[QuantisationType.F16] = self._create_f16_result(f16_model_path)

        # Process with parallel uploads
        upload_futures: list[Any] = []
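        # Two upload workers let a finished artefact upload in the background
        # while the main thread carries on with the next quantisation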
        with ThreadPoolExecutor(max_workers=2, thread_name_prefix="upload") as upload_executor:
            # Start F16 upload if applicable
            if (
                not model_source.is_gguf_repo
                and not self.no_upload
                and QuantisationType.F16 in results
            ):
                self._start_f16_upload(
                    results,
                    model_source,
                    output_repo,
                    f16_model_path,
                    upload_executor,
                    upload_futures,
                )

            # Process each quantisation
            for i, quant_type in enumerate(quantisation_types, 1):
                # Skip if already marked as failed
                if quant_type in results and results[quant_type].status == "failed":
                    logger.info(
                        f"Skipping {quant_type.value} - {results[quant_type].error_message}"
                    )
                    continue

                self.progress_reporter.print_quantisation_start(
                    i, len(quantisation_types), quant_type.value
                )

                try:
                    result = self._process_single_quantisation(
                        quant_type,
                        model_source,
                        f16_model_path,
                        imatrix_path,
                        output_repo,
                        results,
                        models_dir,
                        upload_executor,
                        upload_futures,
                    )
                    results[quant_type] = result
                    # Force cleanup between quantisations
                    gc.collect()
                except Exception as e:
                    logger.error(f"❌ Critical error processing {quant_type.value}: {e}")
                    logger.error("Exception traceback:")
                    for line in traceback.format_exc().splitlines():
                        logger.error(f" {line}")
                    results[quant_type] = QuantisationResult(
                        quantisation_type=quant_type,
                        success=False,
                        status="failed",
                        error_message=str(e),
                    )
                    # Force cleanup after error
                    gc.collect()

            # Wait for all uploads to complete
            self._wait_for_uploads(upload_futures)

        # Final README update
        if not self.no_upload and upload_futures:
            self._final_readme_update(model_source, results, models_dir, output_repo)

        return results

    def _process_single_quantisation(
        self,
        quant_type: QuantisationType,
        model_source: ModelSource,
        f16_model_path: Path,
        imatrix_path: Path | None,
        output_repo: str,
        results: dict[QuantisationType, QuantisationResult],
        models_dir: Path,
        upload_executor: ThreadPoolExecutor,
        upload_futures: list,
    ) -> QuantisationResult:
        """Process a single quantisation type.

        Returns:
            QuantisationResult for the processed type.
        """
        try:
            logger.info(f"Starting {quant_type.value} quantisation...")
            config = QUANTISATION_CONFIGS[quant_type]

            # Create initial result and update status
            result = QuantisationResult(quantisation_type=quant_type, success=False)
            result.status = "processing"
            results[quant_type] = result
            self._update_readme_status(model_source, results, models_dir, output_repo)

            # Perform quantisation
            context = QuantisationContext(
                f16_model_path=f16_model_path,
                model_source=model_source,
                config=config,
                models_dir=models_dir,
                imatrix_path=imatrix_path,
            )
            result = self.quantisation_engine.quantise(context)
            # Point the shared results dict at the fresh result object so the
            # README updates below reflect the real outcome rather than the
            # stale "processing" placeholder
            results[quant_type] = result

            # Handle result
            if result.success and result.file_path:
                self._start_parallel_upload(
                    result,
                    quant_type,
                    output_repo,
                    model_source,
                    results,
                    models_dir,
                    upload_executor,
                    upload_futures,
                )
            else:
                result.status = "failed"
                self._update_readme_status(model_source, results, models_dir, output_repo)
        except Exception as e:
            logger.error(f"Error processing {quant_type.value}: {e}")
            result = QuantisationResult(quantisation_type=quant_type, success=False)
            result.status = "failed"
            result.error_message = str(e)
            results[quant_type] = result
            try:
                self._update_readme_status(model_source, results, models_dir, output_repo)
            except Exception as readme_error:
                logger.error(f"Failed to update README after error: {readme_error}")

        return result

    def _start_parallel_upload(
        self,
        result: QuantisationResult,
        quant_type: QuantisationType,
        output_repo: str,
        model_source: ModelSource,
        results: dict[QuantisationType, QuantisationResult],
        models_dir: Path,
        upload_executor: ThreadPoolExecutor,
        upload_futures: list,
    ) -> None:
        """Start parallel upload of quantisation result."""
        if self.no_upload or not result.file_path:
            return
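
        # Tolerate both enum and plain-string quantisation types when logging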
        quant_str = getattr(result.quantisation_type, "value", result.quantisation_type)
        logger.info(f"Starting parallel upload of {quant_str}...")
        upload_future = upload_executor.submit(
            self._upload_and_cleanup,
            output_repo,
            result.file_path,
            quant_type,
            model_source,
            results,
            models_dir,
        )
        upload_futures.append(upload_future)
        result.file_path = None  # Mark as being uploaded
        result.status = "uploading"
        self._update_readme_status(model_source, results, models_dir, output_repo)

    def _upload_and_cleanup(
        self,
        output_repo: str,
        file_path: Path,
        quant_type: QuantisationType,
        model_source: ModelSource,
        results: dict[QuantisationType, QuantisationResult],
        models_dir: Path,
    ) -> None:
        """Upload file and clean up (runs in background thread)."""
        try:
            logger.info(f"[PARALLEL] Starting upload of {quant_type.value} ({file_path.name})")
            self.uploader.upload_model_file(output_repo, file_path)
            logger.info(f"[PARALLEL] Upload of {quant_type.value} completed successfully")

            self.file_cleanup.cleanup_quantisation_file(file_path)
            results[quant_type].status = "completed"

            updated_readme_path = self.readme_generator.generate(
                model_source, results, models_dir, output_repo
            )
            self.uploader.upload_readme(output_repo, updated_readme_path)
            logger.info(f"[PARALLEL] {quant_type.value} upload and cleanup complete")
        except Exception as e:
            logger.error(f"[PARALLEL] Failed to upload {quant_type.value}: {e}")
            results[quant_type].status = "failed"
            results[quant_type].error_message = str(e)
            try:
                updated_readme_path = self.readme_generator.generate(
                    model_source, results, models_dir, output_repo
                )
                self.uploader.upload_readme(output_repo, updated_readme_path)
            except Exception as readme_error:
                logger.error(
                    f"[PARALLEL] Failed to update README after upload error: {readme_error}"
                )

    def _start_f16_upload(
        self,
        results: dict[QuantisationType, QuantisationResult],
        model_source: ModelSource,
        output_repo: str,
        f16_model_path: Path,
        upload_executor: ThreadPoolExecutor,
        upload_futures: list,
    ) -> None:
        """Start F16 upload in background."""
        f16_result = results[QuantisationType.F16]
        if f16_result.file_path and f16_result.file_path.exists():
            logger.info("Starting parallel upload of F16 GGUF...")
            f16_result.status = "uploading"
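            # f16_model_path is assumed to sit at models_dir/<model>/<file>.gguf,
            # so parent.parent recovers models_dir for the README generator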
            self._update_readme_status(
                model_source, results, f16_model_path.parent.parent, output_repo
            )
            upload_future = upload_executor.submit(
                self._upload_f16_and_cleanup,
                output_repo,
                f16_result.file_path,
                model_source,
                results,
                f16_model_path.parent.parent,
            )
            upload_futures.append(upload_future)

    def _upload_f16_and_cleanup(
        self,
        output_repo: str,
        file_path: Path,
        model_source: ModelSource,
        results: dict[QuantisationType, QuantisationResult],
        models_dir: Path,
    ) -> None:
        """Upload F16 file and update status (runs in background thread)."""
        try:
            logger.info(f"[PARALLEL] Starting upload of F16 GGUF ({file_path.name})")
            self.uploader.upload_model_file(output_repo, file_path)
            logger.info("[PARALLEL] Upload of F16 GGUF completed successfully")

            # Don't delete F16 yet - still needed for quantisations
            results[QuantisationType.F16].status = "completed"

            updated_readme_path = self.readme_generator.generate(
                model_source, results, models_dir, output_repo
            )
            self.uploader.upload_readme(output_repo, updated_readme_path)
            logger.info("[PARALLEL] F16 upload complete")
        except Exception as e:
            logger.error(f"[PARALLEL] Failed to upload F16: {e}")
            results[QuantisationType.F16].status = "failed"
            results[QuantisationType.F16].error_message = str(e)
            try:
                updated_readme_path = self.readme_generator.generate(
                    model_source, results, models_dir, output_repo
                )
                self.uploader.upload_readme(output_repo, updated_readme_path)
            except Exception as readme_error:
                logger.error(
                    f"[PARALLEL] Failed to update README after F16 upload error: {readme_error}"
                )

    def _create_f16_result(self, f16_model_path: Path) -> QuantisationResult:
        """Create a result object for F16 tracking.

        Returns:
            QuantisationResult object for F16 tracking.
        """
        f16_size = "-"
        if f16_model_path.exists():
            size_bytes = f16_model_path.stat().st_size
            size_gb = size_bytes / (1024**3)
            f16_size = f"{size_gb:.1f}GB"

        # Create a simple result object for F16 tracking
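        # (a stand-in duck-types the attributes downstream consumers read -
        # status, file_path, file_size - which is assumed to be all the README
        # generator and upload paths need)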
        return type(
            "F16Result",
            (),
            {
                "quantisation_type": "F16",
                "success": True,
                "status": "planned",
                "file_path": f16_model_path,
                "file_size": f16_size,
            },
        )()

    def _wait_for_uploads(self, upload_futures: list) -> None:
        """Wait for all parallel uploads to complete."""
        if not upload_futures:
            return

        logger.info(f"Waiting for {len(upload_futures)} uploads to complete...")
        completed = 0
        failed = 0
        for future in upload_futures:
            try:
                future.result(timeout=300)  # 5 minute timeout per upload
                completed += 1
                logger.info(f"Upload progress: {completed}/{len(upload_futures)} completed")
            except Exception as e:
                failed += 1
                logger.warning(f"Upload error ({completed + failed}/{len(upload_futures)}): {e}")

        self.progress_reporter.print_upload_summary(completed, failed)

    def _update_readme_status(
        self,
        model_source: ModelSource,
        results: dict[QuantisationType, QuantisationResult],
        models_dir: Path,
        output_repo: str,
    ) -> None:
        """Update README with current quantisation status using rate limiting."""
        if not self.no_upload:
            # Use rate limiter to batch updates
            self.readme_limiter.request_update(
                self._do_readme_update,
                model_source,
                results,
                models_dir,
                output_repo,
            )

    def _do_readme_update(
        self,
        model_source: ModelSource,
        results: dict[QuantisationType, QuantisationResult],
        models_dir: Path,
        output_repo: str,
    ) -> None:
        """Actually perform the README update (called by rate limiter)."""
        updated_readme_path = self.readme_generator.generate(
            model_source, results, models_dir, output_repo
        )
        self.uploader.upload_readme(output_repo, updated_readme_path)

    def _final_readme_update(
        self,
        model_source: ModelSource,
        results: dict[QuantisationType, QuantisationResult],
        models_dir: Path,
        output_repo: str,
    ) -> None:
        """Perform final README update after all operations."""
        logger.info("Updating README with final status...")
        final_readme = self.readme_generator.generate(
            model_source, results, models_dir, output_repo
        )
        self.uploader.upload_readme(output_repo, final_readme)