457 lines
16 KiB
Python
457 lines
16 KiB
Python
"""Quantisation execution management.
|
|
|
|
Handles the execution of quantisation operations including parallel
|
|
uploads, status tracking, and error handling.
|
|
"""
|
|
|
|
from __future__ import annotations

import gc
import traceback
from concurrent.futures import ThreadPoolExecutor
from types import SimpleNamespace
from typing import TYPE_CHECKING, Any

from helpers.config.quantisation_configs import QUANTISATION_CONFIGS
from helpers.logger import logger
from helpers.models.quantisation import (
    QuantisationContext,
    QuantisationResult,
    QuantisationType,
)
from helpers.quantisation.progress import ProgressReporter
from helpers.utils.rate_limiter import ReadmeRateLimiter
|
|
|
|
if TYPE_CHECKING:
|
|
from pathlib import Path
|
|
|
|
from helpers.filesystem import FileCleanup
|
|
from helpers.huggingface import HuggingFaceUploader
|
|
from helpers.models.quantisation import ModelSource
|
|
from helpers.quantisation.engine import QuantisationEngine
|
|
from helpers.readme import ReadmeGenerator
|
|
|
|
|
|
class QuantisationExecutor:
    """Runs quantisations while uploading finished artefacts in the background.

    Coordinates several quantisation passes over one model, streams completed
    files to the hub on worker threads, tracks per-type status, and surfaces
    failures without aborting the remaining work.
    """

    def __init__(
        self,
        quantisation_engine: QuantisationEngine,
        uploader: HuggingFaceUploader,
        readme_generator: ReadmeGenerator,
        file_cleanup: FileCleanup,
        no_upload: bool = False,
    ) -> None:
        """Wire up the collaborating services this executor depends on.

        Stores the quantisation engine, uploader, README generator and file
        cleanup helpers, records whether uploads are disabled, and prepares
        the README rate limiter and progress reporter used during execution.
        """
        self.no_upload = no_upload
        self.quantisation_engine = quantisation_engine
        self.uploader = uploader
        self.readme_generator = readme_generator
        self.file_cleanup = file_cleanup
        # Coalesce bursts of status changes so README uploads are throttled.
        self.readme_limiter = ReadmeRateLimiter(cooldown_seconds=30.0)
        self.progress_reporter = ProgressReporter()
    def execute_quantisations(
        self,
        model_source: ModelSource,
        f16_model_path: Path,
        imatrix_path: Path | None,
        output_repo: str,
        quantisation_types: list[QuantisationType],
        models_dir: Path,
    ) -> dict[QuantisationType, QuantisationResult]:
        """Execute all quantisation types with parallel uploads.

        Orchestrates the complete quantisation workflow including F16 processing,
        multiple quantisation type execution, parallel upload management, and
        README generation. Handles all aspects of the quantisation pipeline
        from initial setup through final documentation.

        Quantisations run sequentially on this thread; uploads run on up to
        two background threads so the next quantisation can start while the
        previous file is still transferring.

        Returns:
            Dictionary of quantisation results by type.
        """
        results: dict[QuantisationType, QuantisationResult] = {}

        # Track F16 in results if we converted from SafeTensors
        # (a GGUF source repo already has an F16 file upstream, so none is made).
        if not model_source.is_gguf_repo:
            results[QuantisationType.F16] = self._create_f16_result(f16_model_path)

        # Process with parallel uploads
        upload_futures: list[Any] = []

        # max_workers=2 bounds concurrent hub uploads; exiting the context
        # manager also joins any still-running uploads.
        with ThreadPoolExecutor(max_workers=2, thread_name_prefix="upload") as upload_executor:
            # Start F16 upload if applicable
            if (
                not model_source.is_gguf_repo
                and not self.no_upload
                and QuantisationType.F16 in results
            ):
                self._start_f16_upload(
                    results,
                    model_source,
                    output_repo,
                    f16_model_path,
                    upload_executor,
                    upload_futures,
                )

            # Process each quantisation
            for i, quant_type in enumerate(quantisation_types, 1):
                # Skip if already marked as failed
                if quant_type in results and results[quant_type].status == "failed":
                    logger.info(
                        f"Skipping {quant_type.value} - {results[quant_type].error_message}"
                    )
                    continue

                self.progress_reporter.print_quantisation_start(
                    i, len(quantisation_types), quant_type.value
                )

                try:
                    result = self._process_single_quantisation(
                        quant_type,
                        model_source,
                        f16_model_path,
                        imatrix_path,
                        output_repo,
                        results,
                        models_dir,
                        upload_executor,
                        upload_futures,
                    )
                    results[quant_type] = result

                    # Force cleanup between quantisations
                    # (quantisation can allocate large transient buffers).
                    gc.collect()

                except Exception as e:
                    # Last-resort guard: _process_single_quantisation handles its
                    # own errors, so reaching here means something unexpected.
                    logger.error(f"❌ Critical error processing {quant_type.value}: {e}")
                    logger.error("Exception traceback:")
                    for line in traceback.format_exc().splitlines():
                        logger.error(f"  {line}")

                    results[quant_type] = QuantisationResult(
                        quantisation_type=quant_type,
                        success=False,
                        status="failed",
                        error_message=str(e),
                    )

                    # Force cleanup after error
                    gc.collect()

        # Wait for all uploads to complete
        self._wait_for_uploads(upload_futures)

        # Final README update — only needed if anything was actually uploaded.
        if not self.no_upload and upload_futures:
            self._final_readme_update(model_source, results, models_dir, output_repo)

        return results
def _process_single_quantisation(
|
|
self,
|
|
quant_type: QuantisationType,
|
|
model_source: ModelSource,
|
|
f16_model_path: Path,
|
|
imatrix_path: Path | None,
|
|
output_repo: str,
|
|
results: dict[QuantisationType, QuantisationResult],
|
|
models_dir: Path,
|
|
upload_executor: ThreadPoolExecutor,
|
|
upload_futures: list,
|
|
) -> QuantisationResult:
|
|
"""Process a single quantisation type.
|
|
|
|
Returns:
|
|
QuantisationResult for the processed type.
|
|
"""
|
|
try:
|
|
logger.info(f"Starting {quant_type.value} quantisation...")
|
|
config = QUANTISATION_CONFIGS[quant_type]
|
|
|
|
# Create initial result and update status
|
|
result = QuantisationResult(quantisation_type=quant_type, success=False)
|
|
result.status = "processing"
|
|
results[quant_type] = result
|
|
|
|
self._update_readme_status(model_source, results, models_dir, output_repo)
|
|
|
|
# Perform quantisation
|
|
context = QuantisationContext(
|
|
f16_model_path=f16_model_path,
|
|
model_source=model_source,
|
|
config=config,
|
|
models_dir=models_dir,
|
|
imatrix_path=imatrix_path,
|
|
)
|
|
result = self.quantisation_engine.quantise(context)
|
|
|
|
# Handle result
|
|
if result.success and result.file_path:
|
|
self._start_parallel_upload(
|
|
result,
|
|
quant_type,
|
|
output_repo,
|
|
model_source,
|
|
results,
|
|
models_dir,
|
|
upload_executor,
|
|
upload_futures,
|
|
)
|
|
else:
|
|
result.status = "failed"
|
|
self._update_readme_status(model_source, results, models_dir, output_repo)
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error processing {quant_type.value}: {e}")
|
|
result = QuantisationResult(quantisation_type=quant_type, success=False)
|
|
result.status = "failed"
|
|
result.error_message = str(e)
|
|
|
|
try:
|
|
self._update_readme_status(model_source, results, models_dir, output_repo)
|
|
except Exception as readme_error:
|
|
logger.error(f"Failed to update README after error: {readme_error}")
|
|
|
|
return result
|
|
|
|
def _start_parallel_upload(
|
|
self,
|
|
result: QuantisationResult,
|
|
quant_type: QuantisationType,
|
|
output_repo: str,
|
|
model_source: ModelSource,
|
|
results: dict[QuantisationType, QuantisationResult],
|
|
models_dir: Path,
|
|
upload_executor: ThreadPoolExecutor,
|
|
upload_futures: list,
|
|
) -> None:
|
|
"""Start parallel upload of quantisation result."""
|
|
if self.no_upload or not result.file_path:
|
|
return
|
|
|
|
quant_str = getattr(result.quantisation_type, "value", result.quantisation_type)
|
|
logger.info(f"Starting parallel upload of {quant_str}...")
|
|
|
|
upload_future = upload_executor.submit(
|
|
self._upload_and_cleanup,
|
|
output_repo,
|
|
result.file_path,
|
|
quant_type,
|
|
model_source,
|
|
results,
|
|
models_dir,
|
|
)
|
|
upload_futures.append(upload_future)
|
|
|
|
result.file_path = None # Mark as being uploaded
|
|
result.status = "uploading"
|
|
self._update_readme_status(model_source, results, models_dir, output_repo)
|
|
|
|
def _upload_and_cleanup(
|
|
self,
|
|
output_repo: str,
|
|
file_path: Path,
|
|
quant_type: QuantisationType,
|
|
model_source: ModelSource,
|
|
results: dict[QuantisationType, QuantisationResult],
|
|
models_dir: Path,
|
|
) -> None:
|
|
"""Upload file and clean up (runs in background thread)."""
|
|
try:
|
|
logger.info(f"[PARALLEL] Starting upload of {quant_type.value} ({file_path.name})")
|
|
self.uploader.upload_model_file(output_repo, file_path)
|
|
logger.info(f"[PARALLEL] Upload of {quant_type.value} completed successfully")
|
|
|
|
self.file_cleanup.cleanup_quantisation_file(file_path)
|
|
|
|
results[quant_type].status = "completed"
|
|
updated_readme_path = self.readme_generator.generate(
|
|
model_source, results, models_dir, output_repo
|
|
)
|
|
self.uploader.upload_readme(output_repo, updated_readme_path)
|
|
|
|
logger.info(f"[PARALLEL] {quant_type} upload and cleanup complete")
|
|
|
|
except Exception as e:
|
|
logger.error(f"[PARALLEL] Failed to upload {quant_type}: {e}")
|
|
results[quant_type].status = "failed"
|
|
results[quant_type].error_message = str(e)
|
|
|
|
try:
|
|
updated_readme_path = self.readme_generator.generate(
|
|
model_source, results, models_dir, output_repo
|
|
)
|
|
self.uploader.upload_readme(output_repo, updated_readme_path)
|
|
except Exception as readme_error:
|
|
logger.error(
|
|
f"[PARALLEL] Failed to update README after upload error: {readme_error}"
|
|
)
|
|
|
|
def _start_f16_upload(
|
|
self,
|
|
results: dict[QuantisationType, QuantisationResult],
|
|
model_source: ModelSource,
|
|
output_repo: str,
|
|
f16_model_path: Path,
|
|
upload_executor: ThreadPoolExecutor,
|
|
upload_futures: list,
|
|
) -> None:
|
|
"""Start F16 upload in background."""
|
|
f16_result = results[QuantisationType.F16]
|
|
if f16_result.file_path and f16_result.file_path.exists():
|
|
logger.info("Starting parallel upload of F16 GGUF...")
|
|
f16_result.status = "uploading"
|
|
self._update_readme_status(
|
|
model_source, results, f16_model_path.parent.parent, output_repo
|
|
)
|
|
|
|
upload_future = upload_executor.submit(
|
|
self._upload_f16_and_cleanup,
|
|
output_repo,
|
|
f16_result.file_path,
|
|
model_source,
|
|
results,
|
|
f16_model_path.parent.parent,
|
|
)
|
|
upload_futures.append(upload_future)
|
|
|
|
def _upload_f16_and_cleanup(
|
|
self,
|
|
output_repo: str,
|
|
file_path: Path,
|
|
model_source: ModelSource,
|
|
results: dict[QuantisationType, QuantisationResult],
|
|
models_dir: Path,
|
|
) -> None:
|
|
"""Upload F16 file and update status (runs in background thread)."""
|
|
try:
|
|
logger.info(f"[PARALLEL] Starting upload of F16 GGUF ({file_path.name})")
|
|
self.uploader.upload_model_file(output_repo, file_path)
|
|
logger.info("[PARALLEL] Upload of F16 GGUF completed successfully")
|
|
|
|
# Don't delete F16 yet - still needed for quantisations
|
|
|
|
results[QuantisationType.F16].status = "completed"
|
|
updated_readme_path = self.readme_generator.generate(
|
|
model_source, results, models_dir, output_repo
|
|
)
|
|
self.uploader.upload_readme(output_repo, updated_readme_path)
|
|
|
|
logger.info("[PARALLEL] F16 upload complete")
|
|
|
|
except Exception as e:
|
|
logger.error(f"[PARALLEL] Failed to upload F16: {e}")
|
|
results[QuantisationType.F16].status = "failed"
|
|
results[QuantisationType.F16].error_message = str(e)
|
|
|
|
try:
|
|
updated_readme_path = self.readme_generator.generate(
|
|
model_source, results, models_dir, output_repo
|
|
)
|
|
self.uploader.upload_readme(output_repo, updated_readme_path)
|
|
except Exception as readme_error:
|
|
logger.error(
|
|
f"[PARALLEL] Failed to update README after F16 upload error: {readme_error}"
|
|
)
|
|
|
|
def _create_f16_result(self, f16_model_path: Path) -> QuantisationResult:
|
|
"""Create a result object for F16 tracking.
|
|
|
|
Returns:
|
|
QuantisationResult object for F16 tracking.
|
|
"""
|
|
f16_size = "-"
|
|
if f16_model_path.exists():
|
|
size_bytes = f16_model_path.stat().st_size
|
|
size_gb = size_bytes / (1024**3)
|
|
f16_size = f"{size_gb:.1f}GB"
|
|
|
|
# Create a simple result object for F16 tracking
|
|
return type(
|
|
"F16Result",
|
|
(),
|
|
{
|
|
"quantisation_type": "F16",
|
|
"success": True,
|
|
"status": "planned",
|
|
"file_path": f16_model_path,
|
|
"file_size": f16_size,
|
|
},
|
|
)()
|
|
|
|
def _wait_for_uploads(self, upload_futures: list) -> None:
|
|
"""Wait for all parallel uploads to complete."""
|
|
if not upload_futures:
|
|
return
|
|
|
|
logger.info(f"Waiting for {len(upload_futures)} uploads to complete...")
|
|
completed = 0
|
|
failed = 0
|
|
|
|
for future in upload_futures:
|
|
try:
|
|
future.result(timeout=300) # 5 minute timeout per upload
|
|
completed += 1
|
|
logger.info(f"Upload progress: {completed}/{len(upload_futures)} completed")
|
|
except Exception as e:
|
|
failed += 1
|
|
logger.warning(f"Upload error ({completed + failed}/{len(upload_futures)}): {e}")
|
|
|
|
self.progress_reporter.print_upload_summary(completed, failed)
|
|
|
|
def _update_readme_status(
|
|
self,
|
|
model_source: ModelSource,
|
|
results: dict[QuantisationType, QuantisationResult],
|
|
models_dir: Path,
|
|
output_repo: str,
|
|
) -> None:
|
|
"""Update README with current quantisation status using rate limiting."""
|
|
if not self.no_upload:
|
|
# Use rate limiter to batch updates
|
|
self.readme_limiter.request_update(
|
|
self._do_readme_update,
|
|
model_source,
|
|
results,
|
|
models_dir,
|
|
output_repo,
|
|
)
|
|
|
|
def _do_readme_update(
|
|
self,
|
|
model_source: ModelSource,
|
|
results: dict[QuantisationType, QuantisationResult],
|
|
models_dir: Path,
|
|
output_repo: str,
|
|
) -> None:
|
|
"""Actually perform the README update (called by rate limiter)."""
|
|
updated_readme_path = self.readme_generator.generate(
|
|
model_source, results, models_dir, output_repo
|
|
)
|
|
self.uploader.upload_readme(output_repo, updated_readme_path)
|
|
|
|
def _final_readme_update(
|
|
self,
|
|
model_source: ModelSource,
|
|
results: dict[QuantisationType, QuantisationResult],
|
|
models_dir: Path,
|
|
output_repo: str,
|
|
) -> None:
|
|
"""Perform final README update after all operations."""
|
|
logger.info("Updating README with final status...")
|
|
final_readme = self.readme_generator.generate(
|
|
model_source, results, models_dir, output_repo
|
|
)
|
|
self.uploader.upload_readme(output_repo, final_readme)
|