llm-gguf-tools/quantise_gguf.py

#!/usr/bin/env python3
"""Advanced Quantisation Script for GGUF model processing.

Implements a sophisticated quantisation pipeline supporting Q4_K_M, Q4_K_L, Q4_K_XL and custom
profiles with tensor-level precision control. Features parallel processing, status tracking,
automatic README generation, and HuggingFace integration for streamlined model distribution
workflows.

Usage: python quantise_gguf.py <huggingface_url>
"""

from __future__ import annotations

import argparse
import shutil
import sys
from pathlib import Path

from helpers.logger import logger
from helpers.services.orchestrator import QuantisationOrchestrator


def _build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the quantisation tool.

    Defines one positional argument (``url``) and the optional ``--work-dir``,
    ``--no-imatrix``, ``--no-upload`` and ``--profiles`` switches.
    """
    parser = argparse.ArgumentParser(
        description=(
            "GGUF model quantisation tool supporting Q2-Q8 formats including K-quants, "
            "legacy formats, and Bartowski method variants with tensor-specific precision "
            "for embeddings and output layers."
        ),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  uv run quantise.py https://huggingface.co/MyUser/SafeTensorModelRepo
  uv run quantise.py hf.co/MyUser/Model-Repo-GGUF:F16
        """,
    )
    parser.add_argument("url", help="HuggingFace model URL")
    parser.add_argument("--work-dir", type=Path, help="Working directory (default: ./work)")
    parser.add_argument(
        "--no-imatrix",
        action="store_true",
        help="Skip checking for imatrix files (faster but lower quality)",
    )
    parser.add_argument(
        "--no-upload",
        action="store_true",
        help="Skip uploading to HuggingFace (local testing only)",
    )
    parser.add_argument(
        "--profiles",
        nargs="*",
        help=(
            "Quantisation profiles to use "
            "(default: Q3_K_M Q3_K_L Q3_K_XL Q4_K_M Q4_K_L Q5_K_M Q6_K Q6_K_L Q8_0)"
        ),
    )
    return parser


def _confirm_cleanup() -> bool:
    """Ask on stdin whether the working files should be deleted.

    Returns:
        True only when the user answers "y" (case-insensitive, surrounding
        whitespace ignored); False for any other answer.
    """
    try:
        response = input("Delete working files? (y/N): ").strip().lower()
    except EOFError:
        # stdin is closed or exhausted (CI, cron, piped invocation). Fall back
        # to the prompt's default "N" so a successful run is not reported as a
        # failure just because nobody could answer the question.
        return False
    return response == "y"


def main() -> None:
    """Main entry point for the Bartowski quantisation workflow.

    Parses command-line arguments, initialises the quantisation orchestrator,
    and executes the complete model processing pipeline from HuggingFace URL
    to quantised GGUF files with optional HuggingFace upload and cleanup.

    After the pipeline finishes the user is asked whether the working
    directory should be removed. If no answer can be read from stdin the
    working files are kept.

    Raises:
        SystemExit: With status 1 when the URL is empty or when any exception
            is raised while running the pipeline or the cleanup; argparse
            itself exits with status 2 on invalid arguments.
    """
    parser = _build_parser()
    args = parser.parse_args()

    # argparse already rejects a missing positional; this guards against an
    # explicitly empty string being passed as the URL.
    if not args.url:
        parser.print_help()
        sys.exit(1)

    try:
        orchestrator = QuantisationOrchestrator(
            work_dir=args.work_dir or Path.cwd() / "work",
            use_imatrix=not args.no_imatrix,
            no_upload=args.no_upload,
            custom_profiles=args.profiles,
        )
        orchestrator.quantise(args.url)

        # Cleanup prompt
        logger.info("Cleaning up...")
        if _confirm_cleanup():
            shutil.rmtree(orchestrator.work_dir)
            logger.info("Cleanup complete")
        else:
            logger.info(f"Working files kept in: {orchestrator.work_dir}")

    except Exception as e:
        # Top-level boundary: report the failure and signal it via exit status.
        logger.error(f"Error: {e}")
        sys.exit(1)


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()