llm-gguf-tools/quantise_gguf.py

#!/usr/bin/env python3
"""Bartowski Quantisation Script for advanced GGUF model processing.

Implements a sophisticated quantisation pipeline supporting Q4_K_M, Q4_K_L,
Q4_K_XL, and Q4_K_XXL methods with tensor-level precision control. Features
parallel processing, status tracking, automatic README generation, and
HuggingFace integration for streamlined model distribution workflows.

Usage: python quantise_gguf.py <huggingface_url>
"""

from __future__ import annotations

import argparse
import shutil
import sys
from pathlib import Path

from helpers.logger import logger
from helpers.services.orchestrator import QuantisationOrchestrator


def _build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the quantisation script.

    Returns:
        A parser accepting the positional ``url`` plus the ``--work-dir``,
        ``--no-imatrix``, ``--imatrix-base`` and ``--no-upload`` options.
    """
    parser = argparse.ArgumentParser(
        description="Bartowski Quantisation Script - Supports Q4_K_M, Q4_K_L, Q4_K_XL, Q4_K_XXL",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        # %(prog)s is expanded by argparse, so the examples always show the
        # name the script was actually invoked with.
        epilog="""
Examples:
  %(prog)s https://huggingface.co/DavidAU/Gemma-3-4b-it-Uncensored-DBL-X
  %(prog)s hf.co/DavidAU/Gemma-3-it-4B-Uncensored-DBL-X-GGUF:F16
        """,
    )
    parser.add_argument("url", help="HuggingFace model URL")
    parser.add_argument(
        "--work-dir", type=Path, help="Working directory (default: ./quantisation_work)"
    )
    parser.add_argument(
        "--no-imatrix",
        action="store_true",
        help="Skip imatrix generation (faster but lower quality)",
    )
    parser.add_argument(
        "--imatrix-base",
        choices=[
            "Q2_K",
            "Q3_K_L",
            "Q3_K_M",
            "Q3_K_S",
            "Q4_K_S",
            "Q4_K_M",
            "Q5_K_S",
            "Q5_K_M",
            "Q6_K",
            "Q8_0",
        ],
        default="Q4_K_M",
        help="Base quantisation for imatrix generation",
    )
    parser.add_argument(
        "--no-upload",
        action="store_true",
        help="Skip uploading to HuggingFace (local testing only)",
    )
    return parser


def _confirm_cleanup() -> bool:
    """Ask whether the working files should be deleted.

    Returns:
        True only when the user answers ``y`` (case-insensitive, surrounding
        whitespace ignored). A closed or exhausted stdin counts as "no".
    """
    try:
        answer = input("Delete working files? (y/N): ")
    except EOFError:
        # Non-interactive run (CI, nohup, piped stdin): nobody can answer, so
        # fall back to the prompt's default instead of failing a finished job.
        return False
    return answer.strip().lower() == "y"


def main() -> None:
    """Main entry point for the Bartowski quantisation workflow.

    Parses command-line arguments, initialises the quantisation orchestrator,
    runs it on the given URL, and then offers to delete the working directory.

    Exits with status 1 when the URL is empty or when any step raises an
    exception; the exception message is logged before exiting.
    """
    parser = _build_parser()
    args = parser.parse_args()

    # argparse already rejects a missing positional; this guard catches an
    # explicitly empty string such as `""`.
    if not args.url:
        parser.print_help()
        sys.exit(1)

    try:
        orchestrator = QuantisationOrchestrator(
            work_dir=args.work_dir or Path.cwd() / "quantisation_work",
            use_imatrix=not args.no_imatrix,
            imatrix_base=args.imatrix_base,
            no_upload=args.no_upload,
        )
        orchestrator.quantise(args.url)

        # Cleanup prompt
        logger.info("Cleaning up...")
        if _confirm_cleanup():
            shutil.rmtree(orchestrator.work_dir)
            logger.info("Cleanup complete")
        else:
            logger.info(f"Working files kept in: {orchestrator.work_dir}")

    except Exception as e:
        # Top-level boundary: report the failure and signal it via exit status.
        logger.error(f"Error: {e}")
        sys.exit(1)


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()