llm-gguf-tools/quantise_gguf.py

94 lines
3 KiB
Python

#!/usr/bin/env python3
"""Advanced Quantisation Script for GGUF model processing.
Implements a sophisticated quantisation pipeline supporting Q4_K_M, Q4_K_L, Q4_K_XL and custom
profiles with tensor-level precision control. Features parallel processing, status tracking,
automatic README generation, and HuggingFace integration for streamlined model distribution
workflows.
Usage: python quantise_gguf.py <huggingface_url>
"""
from __future__ import annotations
import argparse
import shutil
import sys
from pathlib import Path
from helpers.logger import logger
from helpers.services.orchestrator import QuantisationOrchestrator
def _build_parser() -> argparse.ArgumentParser:
    """Build the argument parser for the quantisation command-line tool."""
    parser = argparse.ArgumentParser(
        description=(
            "GGUF model quantisation tool supporting Q2-Q8 formats including K-quants, "
            "legacy formats, and Bartowski method variants with tensor-specific precision "
            "for embeddings and output layers."
        ),
        # Raw formatter keeps the line breaks of the examples in the epilog.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=(
            "\n"
            "Examples:\n"
            "uv run quantise.py https://huggingface.co/MyUser/SafeTensorModelRepo\n"
            "uv run quantise.py hf.co/MyUser/Model-Repo-GGUF:F16\n"
        ),
    )
    parser.add_argument("url", help="HuggingFace model URL")
    parser.add_argument("--work-dir", type=Path, help="Working directory (default: ./work)")
    parser.add_argument(
        "--no-imatrix",
        action="store_true",
        help="Skip checking for imatrix files (faster but lower quality)",
    )
    parser.add_argument(
        "--no-upload",
        action="store_true",
        help="Skip uploading to HuggingFace (local testing only)",
    )
    parser.add_argument(
        "--profiles",
        nargs="*",
        help=(
            "Quantisation profiles to use "
            "(default: Q3_K_M Q3_K_L Q3_K_XL Q4_K_M Q4_K_L Q5_K_M Q6_K Q6_K_L Q8_0)"
        ),
    )
    return parser


def _cleanup_requested() -> bool:
    """Ask on stdin whether the working files should be deleted.

    Returns:
        True only when the user answers "y" (case-insensitive, surrounding
        whitespace ignored). Any other answer, or a closed stdin, means "no".
    """
    try:
        answer = input("Delete working files? (y/N): ")
    except EOFError:
        # Non-interactive run (closed or exhausted stdin): fall back to the
        # prompt's default "N" rather than reporting a successful run as failed.
        return False
    return answer.strip().lower() == "y"


def main() -> None:
    """Run the quantisation command-line workflow.

    Parses the command-line arguments, constructs a QuantisationOrchestrator
    from them and calls its ``quantise`` method with the given URL. Afterwards
    the user is asked whether the orchestrator's working directory should be
    deleted; when stdin is not available the files are kept.

    Exits with status 1 when the URL argument is empty or when any exception
    is raised while quantising or cleaning up (the error is logged first).
    """
    parser = _build_parser()
    args = parser.parse_args()

    # argparse already rejects a missing URL; this guards against an empty string.
    if not args.url:
        parser.print_help()
        sys.exit(1)

    try:
        orchestrator = QuantisationOrchestrator(
            work_dir=args.work_dir or Path.cwd() / "work",
            use_imatrix=not args.no_imatrix,
            no_upload=args.no_upload,
            custom_profiles=args.profiles,
        )
        orchestrator.quantise(args.url)

        # Cleanup prompt
        logger.info("Cleaning up...")
        if _cleanup_requested():
            shutil.rmtree(orchestrator.work_dir)
            logger.info("Cleanup complete")
        else:
            logger.info(f"Working files kept in: {orchestrator.work_dir}")
    except Exception as e:
        # Top-level boundary: report the failure and signal it via the exit code.
        logger.error(f"Error: {e}")
        sys.exit(1)
# Script entry guard: run the CLI only when this file is executed directly,
# so importing the module has no side effects.
if __name__ == "__main__":
    main()