Switch to llama-cpp-python

Tom Foster 2025-08-08 21:40:15 +01:00
parent ef7df1a8c3
commit d937f2d5fa
25 changed files with 2957 additions and 1181 deletions

@@ -1,10 +1,10 @@
 #!/usr/bin/env python3
-"""Bartowski Quantisation Script for advanced GGUF model processing.
+"""Advanced Quantisation Script for GGUF model processing.

-Implements a sophisticated quantisation pipeline supporting Q4_K_M, Q4_K_L,
-Q4_K_XL, and Q4_K_XXL methods with tensor-level precision control. Features
-parallel processing, status tracking, automatic README generation, and
-HuggingFace integration for streamlined model distribution workflows.
+Implements a sophisticated quantisation pipeline supporting Q4_K_M, Q4_K_L, Q4_K_XL and custom
+profiles with tensor-level precision control. Features parallel processing, status tracking,
+automatic README generation, and HuggingFace integration for streamlined model distribution
+workflows.

 Usage: python quantise.py <huggingface_url>
 """
@@ -28,45 +28,38 @@ def main() -> None:
     to quantised GGUF files with optional HuggingFace upload and cleanup.
     """
     parser = argparse.ArgumentParser(
-        description="Bartowski Quantisation Script - Supports Q4_K_M, Q4_K_L, Q4_K_XL, Q4_K_XXL",
+        description=(
+            "GGUF model quantisation tool supporting Q2-Q8 formats including K-quants, "
+            "legacy formats, and Bartowski method variants with tensor-specific precision "
+            "for embeddings and output layers."
+        ),
         formatter_class=argparse.RawDescriptionHelpFormatter,
         epilog="""
 Examples:
-  python quantise.py https://huggingface.co/DavidAU/Gemma-3-4b-it-Uncensored-DBL-X
-  python quantise.py hf.co/DavidAU/Gemma-3-it-4B-Uncensored-DBL-X-GGUF:F16
+  uv run quantise.py https://huggingface.co/MyUser/SafeTensorModelRepo
+  uv run quantise.py hf.co/MyUser/Model-Repo-GGUF:F16
 """,
     )
     parser.add_argument("url", help="HuggingFace model URL")
-    parser.add_argument(
-        "--work-dir", type=Path, help="Working directory (default: ./quantisation_work)"
-    )
+    parser.add_argument("--work-dir", type=Path, help="Working directory (default: ./work)")
     parser.add_argument(
         "--no-imatrix",
         action="store_true",
-        help="Skip imatrix generation (faster but lower quality)",
-    )
-    parser.add_argument(
-        "--imatrix-base",
-        choices=[
-            "Q2_K",
-            "Q3_K_L",
-            "Q3_K_M",
-            "Q3_K_S",
-            "Q4_K_S",
-            "Q4_K_M",
-            "Q5_K_S",
-            "Q5_K_M",
-            "Q6_K",
-            "Q8_0",
-        ],
-        default="Q4_K_M",
-        help="Base quantisation for imatrix generation",
+        help="Skip checking for imatrix files (faster but lower quality)",
     )
     parser.add_argument(
         "--no-upload",
         action="store_true",
         help="Skip uploading to HuggingFace (local testing only)",
     )
+    parser.add_argument(
+        "--profiles",
+        nargs="*",
+        help=(
+            "Quantisation profiles to use "
+            "(default: Q3_K_M Q3_K_L Q3_K_XL Q4_K_M Q4_K_L Q5_K_M Q6_K Q6_K_L Q8_0)"
+        ),
+    )
     args = parser.parse_args()
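
Profile names passed via --profiles must ultimately resolve to llama.cpp ftype constants before quantisation. A hypothetical resolver sketch: the PROFILE_FTYPES table and resolve_profile helper are illustrative and not from this repository, though the LLAMA_FTYPE_MOSTLY_* constants are real llama-cpp-python bindings. The custom L/XL variants would share a base ftype and apply per-tensor overrides on top:

import llama_cpp

# Hypothetical mapping from CLI profile names to base ftypes; the custom
# L/XL profiles reuse a base ftype and differ only in per-tensor overrides.
PROFILE_FTYPES = {
    "Q3_K_M": llama_cpp.LLAMA_FTYPE_MOSTLY_Q3_K_M,
    "Q3_K_L": llama_cpp.LLAMA_FTYPE_MOSTLY_Q3_K_L,
    "Q4_K_M": llama_cpp.LLAMA_FTYPE_MOSTLY_Q4_K_M,
    "Q5_K_M": llama_cpp.LLAMA_FTYPE_MOSTLY_Q5_K_M,
    "Q6_K": llama_cpp.LLAMA_FTYPE_MOSTLY_Q6_K,
    "Q8_0": llama_cpp.LLAMA_FTYPE_MOSTLY_Q8_0,
}


def resolve_profile(name: str) -> int:
    """Resolve a CLI profile name to an ftype, failing fast on unknown names."""
    try:
        return PROFILE_FTYPES[name.upper()]
    except KeyError as exc:
        msg = f"unknown quantisation profile: {name!r}"
        raise ValueError(msg) from exc
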
@@ -76,10 +69,10 @@ Examples:
     try:
         orchestrator = QuantisationOrchestrator(
-            work_dir=args.work_dir or Path.cwd() / "quantisation_work",
+            work_dir=args.work_dir or Path.cwd() / "work",
             use_imatrix=not args.no_imatrix,
-            imatrix_base=args.imatrix_base,
             no_upload=args.no_upload,
+            custom_profiles=args.profiles,
         )
         orchestrator.quantise(args.url)
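
For local testing without the CLI, the orchestrator can be driven directly with the same keyword arguments main() passes. Everything below uses names visible in this diff except the import path, which is an assumption:

from pathlib import Path

from quantise import QuantisationOrchestrator  # import path is an assumption

orchestrator = QuantisationOrchestrator(
    work_dir=Path.cwd() / "work",        # mirrors the new default
    use_imatrix=True,                    # keep imatrix checks for quality
    no_upload=True,                      # local testing only, skip HuggingFace
    custom_profiles=["Q4_K_M", "Q8_0"],  # subset of the default profile list
)
orchestrator.quantise("https://huggingface.co/MyUser/SafeTensorModelRepo")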