Switch to llama-cpp-python

Tom Foster 2025-08-08 21:40:15 +01:00
parent ef7df1a8c3
commit d937f2d5fa
25 changed files with 2957 additions and 1181 deletions

@@ -1,10 +1,10 @@
 #!/usr/bin/env python3
-"""Bartowski Quantisation Script for advanced GGUF model processing.
+"""Advanced Quantisation Script for GGUF model processing.

-Implements a sophisticated quantisation pipeline supporting Q4_K_M, Q4_K_L,
-Q4_K_XL, and Q4_K_XXL methods with tensor-level precision control. Features
-parallel processing, status tracking, automatic README generation, and
-HuggingFace integration for streamlined model distribution workflows.
+Implements a sophisticated quantisation pipeline supporting Q4_K_M, Q4_K_L, Q4_K_XL and custom
+profiles with tensor-level precision control. Features parallel processing, status tracking,
+automatic README generation, and HuggingFace integration for streamlined model distribution
+workflows.

 Usage: python quantise.py <huggingface_url>
 """
@@ -28,45 +28,38 @@ def main() -> None:
     to quantised GGUF files with optional HuggingFace upload and cleanup.
     """
     parser = argparse.ArgumentParser(
-        description="Bartowski Quantisation Script - Supports Q4_K_M, Q4_K_L, Q4_K_XL, Q4_K_XXL",
+        description=(
+            "GGUF model quantisation tool supporting Q2-Q8 formats including K-quants, "
+            "legacy formats, and Bartowski method variants with tensor-specific precision "
+            "for embeddings and output layers."
+        ),
         formatter_class=argparse.RawDescriptionHelpFormatter,
         epilog="""
 Examples:
-  python quantise.py https://huggingface.co/DavidAU/Gemma-3-4b-it-Uncensored-DBL-X
-  python quantise.py hf.co/DavidAU/Gemma-3-it-4B-Uncensored-DBL-X-GGUF:F16
+  uv run quantise.py https://huggingface.co/MyUser/SafeTensorModelRepo
+  uv run quantise.py hf.co/MyUser/Model-Repo-GGUF:F16
 """,
     )
     parser.add_argument("url", help="HuggingFace model URL")
-    parser.add_argument(
-        "--work-dir", type=Path, help="Working directory (default: ./quantisation_work)"
-    )
+    parser.add_argument("--work-dir", type=Path, help="Working directory (default: ./work)")
     parser.add_argument(
         "--no-imatrix",
         action="store_true",
-        help="Skip imatrix generation (faster but lower quality)",
-    )
-    parser.add_argument(
-        "--imatrix-base",
-        choices=[
-            "Q2_K",
-            "Q3_K_L",
-            "Q3_K_M",
-            "Q3_K_S",
-            "Q4_K_S",
-            "Q4_K_M",
-            "Q5_K_S",
-            "Q5_K_M",
-            "Q6_K",
-            "Q8_0",
-        ],
-        default="Q4_K_M",
-        help="Base quantisation for imatrix generation",
+        help="Skip checking for imatrix files (faster but lower quality)",
     )
     parser.add_argument(
         "--no-upload",
         action="store_true",
         help="Skip uploading to HuggingFace (local testing only)",
     )
+    parser.add_argument(
+        "--profiles",
+        nargs="*",
+        help=(
+            "Quantisation profiles to use "
+            "(default: Q3_K_M Q3_K_L Q3_K_XL Q4_K_M Q4_K_L Q5_K_M Q6_K Q6_K_L Q8_0)"
+        ),
+    )
     args = parser.parse_args()
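
Profile names passed via --profiles must ultimately resolve to llama.cpp ftype constants before quantisation. A hypothetical resolver sketch: the PROFILE_FTYPES table and resolve_profile helper are illustrative and not from this repository, though the LLAMA_FTYPE_MOSTLY_* constants are real llama-cpp-python bindings. The custom L/XL variants would share a base ftype and apply per-tensor overrides on top:

import llama_cpp

# Hypothetical mapping from CLI profile names to base ftypes; the custom
# L/XL profiles reuse a base ftype and differ only in per-tensor overrides.
PROFILE_FTYPES = {
    "Q3_K_M": llama_cpp.LLAMA_FTYPE_MOSTLY_Q3_K_M,
    "Q3_K_L": llama_cpp.LLAMA_FTYPE_MOSTLY_Q3_K_L,
    "Q4_K_M": llama_cpp.LLAMA_FTYPE_MOSTLY_Q4_K_M,
    "Q5_K_M": llama_cpp.LLAMA_FTYPE_MOSTLY_Q5_K_M,
    "Q6_K": llama_cpp.LLAMA_FTYPE_MOSTLY_Q6_K,
    "Q8_0": llama_cpp.LLAMA_FTYPE_MOSTLY_Q8_0,
}


def resolve_profile(name: str) -> int:
    """Resolve a CLI profile name to an ftype, failing fast on unknown names."""
    try:
        return PROFILE_FTYPES[name.upper()]
    except KeyError as exc:
        msg = f"unknown quantisation profile: {name!r}"
        raise ValueError(msg) from exc
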
@@ -76,10 +69,10 @@ Examples:
     try:
         orchestrator = QuantisationOrchestrator(
-            work_dir=args.work_dir or Path.cwd() / "quantisation_work",
+            work_dir=args.work_dir or Path.cwd() / "work",
             use_imatrix=not args.no_imatrix,
-            imatrix_base=args.imatrix_base,
             no_upload=args.no_upload,
+            custom_profiles=args.profiles,
         )
         orchestrator.quantise(args.url)
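
For local testing without the CLI, the orchestrator can be driven directly with the same keyword arguments main() passes. Everything below uses names visible in this diff except the import path, which is an assumption:

from pathlib import Path

from quantise import QuantisationOrchestrator  # import path is an assumption

orchestrator = QuantisationOrchestrator(
    work_dir=Path.cwd() / "work",        # mirrors the new default
    use_imatrix=True,                    # keep imatrix checks for quality
    no_upload=True,                      # local testing only, skip HuggingFace
    custom_profiles=["Q4_K_M", "Q8_0"],  # subset of the default profile list
)
orchestrator.quantise("https://huggingface.co/MyUser/SafeTensorModelRepo")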