From d937f2d5fa6dbe7d1be34ebc083fc550525bfa8b457014d5f594b02e2f9b2a7b Mon Sep 17 00:00:00 2001 From: Tom Foster Date: Fri, 8 Aug 2025 21:40:15 +0100 Subject: [PATCH] Switch to llama-cpp-python --- .gitignore | 9 + README.md | 68 +-- docs/bartowski_analysis.md | 127 +++++ docs/development.md | 126 +++-- docs/imatrix_data.md | 115 ++++ docs/quantise_gguf.md | 193 ++++--- docs/safetensors2gguf.md | 272 ++++++--- helpers/config/__init__.py | 4 +- helpers/config/quantisation_configs.py | 247 +++++--- helpers/logger.py | 9 +- helpers/models/__init__.py | 30 - helpers/models/quantisation.py | 250 ++++++-- helpers/services/__init__.py | 14 - helpers/services/gguf.py | 34 +- helpers/services/huggingface.py | 251 ++++++-- helpers/services/llama_cpp.py | 402 ++----------- helpers/services/llama_python.py | 756 +++++++++++++++++++++++++ helpers/services/orchestrator.py | 335 +++++++++-- helpers/services/quantisation.py | 549 ++++++++++++------ helpers/utils/__init__.py | 11 - helpers/utils/config_parser.py | 26 +- helpers/utils/tensor_mapping.py | 32 +- pyproject.toml | 15 +- quantise_gguf.py | 55 +- uv.lock | 208 +++++-- 25 files changed, 2957 insertions(+), 1181 deletions(-) create mode 100644 docs/bartowski_analysis.md create mode 100644 docs/imatrix_data.md create mode 100644 helpers/services/llama_python.py diff --git a/.gitignore b/.gitignore index 641964d..933b4ec 100644 --- a/.gitignore +++ b/.gitignore @@ -38,7 +38,9 @@ coverage.xml *.cover *.py,cover .hypothesis/ +.mypy_cache/ .pytest_cache/ +.ruff_cache/ cover/ # Environments @@ -49,3 +51,10 @@ venv/ ENV/ env.bak/ venv.bak/ + +# AI Clients +.claude/ + +# Working directories +work/ +quantisation_work/ diff --git a/README.md b/README.md index f83e970..c65c34a 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,13 @@ # 🤖 LLM GGUF Tools -A collection of Python tools for converting and quantising language models to -[GGUF format](https://github.com/ggerganov/ggml/blob/master/docs/gguf.md), featuring advanced -quantisation methods and direct SafeTensors conversion capabilities. +Python tools for transforming language models into optimised +[GGUF format](https://github.com/ggerganov/ggml/blob/master/docs/gguf.md) using proven quantisation +strategies. Based on analysis of community patterns, these tools replicate Bartowski's acclaimed +quantisation profiles whilst handling edge cases that break naive conversion approaches. + +The project bridges the gap between HuggingFace's SafeTensors ecosystem and llama.cpp's GGUF +inference engine, with particular focus on models that fall outside llama.cpp's supported +architecture list. > 💡 **Looking for quantised models?** Check out [tcpipuk's HuggingFace profile](https://huggingface.co/tcpipuk) > for models quantised using these tools! @@ -11,48 +16,45 @@ quantisation methods and direct SafeTensors conversion capabilities. 
| Tool | Purpose | Documentation | |------|---------|---------------| -| [quantise_gguf.py](./quantise_gguf.py) | ⚡ GGUF quantisation using a variant of [Bartowski's method](https://huggingface.co/bartowski) | [📖 Docs](docs/quantise_gguf.md) | -| [safetensors2gguf.py](./safetensors2gguf.py) | 🔄 Direct SafeTensors to GGUF conversion | [📖 Docs](docs/safetensors2gguf.md) | +| [quantise_gguf.py](./quantise_gguf.py) | Advanced GGUF quantisation with Bartowski's proven profiles (Q3_K-Q6_K variants) | [📖 Docs](docs/quantise_gguf.md) • [🔬 Analysis](docs/bartowski_analysis.md) | +| [safetensors2gguf.py](./safetensors2gguf.py) | Direct SafeTensors conversion for unsupported architectures | [📖 Docs](docs/safetensors2gguf.md) | -## Installation +## Quick Start -1. You need [`uv`](https://docs.astral.sh/uv/) for the dependencies: +The project uses [`uv`](https://docs.astral.sh/uv/) for Rust-fast dependency management with +automatic Python version handling: - ```bash - # Install uv (see https://docs.astral.sh/uv/#installation for more options) - curl -LsSf https://astral.sh/uv/install.sh | sh +```bash +# Install uv (or update existing: uv self update) +curl -LsSf https://astral.sh/uv/install.sh | sh - # Or update your existing instance - uv self update - ``` +# Clone and set up the project +git clone https://git.tomfos.tr/tom/llm-gguf-tools.git +cd llm-gguf-tools +uv sync # Installs llama-cpp-python with CUDA support if available -2. Then to set up the environment for these scripts: +# Generate HuggingFace token for uploads (optional) +# Visit https://huggingface.co/settings/tokens +export HF_TOKEN=your_token_here +``` - ```bash - # Clone the repository - git clone https://git.tomfos.tr/tom/llm-gguf-tools.git - cd llm-gguf-tools +Then quantise any HuggingFace model: - # Set up virtual environment and install dependencies - uv sync - ``` +```bash +# Fire-and-forget quantisation with automatic upload +uv run quantise_gguf.py https://huggingface.co/meta-llama/Llama-3.2-1B -## Requirements +# Or convert unsupported architectures directly +uv run safetensors2gguf.py ./path/to/model +``` -- **For quantisation**: [llama.cpp](https://github.com/ggerganov/llama.cpp) binaries - (`llama-quantize`, `llama-cli`, `llama-imatrix`) -- **For BFloat16 models**: PyTorch (optional, auto-detected) -- **For uploads**: HuggingFace API token (set `HF_TOKEN` environment variable) +For importance matrix (imatrix) data and calibration techniques, see the +[📖 IMatrix Data Guide](docs/imatrix_data.md). ## Development -For development setup and contribution guidelines, see [📖 Development Guide](docs/development.md). - -## Notes - -The `resources/imatrix_data.txt` file contains importance matrix calibration data from -[Bartowski's Gist](https://gist.github.com/bartowski1182/eb213dccb3571f863da82e99418f81e8), -based on calibration data provided by Dampf, building upon Kalomaze's foundational work. +Contributions welcome for pragmatic solutions. See [📖 Development Guide](docs/development.md) +for setup, standards, and architectural decisions. ## License diff --git a/docs/bartowski_analysis.md b/docs/bartowski_analysis.md new file mode 100644 index 0000000..53dbd63 --- /dev/null +++ b/docs/bartowski_analysis.md @@ -0,0 +1,127 @@ +# Bartowski Quantisation Analysis + +Analysis of Bartowski GGUF files reveals why these models work so well: the "M" variants don't +apply uniform quantisation as their names suggest. + +1. [The Hidden Sophistication of M Variants](#the-hidden-sophistication-of-m-variants) +2. 
[The Complete Quantisation Map](#the-complete-quantisation-map) +3. [The Architecture of Intelligence](#the-architecture-of-intelligence) +4. [The Economics of Enhancement](#the-economics-of-enhancement) +5. [Why Q3\_K Gets Special Treatment](#why-q3_k-gets-special-treatment) +6. [Implementation Insights](#implementation-insights) +7. [The Deeper Pattern](#the-deeper-pattern) + +## The Hidden Sophistication of M Variants + +When creating a Q4_K_M model, llama.cpp doesn't apply Q4_K throughout. Instead, it strategically +enhances critical components – embeddings jump to Q6_K, attention V layers get Q6_K, and FFN down +projections receive the same treatment. This represents years of empirical optimisation baked +directly into the quantisation logic. + +The L and XL models make surgical adjustments to an already-optimised foundation. Q4_K_L simply +takes the enhanced Q4_K_M and upgrades embeddings from Q6_K to Q8_0. This explains why file size +increases are modest relative to quality gains. + +## The Complete Quantisation Map + +Here's what's actually happening inside these models, based on analysis of real GGUF files: + +| Variant | Embed | Output | Q | K | V | Gate | Up | Down | +|----------|-------|--------|-------|-------|-------|-------|-------|-------| +| Q3_K_M | Q6_K | Q4_K | Q3_K | Q3_K | Q5_K | Q3_K | Q3_K | Q5_K | +| Q3_K_L | Q6_K | Q5_K | Q3_K | Q3_K | Q5_K | Q3_K | Q3_K | Q5_K | +| Q3_K_XL | Q8_0 | Q5_K | Q3_K | Q3_K | Q5_K | Q3_K | Q3_K | Q5_K | +| Q4_K_M | Q6_K | Q4_K | Q4_K | Q4_K | Q6_K | Q4_K | Q4_K | Q6_K | +| Q4_K_L | Q8_0 | Q4_K | Q4_K | Q4_K | Q6_K | Q4_K | Q4_K | Q6_K | +| Q5_K_M | Q6_K | Q5_K | Q5_K | Q5_K | Q6_K | Q5_K | Q5_K | Q6_K | +| Q5_K_L | Q8_0 | Q5_K | Q5_K | Q5_K | Q6_K | Q5_K | Q5_K | Q6_K | +| Q6_K_L | Q8_0 | Q6_K | Q6_K | Q6_K | Q6_K | Q6_K | Q6_K | Q6_K | + +Key patterns: M variants boost embeddings to Q6_K, enhance attention V layers (Q3→Q5, Q4/Q5→Q6), +and upgrade FFN down projections. L variants change just embeddings or output. Only Q3_K has an XL +variant as it has room for both improvements without competing with the next tier. + +## The Architecture of Intelligence + +Using a Qwen3 4B model as reference: embeddings comprise just 9.7% of parameters (389M, 0.78GB at +F16) yet fundamentally determine vocabulary understanding. Poor embedding quantisation prevents the +model from distinguishing similar tokens. Upgrading from Q4 to Q8 adds only 0.17GB but dramatically +improves handling of technical terms and rare words. + +Attention (Q, K, V) accounts for 14.1% of parameters (566M, 1.13GB). Value vectors (V) are critical +– they're what the model retrieves when attending to context. M variants enhance V layers whilst +leaving Q and K at base quantisation for better information retrieval without excessive size increase. + +Feed-forward network trade-offs: Gate and up projections (44.6% of parameters, 1,793M, 3.59GB) +stay at base quantisation as enhancement would double file sizes for modest gains. Down projections +(22.3%, 897M, 1.79GB) get enhanced in M variants as they're the final transformation affecting all +downstream processing. + +The output layer (9.4% of parameters, 378M, 0.75GB) determines final token predictions. Q3_K_L +targets it for enhancement as improved output precision can mean the difference between coherent +and garbled text for Q3-based models. + +## The Economics of Enhancement + +Q4_K_M at 2.26GB already includes strategic Q6_K enhancements. 
The L variant adds just 0.44GB (19% +increase) by upgrading only embeddings to Q8_0, leveraging existing enhancements whilst maximising +vocabulary understanding. A naive approach of upgrading everything would add gigabytes for marginal +improvements. + +Bartowski's popularity stems from carefully chosen points in the size-quality space. Each variant +represents a local optimum – better quality requires jumping tiers, smaller size sacrifices key +capabilities. + +## Why Q3_K Gets Special Treatment + +Q3_K uniquely has an XL variant because it starts from the lowest practical quantisation with room +for improvement. The progression from Q3_K_M (1.5GB) through L (1.6GB) to XL (1.8GB) provides +granular control for memory-constrained environments, with each 15-20% size increase delivering +meaningful quality improvements. + +Q4_K_XL or Q5_K_XL don't exist because they'd compete with the next tier. A hypothetical Q4_K_XL +at 2.75GB would match Q5_K_M's size, but Q5_K_M's superior base quantisation provides better +quality than selectively enhanced Q4_K layers. + +The pattern is consistent: significant enhancements to Q5_K or Q6_K mean you should jump to the +next base type. Sweet spots: Q3 family for extreme memory constraints, Q4/Q5 for mainstream use, +Q6/Q8 when quality matters more than size. + +## Implementation Insights + +Since llama.cpp's M variants already include sophisticated enhancements, replicating Bartowski's +variants requires minimal configuration: + +```python +# Q3_K_L: Only upgrade output from M baseline +config = { + "base": "Q3_K_M", # Inherits Q6_K embeddings, Q5_K V/FFN-down + "output": "Q5_K" # Single surgical change +} + +# Q4_K_L: Only upgrade embeddings from M baseline +config = { + "base": "Q4_K_M", # Inherits Q6_K V/FFN-down + "embeddings": "Q8_0" # Single surgical change +} + +# Q3_K_XL: The only variant needing two changes +config = { + "base": "Q3_K_M", + "embeddings": "Q8_0", + "output": "Q5_K" +} +``` + +This minimalist approach recognises that M variants already embody years of community optimisation. +Bartowski's contribution lies in identifying where small adjustments yield outsized returns. + +## The Deeper Pattern + +This system evolved through countless experiments rather than top-down design. M variants encode +hard-won knowledge about critical layers. L variants build on this foundation. The absence of most +XL variants shows where diminishing returns set in. + +Bartowski's quantisations work because they embody years of collective learning about what matters +in practice. They demonstrate that the best solutions often come from understanding and building +upon what already works, rather than grand redesigns. diff --git a/docs/development.md b/docs/development.md index b6ad707..74431ca 100644 --- a/docs/development.md +++ b/docs/development.md @@ -1,86 +1,136 @@ # Development Guide -This guide covers development setup, code quality standards, and project structure for contributors. +Contributing to GGUF tools requires understanding quantisation workflows and Python's modern +dependency ecosystem. This guide covers setup, standards, and architectural decisions for fixing +bugs, adding quantisation profiles, or extending conversion capabilities. ## Code Quality +Ruff replaces the traditional Black/isort/flake8 stack as both linter and formatter. Mypy provides +static type checking to catch type-related bugs before runtime. Zero tolerance for linting and type +errors catches issues early. 
Both tools have extensive configuration in `pyproject.toml` to enforce +only the important code quality standards we've selected. Debug logging reveals quantisation internals +when models fail. + ```bash -# Run linting -uv run ruff check +# Run linting - catches style violations, potential bugs, and code smells +uvx ruff check -# Format code -uv run ruff format +# Format code - enforces consistent style automatically +uvx ruff format -# Run with debug logging +# Run type checking - ensures type safety and catches potential bugs +uv run mypy . + +# Run with debug logging - reveals conversion steps and tensor processing DEBUG=true uv run
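+
+# Illustrative example only (an assumption, not taken from the project docs): combining the
+# DEBUG flag above with the quantisation CLI invocation shown in the README
+DEBUG=true uv run quantise_gguf.py https://huggingface.co/meta-llama/Llama-3.2-1B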