From d937f2d5fa6dbe7d1be34ebc083fc550525bfa8b457014d5f594b02e2f9b2a7b Mon Sep 17 00:00:00 2001 From: Tom Foster Date: Fri, 8 Aug 2025 21:40:15 +0100 Subject: [PATCH] Switch to llama-cpp-python --- .gitignore | 9 + README.md | 68 +-- docs/bartowski_analysis.md | 127 +++++ docs/development.md | 126 +++-- docs/imatrix_data.md | 115 ++++ docs/quantise_gguf.md | 193 ++++--- docs/safetensors2gguf.md | 272 ++++++--- helpers/config/__init__.py | 4 +- helpers/config/quantisation_configs.py | 247 +++++--- helpers/logger.py | 9 +- helpers/models/__init__.py | 30 - helpers/models/quantisation.py | 250 ++++++-- helpers/services/__init__.py | 14 - helpers/services/gguf.py | 34 +- helpers/services/huggingface.py | 251 ++++++-- helpers/services/llama_cpp.py | 402 ++----------- helpers/services/llama_python.py | 756 +++++++++++++++++++++++++ helpers/services/orchestrator.py | 335 +++++++++-- helpers/services/quantisation.py | 549 ++++++++++++------ helpers/utils/__init__.py | 11 - helpers/utils/config_parser.py | 26 +- helpers/utils/tensor_mapping.py | 32 +- pyproject.toml | 15 +- quantise_gguf.py | 55 +- uv.lock | 208 +++++-- 25 files changed, 2957 insertions(+), 1181 deletions(-) create mode 100644 docs/bartowski_analysis.md create mode 100644 docs/imatrix_data.md create mode 100644 helpers/services/llama_python.py diff --git a/.gitignore b/.gitignore index 641964d..933b4ec 100644 --- a/.gitignore +++ b/.gitignore @@ -38,7 +38,9 @@ coverage.xml *.cover *.py,cover .hypothesis/ +.mypy_cache/ .pytest_cache/ +.ruff_cache/ cover/ # Environments @@ -49,3 +51,10 @@ venv/ ENV/ env.bak/ venv.bak/ + +# AI Clients +.claude/ + +# Working directories +work/ +quantisation_work/ diff --git a/README.md b/README.md index f83e970..c65c34a 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,13 @@ # 🤖 LLM GGUF Tools -A collection of Python tools for converting and quantising language models to -[GGUF format](https://github.com/ggerganov/ggml/blob/master/docs/gguf.md), featuring advanced -quantisation methods and direct SafeTensors conversion capabilities. +Python tools for transforming language models into optimised +[GGUF format](https://github.com/ggerganov/ggml/blob/master/docs/gguf.md) using proven quantisation +strategies. Based on analysis of community patterns, these tools replicate Bartowski's acclaimed +quantisation profiles whilst handling edge cases that break naive conversion approaches. + +The project bridges the gap between HuggingFace's SafeTensors ecosystem and llama.cpp's GGUF +inference engine, with particular focus on models that fall outside llama.cpp's supported +architecture list. > 💡 **Looking for quantised models?** Check out [tcpipuk's HuggingFace profile](https://huggingface.co/tcpipuk) > for models quantised using these tools! @@ -11,48 +16,45 @@ quantisation methods and direct SafeTensors conversion capabilities. 
| Tool | Purpose | Documentation | |------|---------|---------------| -| [quantise_gguf.py](./quantise_gguf.py) | ⚡ GGUF quantisation using a variant of [Bartowski's method](https://huggingface.co/bartowski) | [📖 Docs](docs/quantise_gguf.md) | -| [safetensors2gguf.py](./safetensors2gguf.py) | 🔄 Direct SafeTensors to GGUF conversion | [📖 Docs](docs/safetensors2gguf.md) | +| [quantise_gguf.py](./quantise_gguf.py) | Advanced GGUF quantisation with Bartowski's proven profiles (Q3_K-Q6_K variants) | [📖 Docs](docs/quantise_gguf.md) • [🔬 Analysis](docs/bartowski_analysis.md) | +| [safetensors2gguf.py](./safetensors2gguf.py) | Direct SafeTensors conversion for unsupported architectures | [📖 Docs](docs/safetensors2gguf.md) | -## Installation +## Quick Start -1. You need [`uv`](https://docs.astral.sh/uv/) for the dependencies: +The project uses [`uv`](https://docs.astral.sh/uv/) for Rust-fast dependency management with +automatic Python version handling: - ```bash - # Install uv (see https://docs.astral.sh/uv/#installation for more options) - curl -LsSf https://astral.sh/uv/install.sh | sh +```bash +# Install uv (or update existing: uv self update) +curl -LsSf https://astral.sh/uv/install.sh | sh - # Or update your existing instance - uv self update - ``` +# Clone and set up the project +git clone https://git.tomfos.tr/tom/llm-gguf-tools.git +cd llm-gguf-tools +uv sync # Installs llama-cpp-python with CUDA support if available -2. Then to set up the environment for these scripts: +# Generate HuggingFace token for uploads (optional) +# Visit https://huggingface.co/settings/tokens +export HF_TOKEN=your_token_here +``` - ```bash - # Clone the repository - git clone https://git.tomfos.tr/tom/llm-gguf-tools.git - cd llm-gguf-tools +Then quantise any HuggingFace model: - # Set up virtual environment and install dependencies - uv sync - ``` +```bash +# Fire-and-forget quantisation with automatic upload +uv run quantise_gguf.py https://huggingface.co/meta-llama/Llama-3.2-1B -## Requirements +# Or convert unsupported architectures directly +uv run safetensors2gguf.py ./path/to/model +``` -- **For quantisation**: [llama.cpp](https://github.com/ggerganov/llama.cpp) binaries - (`llama-quantize`, `llama-cli`, `llama-imatrix`) -- **For BFloat16 models**: PyTorch (optional, auto-detected) -- **For uploads**: HuggingFace API token (set `HF_TOKEN` environment variable) +For importance matrix (imatrix) data and calibration techniques, see the +[📖 IMatrix Data Guide](docs/imatrix_data.md). ## Development -For development setup and contribution guidelines, see [📖 Development Guide](docs/development.md). - -## Notes - -The `resources/imatrix_data.txt` file contains importance matrix calibration data from -[Bartowski's Gist](https://gist.github.com/bartowski1182/eb213dccb3571f863da82e99418f81e8), -based on calibration data provided by Dampf, building upon Kalomaze's foundational work. +Contributions welcome for pragmatic solutions. See [📖 Development Guide](docs/development.md) +for setup, standards, and architectural decisions. ## License diff --git a/docs/bartowski_analysis.md b/docs/bartowski_analysis.md new file mode 100644 index 0000000..53dbd63 --- /dev/null +++ b/docs/bartowski_analysis.md @@ -0,0 +1,127 @@ +# Bartowski Quantisation Analysis + +Analysis of Bartowski GGUF files reveals why these models work so well: the "M" variants don't +apply uniform quantisation as their names suggest. + +1. [The Hidden Sophistication of M Variants](#the-hidden-sophistication-of-m-variants) +2. 
[The Complete Quantisation Map](#the-complete-quantisation-map) +3. [The Architecture of Intelligence](#the-architecture-of-intelligence) +4. [The Economics of Enhancement](#the-economics-of-enhancement) +5. [Why Q3\_K Gets Special Treatment](#why-q3_k-gets-special-treatment) +6. [Implementation Insights](#implementation-insights) +7. [The Deeper Pattern](#the-deeper-pattern) + +## The Hidden Sophistication of M Variants + +When creating a Q4_K_M model, llama.cpp doesn't apply Q4_K throughout. Instead, it strategically +enhances critical components – embeddings jump to Q6_K, attention V layers get Q6_K, and FFN down +projections receive the same treatment. This represents years of empirical optimisation baked +directly into the quantisation logic. + +The L and XL models make surgical adjustments to an already-optimised foundation. Q4_K_L simply +takes the enhanced Q4_K_M and upgrades embeddings from Q6_K to Q8_0. This explains why file size +increases are modest relative to quality gains. + +## The Complete Quantisation Map + +Here's what's actually happening inside these models, based on analysis of real GGUF files: + +| Variant | Embed | Output | Q | K | V | Gate | Up | Down | +|----------|-------|--------|-------|-------|-------|-------|-------|-------| +| Q3_K_M | Q6_K | Q4_K | Q3_K | Q3_K | Q5_K | Q3_K | Q3_K | Q5_K | +| Q3_K_L | Q6_K | Q5_K | Q3_K | Q3_K | Q5_K | Q3_K | Q3_K | Q5_K | +| Q3_K_XL | Q8_0 | Q5_K | Q3_K | Q3_K | Q5_K | Q3_K | Q3_K | Q5_K | +| Q4_K_M | Q6_K | Q4_K | Q4_K | Q4_K | Q6_K | Q4_K | Q4_K | Q6_K | +| Q4_K_L | Q8_0 | Q4_K | Q4_K | Q4_K | Q6_K | Q4_K | Q4_K | Q6_K | +| Q5_K_M | Q6_K | Q5_K | Q5_K | Q5_K | Q6_K | Q5_K | Q5_K | Q6_K | +| Q5_K_L | Q8_0 | Q5_K | Q5_K | Q5_K | Q6_K | Q5_K | Q5_K | Q6_K | +| Q6_K_L | Q8_0 | Q6_K | Q6_K | Q6_K | Q6_K | Q6_K | Q6_K | Q6_K | + +Key patterns: M variants boost embeddings to Q6_K, enhance attention V layers (Q3→Q5, Q4/Q5→Q6), +and upgrade FFN down projections. L variants change just embeddings or output. Only Q3_K has an XL +variant as it has room for both improvements without competing with the next tier. + +## The Architecture of Intelligence + +Using a Qwen3 4B model as reference: embeddings comprise just 9.7% of parameters (389M, 0.78GB at +F16) yet fundamentally determine vocabulary understanding. Poor embedding quantisation prevents the +model from distinguishing similar tokens. Upgrading from Q4 to Q8 adds only 0.17GB but dramatically +improves handling of technical terms and rare words. + +Attention (Q, K, V) accounts for 14.1% of parameters (566M, 1.13GB). Value vectors (V) are critical +– they're what the model retrieves when attending to context. M variants enhance V layers whilst +leaving Q and K at base quantisation for better information retrieval without excessive size increase. + +Feed-forward network trade-offs: Gate and up projections (44.6% of parameters, 1,793M, 3.59GB) +stay at base quantisation as enhancement would double file sizes for modest gains. Down projections +(22.3%, 897M, 1.79GB) get enhanced in M variants as they're the final transformation affecting all +downstream processing. + +The output layer (9.4% of parameters, 378M, 0.75GB) determines final token predictions. Q3_K_L +targets it for enhancement as improved output precision can mean the difference between coherent +and garbled text for Q3-based models. + +## The Economics of Enhancement + +Q4_K_M at 2.26GB already includes strategic Q6_K enhancements. 
The L variant adds just 0.44GB (19% +increase) by upgrading only embeddings to Q8_0, leveraging existing enhancements whilst maximising +vocabulary understanding. A naive approach of upgrading everything would add gigabytes for marginal +improvements. + +Bartowski's popularity stems from carefully chosen points in the size-quality space. Each variant +represents a local optimum – better quality requires jumping tiers, smaller size sacrifices key +capabilities. + +## Why Q3_K Gets Special Treatment + +Q3_K uniquely has an XL variant because it starts from the lowest practical quantisation with room +for improvement. The progression from Q3_K_M (1.5GB) through L (1.6GB) to XL (1.8GB) provides +granular control for memory-constrained environments, with each 15-20% size increase delivering +meaningful quality improvements. + +Q4_K_XL or Q5_K_XL don't exist because they'd compete with the next tier. A hypothetical Q4_K_XL +at 2.75GB would match Q5_K_M's size, but Q5_K_M's superior base quantisation provides better +quality than selectively enhanced Q4_K layers. + +The pattern is consistent: significant enhancements to Q5_K or Q6_K mean you should jump to the +next base type. Sweet spots: Q3 family for extreme memory constraints, Q4/Q5 for mainstream use, +Q6/Q8 when quality matters more than size. + +## Implementation Insights + +Since llama.cpp's M variants already include sophisticated enhancements, replicating Bartowski's +variants requires minimal configuration: + +```python +# Q3_K_L: Only upgrade output from M baseline +config = { + "base": "Q3_K_M", # Inherits Q6_K embeddings, Q5_K V/FFN-down + "output": "Q5_K" # Single surgical change +} + +# Q4_K_L: Only upgrade embeddings from M baseline +config = { + "base": "Q4_K_M", # Inherits Q6_K V/FFN-down + "embeddings": "Q8_0" # Single surgical change +} + +# Q3_K_XL: The only variant needing two changes +config = { + "base": "Q3_K_M", + "embeddings": "Q8_0", + "output": "Q5_K" +} +``` + +This minimalist approach recognises that M variants already embody years of community optimisation. +Bartowski's contribution lies in identifying where small adjustments yield outsized returns. + +## The Deeper Pattern + +This system evolved through countless experiments rather than top-down design. M variants encode +hard-won knowledge about critical layers. L variants build on this foundation. The absence of most +XL variants shows where diminishing returns set in. + +Bartowski's quantisations work because they embody years of collective learning about what matters +in practice. They demonstrate that the best solutions often come from understanding and building +upon what already works, rather than grand redesigns. diff --git a/docs/development.md b/docs/development.md index b6ad707..74431ca 100644 --- a/docs/development.md +++ b/docs/development.md @@ -1,86 +1,136 @@ # Development Guide -This guide covers development setup, code quality standards, and project structure for contributors. +Contributing to GGUF tools requires understanding quantisation workflows and Python's modern +dependency ecosystem. This guide covers setup, standards, and architectural decisions for fixing +bugs, adding quantisation profiles, or extending conversion capabilities. ## Code Quality +Ruff replaces the traditional Black/isort/flake8 stack as both linter and formatter. Mypy provides +static type checking to catch type-related bugs before runtime. Zero tolerance for linting and type +errors catches issues early. 
Both tools have extensive configuration in `pyproject.toml` to enforce +only the important code quality standards we've selected. Debug logging reveals quantisation internals +when models fail. + ```bash -# Run linting -uv run ruff check +# Run linting - catches style violations, potential bugs, and code smells +uvx ruff check -# Format code -uv run ruff format +# Format code - enforces consistent style automatically +uvx ruff format -# Run with debug logging +# Run type checking - ensures type safety and catches potential bugs +uv run mypy . + +# Run with debug logging - reveals conversion steps and tensor processing DEBUG=true uv run
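+
+# Illustrative example only (an assumption, not taken from the project docs): combining the
+# DEBUG flag above with the quantisation CLI invocation shown in the README
+DEBUG=true uv run quantise_gguf.py https://huggingface.co/meta-llama/Llama-3.2-1B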