Initial commit

Tom Foster 2025-08-07 18:29:12 +01:00
commit ef7df1a8c3
28 changed files with 6829 additions and 0 deletions

helpers/services/__init__.py

@@ -0,0 +1,20 @@
"""Service layer for llm-gguf-tools.
Provides high-level service interfaces for interacting with external systems
including HuggingFace, llama.cpp, and filesystem operations. Uses UK English
spelling conventions throughout.
"""
from __future__ import annotations
from helpers.services.filesystem import FilesystemService
from helpers.services.huggingface import HuggingFaceService, ReadmeGenerator
from helpers.services.llama_cpp import EnvironmentManager, IMatrixGenerator
__all__ = [
"EnvironmentManager",
"FilesystemService",
"HuggingFaceService",
"IMatrixGenerator",
"ReadmeGenerator",
]
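
Because the package root re-exports these classes, call sites can import them straight from helpers.services. A small sketch:

# Sketch only: demonstrates the package-level import surface defined above.
from pathlib import Path

from helpers.services import FilesystemService

print(FilesystemService.get_file_size(Path(__file__)))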

helpers/services/filesystem.py

@@ -0,0 +1,174 @@
"""Filesystem operations service.
Provides unified filesystem operations including file discovery, size
calculation, and path management. Consolidates common filesystem patterns
used across quantisation and conversion workflows.
"""
from __future__ import annotations
import json
import subprocess
from pathlib import Path
from typing import Any
from helpers.logger import logger
BYTES_PER_UNIT = 1024.0
class FilesystemService:
"""Handles filesystem operations with consistent error handling.
Provides methods for file discovery, size formatting, and JSON loading
with proper error handling and logging. Ensures consistent behaviour
across different tools and workflows.
"""
@staticmethod
def get_file_size(file_path: Path) -> str:
"""Get human-readable file size using system utilities.
Attempts to use `du -h` for human-readable output, falling back to
Python calculation if the system command fails. Provides consistent
size formatting across the toolset.
Returns:
Human-readable file size string (e.g., "1.5G", "750M").
"""
try:
result = subprocess.run(
["du", "-h", str(file_path)], capture_output=True, text=True, check=True
)
return result.stdout.split()[0]
except (subprocess.CalledProcessError, FileNotFoundError):
# Fallback to Python calculation
try:
size_bytes: float = float(file_path.stat().st_size)
for unit in ["B", "K", "M", "G", "T"]:
if size_bytes < BYTES_PER_UNIT:
return f"{size_bytes:.1f}{unit}"
size_bytes /= BYTES_PER_UNIT
except Exception:
return "Unknown"
else:
return f"{size_bytes:.1f}P"
@staticmethod
def load_json_config(config_path: Path) -> dict[str, Any]:
"""Load and parse JSON configuration file.
Provides consistent JSON loading with proper error handling and
encoding specification. Used for loading model configurations,
tokeniser settings, and other JSON-based metadata.
Returns:
Parsed JSON content as dictionary.
Raises:
FileNotFoundError: If config file doesn't exist.
"""
if not config_path.exists():
msg = f"Configuration file not found: {config_path}"
raise FileNotFoundError(msg)
with Path(config_path).open(encoding="utf-8") as f:
return json.load(f)
@staticmethod
def find_safetensor_files(model_path: Path) -> list[Path]:
"""Find all SafeTensor files in model directory using priority search.
Searches for tensor files in order of preference: single model.safetensors,
sharded model-*-of-*.safetensors files, then any *.safetensors files. This
approach handles both single-file and multi-shard model distributions whilst
ensuring predictable file ordering for conversion consistency.
Returns:
List of SafeTensor file paths in priority order.
Raises:
FileNotFoundError: If no SafeTensor files are found.
"""
# Check for single file
single_file = model_path / "model.safetensors"
if single_file.exists():
return [single_file]
# Check for sharded files
pattern = "model-*-of-*.safetensors"
sharded_files = sorted(model_path.glob(pattern))
if sharded_files:
return sharded_files
# Check for any safetensor files
any_files = sorted(model_path.glob("*.safetensors"))
if any_files:
return any_files
msg = f"No SafeTensor files found in {model_path}"
raise FileNotFoundError(msg)
@staticmethod
def find_gguf_files(model_path: Path, pattern: str | None = None) -> list[Path]:
"""Find GGUF files in directory, optionally filtered by pattern.
Searches for GGUF files with optional pattern matching. Prioritises
multi-part files (00001-of-*) over single files for proper handling
of large models split across multiple files.
Returns:
List of GGUF file paths, sorted with multi-part files first.
"""
if pattern:
gguf_files = list(model_path.glob(f"*{pattern}*.gguf"))
else:
gguf_files = list(model_path.glob("*.gguf"))
# Sort to prioritise 00001-of-* files
gguf_files.sort(
key=lambda x: (
"00001-of-" not in x.name, # False sorts before True
x.name,
)
)
return gguf_files
@staticmethod
def ensure_directory(path: Path) -> Path:
"""Ensure directory exists, creating if necessary.
Creates directory and all parent directories if they don't exist.
Returns the path for method chaining convenience.
Returns:
The directory path.
"""
path.mkdir(parents=True, exist_ok=True)
return path
@staticmethod
def cleanup_directory(path: Path, pattern: str = "*") -> int:
"""Remove files matching pattern from directory.
Safely removes files matching the specified glob pattern. Returns
count of files removed for logging purposes.
Returns:
Number of files removed.
"""
if not path.exists():
return 0
files_removed = 0
for file_path in path.glob(pattern):
if file_path.is_file():
try:
file_path.unlink()
files_removed += 1
except Exception as e:
logger.warning(f"Failed to remove {file_path}: {e}")
return files_removed
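
A minimal usage sketch of FilesystemService; the model directory below is a placeholder path:

# Sketch only: "./work/models/example-model" is a placeholder, not a real model.
from pathlib import Path

from helpers.services.filesystem import FilesystemService

fs = FilesystemService()
model_dir = fs.ensure_directory(Path("./work/models/example-model"))

try:
    shards = fs.find_safetensor_files(model_dir)
    for shard in shards:
        print(shard.name, fs.get_file_size(shard))
except FileNotFoundError:
    print("No SafeTensor files downloaded yet")

removed = fs.cleanup_directory(model_dir, pattern="*.tmp")
print(f"Removed {removed} temporary file(s)")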

helpers/services/gguf.py

@@ -0,0 +1,210 @@
"""GGUF file operations service.
Provides unified interface for creating, writing, and manipulating GGUF files.
Consolidates GGUF-specific operations from conversion and quantisation workflows.
Uses UK English spelling conventions throughout.
"""
from __future__ import annotations
from typing import TYPE_CHECKING, Any
import gguf
import torch
from safetensors import safe_open
from helpers.logger import logger
from helpers.services.filesystem import FilesystemService
from helpers.utils.config_parser import ConfigParser
if TYPE_CHECKING:
from pathlib import Path
import numpy as np
from helpers.models.conversion import ModelConfig
class GGUFWriter:
"""Manages GGUF file creation and metadata writing.
Provides high-level interface for GGUF file operations including metadata
configuration, tensor addition, and tokeniser integration. Encapsulates
low-level GGUF library interactions for consistent error handling.
"""
def __init__(self, output_path: Path, architecture: str) -> None:
"""Initialise GGUF writer with output path and architecture.
Creates the underlying GGUF writer instance and prepares for metadata
and tensor addition. Sets up the file structure for the specified
model architecture.
"""
self.output_path = output_path
self.architecture = architecture
self.writer = gguf.GGUFWriter(str(output_path), architecture)
logger.info(f"Created GGUF writer for {architecture} architecture")
def add_metadata(self, model_config: ModelConfig, model_name: str) -> None:
"""Add comprehensive metadata from model configuration.
Writes general model information, architectural parameters, and
quantisation settings to the GGUF file header. Handles both standard
and vision model configurations with appropriate parameter mapping.
"""
# General metadata
self.writer.add_name(model_name)
self.writer.add_description(f"Converted from {model_config.architectures[0]}")
self.writer.add_file_type(gguf.LlamaFileType.ALL_F32)
# Model parameters from config
params = model_config.to_gguf_params()
self.writer.add_context_length(params.context_length)
self.writer.add_embedding_length(params.embedding_length)
self.writer.add_block_count(params.block_count)
self.writer.add_feed_forward_length(params.feed_forward_length)
self.writer.add_head_count(params.attention_head_count)
self.writer.add_head_count_kv(params.attention_head_count_kv)
self.writer.add_layer_norm_rms_eps(params.attention_layer_norm_rms_epsilon)
self.writer.add_rope_freq_base(params.rope_freq_base)
self.writer.add_rope_dimension_count(params.rope_dimension_count)
logger.info(f"Added metadata: {params.block_count} layers, {params.context_length} context")
def add_vision_metadata(self, vision_config: Any) -> None:
"""Add vision model parameters to GGUF metadata.
Configures vision-specific parameters for multimodal models including
embedding dimensions, attention heads, and spatial processing settings.
"""
if not vision_config:
return
logger.info("Adding vision model parameters...")
self.writer.add_vision_embedding_length(vision_config.hidden_size)
self.writer.add_vision_block_count(vision_config.num_hidden_layers)
self.writer.add_vision_head_count(vision_config.num_attention_heads)
self.writer.add_vision_feed_forward_length(vision_config.intermediate_size)
self.writer.add_vision_patch_size(vision_config.patch_size)
self.writer.add_vision_spatial_merge_size(vision_config.spatial_merge_size)
if hasattr(vision_config, "rms_norm_eps") and vision_config.rms_norm_eps:
self.writer.add_vision_attention_layernorm_eps(vision_config.rms_norm_eps)
def add_tokeniser(self, tokeniser_config: dict[str, Any]) -> None:
"""Add tokeniser metadata to GGUF file.
Writes special token IDs and tokeniser model type to enable proper
text processing during inference. Uses sensible defaults for missing
configuration values.
"""
self.writer.add_bos_token_id(tokeniser_config.get("bos_token_id", 1))
self.writer.add_eos_token_id(tokeniser_config.get("eos_token_id", 2))
self.writer.add_unk_token_id(tokeniser_config.get("unk_token_id", 0))
self.writer.add_pad_token_id(tokeniser_config.get("pad_token_id", 0))
self.writer.add_tokenizer_model(tokeniser_config.get("model_type", "llama"))
logger.info("Added tokeniser configuration")
def add_tensor(self, name: str, data: np.ndarray) -> None:
"""Add a tensor to the GGUF file.
Writes tensor data with the specified name to the file. Handles
data type conversions and validates tensor shapes.
"""
self.writer.add_tensor(name, data)
def finalise(self) -> None:
"""Write all data to file and close writer.
Completes the GGUF file creation by writing headers, key-value data,
and tensor data in the correct order. Ensures proper file closure.
"""
logger.info(f"Writing GGUF file to {self.output_path}")
self.writer.write_header_to_file()
self.writer.write_kv_data_to_file()
self.writer.write_tensors_to_file()
self.writer.close()
logger.info("GGUF file written successfully")
class GGUFConverter:
"""High-level GGUF conversion orchestrator.
Coordinates the complete conversion workflow from source models to GGUF
format, managing metadata extraction, tensor mapping, and file writing.
"""
@staticmethod
def convert_safetensors(
model_path: Path,
output_path: Path,
model_config: ModelConfig,
architecture: str,
tensor_mapper: Any,
) -> bool:
"""Convert SafeTensors model to GGUF format.
Orchestrates the conversion process including metadata setup, tensor
loading with BFloat16 support, name mapping, and tokeniser integration.
Returns:
True if conversion successful, False otherwise.
"""
logger.info(f"Converting {model_path.name} to GGUF...")
# Create writer
writer_wrapper = GGUFWriter(output_path, architecture)
# Add metadata
writer_wrapper.add_metadata(model_config, model_path.name)
# Add vision metadata if present
if model_config.vision_config:
writer_wrapper.add_vision_metadata(model_config.vision_config)
# Load and add tensors
fs = FilesystemService()
tensor_files = fs.find_safetensor_files(model_path)
logger.info(f"Found {len(tensor_files)} tensor file(s)")
tensor_count = 0
for tensor_file in tensor_files:
logger.info(f"Loading {tensor_file.name}...")
with safe_open(tensor_file, framework="pt") as f:
for tensor_name in f.keys():
tensor_data = f.get_tensor(tensor_name)
# Convert BFloat16 to Float32
if hasattr(tensor_data, "numpy"):
if torch and tensor_data.dtype == torch.bfloat16:
tensor_data = tensor_data.float()
tensor_data = tensor_data.numpy()
# Map tensor name
gguf_name = tensor_mapper.map_tensor_name(tensor_name)
if gguf_name:
writer_wrapper.add_tensor(gguf_name, tensor_data)
tensor_count += 1
if tensor_count % 100 == 0:
logger.info(f" Processed {tensor_count} tensors...")
logger.info(f"Total tensors processed: {tensor_count}")
# Add tokeniser
try:
tok_config = ConfigParser.load_tokeniser_config(model_path)
writer_wrapper.add_tokeniser(tok_config)
logger.info("Tokeniser added")
except Exception as e:
logger.warning(f"Could not add tokeniser: {e}")
# Finalise file
writer_wrapper.finalise()
file_size = fs.get_file_size(output_path)
logger.info(f"Conversion complete! Output: {output_path} ({file_size})")
return True
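
A hedged sketch of GGUFWriter used on its own, outside the full conversion flow; the output path, tokeniser values, and dummy tensor are illustrative only:

# Sketch: write a minimal GGUF file using only the wrapper methods defined above.
# The path, tokeniser values, and zero-filled tensor are placeholders.
from pathlib import Path

import numpy as np

from helpers.services.gguf import GGUFWriter

writer = GGUFWriter(Path("./example-f32.gguf"), architecture="llama")
writer.add_tokeniser({"bos_token_id": 1, "eos_token_id": 2, "model_type": "llama"})
writer.add_tensor("token_embd.weight", np.zeros((8, 4), dtype=np.float32))
writer.finalise()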

helpers/services/huggingface.py

@@ -0,0 +1,454 @@
"""HuggingFace operations service.
Handles all interactions with HuggingFace including model downloads,
uploads, README generation, and repository management. Uses UK English
spelling conventions throughout.
"""
from __future__ import annotations
import re
import subprocess
import tempfile
from pathlib import Path
from typing import TYPE_CHECKING
from helpers.logger import logger
from helpers.models.quantisation import QuantisationType
if TYPE_CHECKING:
from helpers.models.quantisation import ModelSource, QuantisationResult
class HuggingFaceService:
"""Manages HuggingFace repository operations.
Provides methods for downloading models, uploading files, and managing
repositories. Handles authentication, error recovery, and progress tracking
for robust interaction with HuggingFace services.
"""
@staticmethod
def get_username() -> str:
"""Get authenticated HuggingFace username.
Retrieves the current user's HuggingFace username using the CLI.
Requires prior authentication via `huggingface-cli login`.
Returns:
HuggingFace username.
Raises:
RuntimeError: If not authenticated or CLI not available.
"""
try:
result = subprocess.run(
["huggingface-cli", "whoami"],
capture_output=True,
text=True,
check=True,
)
return result.stdout.strip()
except (subprocess.CalledProcessError, FileNotFoundError) as err:
msg = "Please log in to HuggingFace first: huggingface-cli login"
raise RuntimeError(msg) from err
@staticmethod
def download_model(
model_name: str, output_dir: Path, include_pattern: str | None = None
) -> None:
"""Download model from HuggingFace.
Downloads a complete model or specific files matching a pattern.
Creates the output directory if it doesn't exist. Supports filtered
downloads for efficient bandwidth usage when only certain files are needed.
"""
logger.info(f"Downloading {model_name} to {output_dir}")
cmd = [
"huggingface-cli",
"download",
model_name,
"--local-dir",
str(output_dir),
]
if include_pattern:
cmd.extend(["--include", include_pattern])
subprocess.run(cmd, check=True)
logger.info("Download complete")
@staticmethod
def upload_file(
repo_id: str,
local_path: Path,
repo_path: str | None = None,
create_repo: bool = False,
) -> None:
"""Upload a file to HuggingFace repository.
Uploads a single file to the specified repository path. Can create
the repository if it doesn't exist. Handles repository creation conflicts
gracefully by retrying without the create flag when needed.
Raises:
CalledProcessError: If upload fails.
"""
repo_path = repo_path or local_path.name
logger.info(f"Uploading {local_path.name} to {repo_id}/{repo_path}")
cmd = [
"huggingface-cli",
"upload",
repo_id,
str(local_path),
repo_path,
]
if create_repo:
cmd.append("--create")
try:
subprocess.run(cmd, check=True, capture_output=True)
logger.info(f"Uploaded {repo_path}")
except subprocess.CalledProcessError:
if create_repo:
# Repository might already exist, retry without --create
cmd = cmd[:-1] # Remove --create flag
subprocess.run(cmd, check=True)
logger.info(f"Updated {repo_path}")
else:
raise
class ReadmeGenerator:
"""Generates README files for quantised models.
Creates comprehensive README documentation including model cards,
quantisation details, and status tracking. Supports both initial
planning documentation and final result summaries.
"""
def generate(
self,
model_source: ModelSource,
results: dict[QuantisationType, QuantisationResult],
models_dir: Path,
output_repo: str | None = None,
) -> Path:
"""Generate README file for quantised model repository.
Creates a comprehensive README with frontmatter, quantisation table,
and original model information. Handles status tracking for planned,
processing, and completed quantisations.
Returns:
Path to generated README file.
"""
logger.info("Creating model card...")
model_dir = models_dir / model_source.model_name
readme_path = model_dir / "README.md"
# Get original README content
original_content = self._get_original_readme(model_source, model_dir)
# Generate new README
readme_content = self._generate_readme_content(
model_source, results, original_content, output_repo
)
readme_path.write_text(readme_content)
return readme_path
def _get_original_readme(self, model_source: ModelSource, model_dir: Path) -> dict[str, str]:
"""Extract original README and metadata.
Downloads or reads the original model's README for inclusion in the
quantised model documentation. Parses YAML frontmatter if present.
Returns:
Dictionary with readme content, licence, tags, and frontmatter.
"""
content = {"readme": "", "licence": "apache-2.0", "tags": "", "frontmatter": ""}
# Try local file first
readme_path = model_dir / "README.md"
if readme_path.exists():
content["readme"] = readme_path.read_text(encoding="utf-8")
logger.info(f"Found original README ({len(content['readme'])} characters)")
else:
# Download separately
content = self._download_readme(model_source)
# Parse frontmatter if present
if content["readme"].startswith("---\n"):
content = self._parse_frontmatter(content["readme"])
return content
def _download_readme(self, model_source: ModelSource) -> dict[str, str]:
"""Download README from HuggingFace repository.
Attempts to download just the README.md file from the source repository
for efficient documentation extraction.
Returns:
Dictionary with readme content and default metadata.
"""
content = {"readme": "", "licence": "apache-2.0", "tags": "", "frontmatter": ""}
with tempfile.TemporaryDirectory() as temp_dir:
try:
logger.info(f"Downloading README from {model_source.source_model}...")
subprocess.run(
[
"huggingface-cli",
"download",
model_source.source_model,
"--include",
"README.md",
"--local-dir",
temp_dir,
],
check=True,
capture_output=True,
)
readme_path = Path(temp_dir) / "README.md"
if readme_path.exists():
content["readme"] = readme_path.read_text(encoding="utf-8")
logger.info(f"Downloaded README ({len(content['readme'])} characters)")
except subprocess.CalledProcessError as e:
logger.warning(f"Failed to download README: {e}")
return content
def _parse_frontmatter(self, readme_text: str) -> dict[str, str]:
"""Parse YAML frontmatter from README.
Extracts metadata from YAML frontmatter including licence, tags,
and other model card fields.
Returns:
Dictionary with separated content and metadata.
"""
lines = readme_text.split("\n")
if lines[0] != "---":
return {
"readme": readme_text,
"licence": "apache-2.0",
"tags": "",
"frontmatter": "",
}
frontmatter_end = -1
for i, line in enumerate(lines[1:], 1):
if line == "---":
frontmatter_end = i
break
if frontmatter_end == -1:
return {
"readme": readme_text,
"licence": "apache-2.0",
"tags": "",
"frontmatter": "",
}
frontmatter = "\n".join(lines[1:frontmatter_end])
content = "\n".join(lines[frontmatter_end + 1 :])
# Extract licence
licence_match = re.search(r"^license:\s*(.+)$", frontmatter, re.MULTILINE)
licence_val = licence_match.group(1).strip().strip('"') if licence_match else "apache-2.0"
# Extract tags
tags = []
in_tags = False
for line in frontmatter.split("\n"):
if line.startswith("tags:"):
in_tags = True
continue
if in_tags:
if line.startswith("- "):
tags.append(line[2:].strip())
elif line and not line.startswith(" "):
break
return {
"readme": content,
"licence": licence_val,
"tags": ",".join(tags),
"frontmatter": frontmatter,
}
def _generate_readme_content(
self,
model_source: ModelSource,
results: dict[QuantisationType, QuantisationResult],
original_content: dict[str, str],
output_repo: str | None = None,
) -> str:
"""Generate complete README content with quantisation details.
Creates the full README including YAML frontmatter, quantisation status
table, and original model information.
Returns:
Complete README markdown content.
"""
# Build tags
our_tags = [
"quantised",
"gguf",
"q4_k_m",
"q4_k_l",
"q4_k_xl",
"q4_k_xxl",
"bartowski-method",
]
original_tags = original_content["tags"].split(",") if original_content["tags"] else []
all_tags = sorted(set(our_tags + original_tags))
# Build frontmatter
frontmatter = f"""---
license: {original_content["licence"]}
library_name: gguf
base_model: {model_source.source_model}
tags:
"""
for tag in all_tags:
if tag.strip():
frontmatter += f"- {tag.strip()}\n"
frontmatter += "---\n\n"
# Build main content
hf_url = f"https://huggingface.co/{model_source.source_model}"
content = f"""# {model_source.original_author}-{model_source.model_name}-GGUF
GGUF quantisations of [{model_source.source_model}]({hf_url}) using Bartowski's method.
| Quantisation | Embeddings/Output | Attention | Feed-Forward | Status |
|--------------|-------------------|-----------|--------------|--------|
"""
# Add results table
for quant_type in [
QuantisationType.Q4_K_M,
QuantisationType.Q4_K_L,
QuantisationType.Q4_K_XL,
QuantisationType.Q4_K_XXL,
]:
result = results.get(quant_type)
if not result:
result = type("Result", (), {"status": "planned", "success": False})()
layers = self._get_layers_config(quant_type)
status = self._format_status(result, model_source, quant_type, output_repo)
content += (
f"| {quant_type.value} | {layers['embeddings']} | "
f"{layers['attention']} | {layers['ffn']} | {status} |\n"
)
content += "\n---\n\n"
# Add original content
if original_content["readme"]:
content += "# Original Model Information\n\n" + original_content["readme"]
else:
content += f"## Original Model\n\nQuantisation of [{model_source.source_model}](https://huggingface.co/{model_source.source_model}).\n"
return frontmatter + content
def _get_layers_config(self, quant_type: QuantisationType) -> dict[str, str]:
"""Get layer configuration for quantisation type.
Returns layer precision specifications for the quantisation table.
Returns:
Dictionary with embeddings, attention, and ffn precision labels.
"""
configs = {
QuantisationType.Q4_K_M: {
"embeddings": "Q4_K_M",
"attention": "Q4_K_M",
"ffn": "Q4_K_M",
},
QuantisationType.Q4_K_L: {"embeddings": "Q6_K", "attention": "Q6_K", "ffn": "Q4_K_M"},
QuantisationType.Q4_K_XL: {"embeddings": "Q8_0", "attention": "Q6_K", "ffn": "Q4_K_M"},
QuantisationType.Q4_K_XXL: {"embeddings": "Q8_0", "attention": "Q8_0", "ffn": "Q4_K_M"},
}
return configs.get(
quant_type, {"embeddings": "Unknown", "attention": "Unknown", "ffn": "Unknown"}
)
def _format_status(
self,
result: QuantisationResult,
model_source: ModelSource,
quant_type: QuantisationType,
output_repo: str | None,
) -> str:
"""Format status indicator for README table.
Creates appropriate status indicator based on quantisation state
including progress indicators, file sizes, and download links.
Returns:
Formatted status string for table cell.
"""
status_map = {
"planned": "⏳ Planned",
"processing": "🔄 Processing...",
"uploading": "⬆️ Uploading...",
"failed": "❌ Failed",
}
if hasattr(result, "status") and result.status in status_map:
base_status = status_map[result.status]
if result.status == "uploading" and hasattr(result, "file_size") and result.file_size:
return f"{base_status} ({result.file_size})"
if result.status == "completed" or (hasattr(result, "success") and result.success):
return self._format_success_status(result, model_source, quant_type, output_repo)
return base_status
# Legacy support
if hasattr(result, "success") and result.success:
return self._format_success_status(result, model_source, quant_type, output_repo)
return "❌ Failed"
def _format_success_status(
self,
result: QuantisationResult,
model_source: ModelSource,
quant_type: QuantisationType,
output_repo: str | None,
) -> str:
"""Format successful quantisation status with download link.
Creates a download link if repository information is available,
otherwise shows file size.
Returns:
Formatted success status string.
"""
if not output_repo:
return (
f"{result.file_size}"
if hasattr(result, "file_size") and result.file_size
else "✅ Available"
)
filename = (
f"{model_source.original_author}-{model_source.model_name}-{quant_type.value}.gguf"
)
url = f"https://huggingface.co/{output_repo}?show_file_info={filename}"
if hasattr(result, "file_size") and result.file_size:
return f"[✅ {result.file_size}]({url})"
return f"[✅ Available]({url})"

helpers/services/llama_cpp.py

@@ -0,0 +1,417 @@
"""llama.cpp environment and operations service.
Manages llama.cpp binary discovery, environment setup, and imatrix generation.
Provides consistent interface for interacting with llama.cpp tools across
different installation methods.
"""
from __future__ import annotations
import subprocess
from pathlib import Path
from helpers.logger import logger
from helpers.models.quantisation import LlamaCppEnvironment
from helpers.services.filesystem import FilesystemService
class EnvironmentManager:
"""Manages llama.cpp environment setup and binary discovery.
Handles detection of local binaries, repository setup, and conversion
script location. Provides fallback strategies for different installation
scenarios including local builds and repository-based setups.
"""
def __init__(self, work_dir: Path) -> None:
"""Initialise EnvironmentManager."""
self.work_dir = work_dir
self.llama_cpp_dir = work_dir / "llama.cpp"
self.fs = FilesystemService()
def setup(self) -> LlamaCppEnvironment:
"""Set up llama.cpp environment with automatic detection.
Checks for local llama.cpp binaries first, then falls back to
repository-based setup if needed. Handles conversion script location,
dependency installation, and path resolution.
Returns:
Configured LlamaCppEnvironment instance.
"""
# Check for local binaries first
local_env = self._check_local_binaries()
if local_env:
return local_env
# Setup repository if needed
return self.setup_repository()
def _check_local_binaries(self) -> LlamaCppEnvironment | None:
"""Check for existing llama.cpp binaries in current directory.
Searches for quantise and CLI binaries in the current directory
and standard installation paths. Also locates conversion scripts.
Returns:
LlamaCppEnvironment if binaries found, None otherwise.
"""
quantise_bin = Path("./llama-quantize")
cli_bin = Path("./llama-cli")
if not (quantise_bin.exists() and cli_bin.exists()):
return None
logger.info("Found llama.cpp binaries in current directory")
# Check for conversion script
convert_script = self._find_convert_script()
if convert_script:
logger.info(f"Found conversion script: {convert_script}")
return LlamaCppEnvironment(
quantise_binary=quantise_bin.resolve(),
cli_binary=cli_bin.resolve(),
convert_script=convert_script,
use_repo=False,
)
logger.warning("No conversion script found in current directory")
logger.info("Will use llama.cpp repository method for conversion")
return LlamaCppEnvironment(
quantise_binary=quantise_bin.resolve(),
cli_binary=cli_bin.resolve(),
convert_script=f"python3 {self.llama_cpp_dir}/convert_hf_to_gguf.py",
use_repo=True,
)
def _find_convert_script(self) -> str | None:
"""Find conversion script in current directory.
Searches for various naming conventions of the HF to GGUF
conversion script.
Returns:
Command to run conversion script, or None if not found.
"""
scripts = [
"./llama-convert-hf-to-gguf",
"python3 ./convert_hf_to_gguf.py",
"python3 ./convert-hf-to-gguf.py",
]
for script in scripts:
if script.startswith("python3"):
script_path = script.split(" ", 1)[1]
if Path(script_path).exists():
return script
elif Path(script).exists():
return script
return None
def setup_repository(self) -> LlamaCppEnvironment:
"""Setup llama.cpp repository for conversion scripts.
Clones the llama.cpp repository if not present and installs
Python dependencies for model conversion.
Returns:
LlamaCppEnvironment configured with repository paths.
"""
if not self.llama_cpp_dir.exists():
logger.info("Cloning llama.cpp for conversion script...")
subprocess.run(
[
"git",
"clone",
"https://github.com/ggerganov/llama.cpp.git",
str(self.llama_cpp_dir),
],
check=True,
)
# Install Python requirements
logger.info("Installing Python requirements...")
subprocess.run(
[
"pip3",
"install",
"-r",
"requirements.txt",
"--break-system-packages",
"--root-user-action=ignore",
],
cwd=self.llama_cpp_dir,
check=True,
)
# Install additional conversion dependencies
logger.info("Installing additional conversion dependencies...")
subprocess.run(
[
"pip3",
"install",
"transformers",
"sentencepiece",
"protobuf",
"--break-system-packages",
"--root-user-action=ignore",
],
check=True,
)
else:
logger.info("llama.cpp repository already exists")
# Use local binaries but repo conversion script
return LlamaCppEnvironment(
quantise_binary=Path("./llama-quantize").resolve(),
cli_binary=Path("./llama-cli").resolve(),
convert_script=f"python3 {self.llama_cpp_dir}/convert_hf_to_gguf.py",
use_repo=False,
)
class IMatrixGenerator:
"""Handles importance matrix generation for quantisation guidance.
Generates or locates importance matrices that guide quantisation
decisions, helping preserve model quality by identifying critical
tensors requiring higher precision.
"""
def __init__(self) -> None:
"""Initialise IMatrixGenerator."""
self.fs = FilesystemService()
def generate_imatrix(
self, f16_model_path: Path, llama_env: LlamaCppEnvironment, model_dir: Path
) -> Path | None:
"""Generate importance matrix for quantisation guidance.
Searches for existing imatrix files first, provides interactive
prompts for user-supplied matrices, then generates new matrices
using calibration data if necessary.
Returns:
Path to imatrix file, or None if generation fails.
"""
imatrix_path = model_dir / "imatrix.dat"
# Check for existing imatrix
if imatrix_path.exists():
logger.info(f"Found existing imatrix: {imatrix_path.name}")
return imatrix_path
# Try user-provided imatrix
user_imatrix = self._prompt_for_user_imatrix(model_dir, imatrix_path)
if user_imatrix:
return user_imatrix
# Generate new imatrix
calibration_file = self._get_calibration_file()
if not calibration_file:
return None
return self._generate_new_imatrix(f16_model_path, llama_env, imatrix_path, calibration_file)
def _prompt_for_user_imatrix(self, model_dir: Path, imatrix_path: Path) -> Path | None:
"""Prompt user for existing imatrix file.
Returns:
Path to user-provided imatrix, or None if not available.
"""
logger.info(f"Model directory: {model_dir}")
logger.info(f"Looking for imatrix file at: {imatrix_path}")
logger.info(
"Tip: You can download pre-computed imatrix files from Bartowski's repositories!"
)
logger.info(
" Example: https://huggingface.co/bartowski/MODEL-NAME-GGUF/resolve/main/MODEL-NAME.imatrix"
)
response = (
input("\n❓ Do you have an imatrix file to place in the model directory? (y/N): ")
.strip()
.lower()
)
if response != "y":
return None
logger.info(f"Please place your imatrix.dat file in: {model_dir}")
input("⏳ Press Enter when you've placed the imatrix.dat file (or Ctrl+C to cancel)...")
if imatrix_path.exists():
file_size = self.fs.get_file_size(imatrix_path)
logger.info(f"Found imatrix file! ({file_size})")
return imatrix_path
logger.warning("No imatrix.dat file found - continuing with automatic generation")
return None
def _get_calibration_file(self) -> Path | None:
"""Get calibration data file for imatrix generation.
Returns:
Path to calibration file, or None if not found.
"""
calibration_file = Path(__file__).parent.parent.parent / "resources" / "imatrix_data.txt"
if not calibration_file.exists():
logger.warning("resources/imatrix_data.txt not found - skipping imatrix generation")
logger.info(
"Download from: https://gist.githubusercontent.com/bartowski1182/"
"eb213dccb3571f863da82e99418f81e8/raw/calibration_datav3.txt"
)
return None
return calibration_file
def _generate_new_imatrix(
self,
f16_model_path: Path,
llama_env: LlamaCppEnvironment,
imatrix_path: Path,
calibration_file: Path,
) -> Path | None:
"""Generate new importance matrix using calibration data.
Returns:
Path to generated imatrix, or None if generation fails.
"""
logger.info("Generating importance matrix (this may take 1-4 hours for large models)...")
logger.info(f"Model: {f16_model_path.name}")
logger.info(f"Calibration: {calibration_file}")
logger.info(f"Output: {imatrix_path}")
# Find imatrix binary
imatrix_binary = self._find_imatrix_binary(llama_env)
if not imatrix_binary:
logger.warning("llama-imatrix binary not found - skipping imatrix generation")
logger.info("Make sure llama-imatrix is in the same directory as llama-quantize")
return None
# Build and execute command
cmd = self._build_imatrix_command(
imatrix_binary, f16_model_path, calibration_file, imatrix_path
)
return self._execute_imatrix_generation(cmd, imatrix_path)
def _build_imatrix_command(
self, binary: Path, model_path: Path, calibration_file: Path, output_path: Path
) -> list[str]:
"""Build imatrix generation command.
Returns:
Command arguments as list.
"""
return [
str(binary),
"-m",
str(model_path),
"-f",
str(calibration_file),
"-o",
str(output_path),
"--process-output",
"--output-frequency",
"10",
"--save-frequency",
"50",
"-t",
"8",
"-c",
"2048",
"-b",
"512",
]
def _execute_imatrix_generation(self, cmd: list[str], imatrix_path: Path) -> Path | None:
"""Execute imatrix generation command with real-time output.
Returns:
Path to generated imatrix file, or None if generation fails.
"""
logger.info(f"Running: {' '.join(cmd)}")
logger.info("Starting imatrix generation... (progress will be shown)")
try:
process = subprocess.Popen(
cmd,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
universal_newlines=True,
bufsize=1,
)
self._stream_imatrix_output(process)
return_code = process.poll()
if return_code == 0:
return self._validate_imatrix_output(imatrix_path)
except KeyboardInterrupt:
logger.info("imatrix generation cancelled by user")
process.terminate()
return None
except Exception as e:
logger.error(f"imatrix generation failed with exception: {e}")
return None
else:
logger.error(f"imatrix generation failed with return code {return_code}")
return None
def _stream_imatrix_output(self, process: subprocess.Popen) -> None:
"""Stream imatrix generation output in real-time."""
while True:
if process.stdout is not None:
output = process.stdout.readline()
else:
break
if not output and process.poll() is not None:
break
if output:
line = output.strip()
if self._should_log_imatrix_line(line):
logger.info(line)
def _should_log_imatrix_line(self, line: str) -> bool:
"""Determine if imatrix output line should be logged.
Returns:
True if line should be logged, False otherwise.
"""
keywords = ["Computing imatrix", "perplexity:", "save_imatrix", "entries =", "ETA"]
return any(keyword in line for keyword in keywords) or line.startswith("[")
def _validate_imatrix_output(self, imatrix_path: Path) -> Path | None:
"""Validate generated imatrix file.
Returns:
Path to imatrix if valid, None otherwise.
"""
if imatrix_path.exists():
file_size = self.fs.get_file_size(imatrix_path)
logger.info(f"imatrix generation successful! ({file_size})")
return imatrix_path
logger.error("imatrix generation completed but file not found")
return None
def _find_imatrix_binary(self, llama_env: LlamaCppEnvironment) -> Path | None:
"""Find llama-imatrix binary in common locations.
Searches for the imatrix binary in the current directory and
standard installation paths.
Returns:
Path to imatrix binary, or None if not found.
"""
candidates = [
Path("./llama-imatrix"),
llama_env.quantise_binary.parent / "llama-imatrix",
Path("/usr/local/bin/llama-imatrix"),
Path("/usr/bin/llama-imatrix"),
]
for candidate in candidates:
if candidate.exists() and candidate.is_file():
return candidate
return None
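
A sketch of the environment and imatrix flow; the F16 model path is a placeholder, and generation is skipped unless calibration data and a llama-imatrix binary are available:

# Sketch: resolve llama.cpp tooling, then attempt imatrix generation for one model.
# The model name and F16 path are placeholders.
from pathlib import Path

from helpers.services.llama_cpp import EnvironmentManager, IMatrixGenerator

work_dir = Path.cwd() / "quantisation_work"
llama_env = EnvironmentManager(work_dir).setup()

model_dir = work_dir / "models" / "example-model"
f16_model = model_dir / "example-model-f16.gguf"

imatrix = IMatrixGenerator().generate_imatrix(f16_model, llama_env, model_dir)
print(imatrix if imatrix else "imatrix generation skipped or failed")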

@@ -0,0 +1,397 @@
"""Quantisation orchestration service.
High-level orchestration of the complete quantisation workflow from model
acquisition through processing to upload. Manages parallel processing,
status tracking, and cleanup operations for efficient resource utilisation.
"""
from __future__ import annotations
from concurrent.futures import Future, ThreadPoolExecutor
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any
from helpers.config.quantisation_configs import QUANTISATION_CONFIGS, SUPPORTED_QUANTISATION_TYPES
from helpers.logger import logger
from helpers.models.quantisation import (
ModelSource,
QuantisationContext,
QuantisationResult,
QuantisationType,
)
from helpers.services.huggingface import ReadmeGenerator
from helpers.services.llama_cpp import EnvironmentManager, IMatrixGenerator
from helpers.services.quantisation import HuggingFaceUploader, ModelManager, QuantisationEngine
from helpers.utils.tensor_mapping import URLParser
@dataclass(slots=True)
class QuantisationOrchestrator:
"""Orchestrates the complete quantisation workflow.
Uses dataclass with slots for efficient memory usage and dependency injection
for modular service interaction following SOLID principles.
"""
work_dir: Path = field(default_factory=lambda: Path.cwd() / "quantisation_work")
use_imatrix: bool = True
imatrix_base: str = "Q4_K_M"
no_upload: bool = False
# Service dependencies with factory defaults
url_parser: URLParser = field(default_factory=URLParser)
quantisation_engine: QuantisationEngine = field(default_factory=QuantisationEngine)
imatrix_generator: IMatrixGenerator = field(default_factory=IMatrixGenerator)
readme_generator: ReadmeGenerator = field(default_factory=ReadmeGenerator)
uploader: HuggingFaceUploader = field(default_factory=HuggingFaceUploader)
# Computed properties
models_dir: Path = field(init=False)
environment_manager: EnvironmentManager = field(init=False)
model_manager: ModelManager = field(init=False)
def __post_init__(self) -> None:
"""Initialise computed properties after dataclass construction."""
self.models_dir = self.work_dir / "models"
self.environment_manager = EnvironmentManager(self.work_dir)
self.model_manager = ModelManager(self.models_dir, self.environment_manager)
def quantise(self, url: str) -> dict[QuantisationType, QuantisationResult]:
"""Main quantisation workflow orchestrating model processing from URL to upload.
Returns:
dict[QuantisationType, QuantisationResult]: Quantisation results for each type.
"""
logger.info("Starting Bartowski quantisation process...")
# Setup and preparation
model_source, llama_env, f16_model_path, imatrix_path, output_repo = (
self._setup_environment(url)
)
# Create initial repository
self._create_initial_repository(model_source, output_repo)
# Execute all quantisations
results = self._execute_quantisations(
model_source, llama_env, f16_model_path, imatrix_path, output_repo
)
# Cleanup
self._cleanup_files(f16_model_path, model_source)
self._print_completion_summary(model_source, results, output_repo)
return results
def _setup_environment(self, url: str) -> tuple[ModelSource, Any, Path, Path | None, str]:
"""Setup environment and prepare model for quantisation.
Returns:
Tuple of (model_source, llama_env, f16_model_path, imatrix_path, output_repo).
"""
model_source = self.url_parser.parse(url)
self._print_model_info(model_source)
self.models_dir.mkdir(parents=True, exist_ok=True)
llama_env = self.environment_manager.setup()
f16_model_path = self.model_manager.prepare_model(model_source, llama_env)
imatrix_path = None
if self.use_imatrix:
logger.info("Generating importance matrix (imatrix)...")
imatrix_path = self.imatrix_generator.generate_imatrix(
f16_model_path, llama_env, self.models_dir / model_source.model_name
)
output_repo = (
f"{self.uploader.get_username()}/"
f"{model_source.original_author}-{model_source.model_name}-GGUF"
)
return model_source, llama_env, f16_model_path, imatrix_path, output_repo
def _create_initial_repository(self, model_source: ModelSource, output_repo: str) -> None:
"""Create initial repository with planned quantisations."""
logger.info("Creating initial README with planned quantisations...")
planned_results = {
qt: QuantisationResult(quantisation_type=qt, success=False, status="planned")
for qt in SUPPORTED_QUANTISATION_TYPES
}
readme_path = self.readme_generator.generate(
model_source, planned_results, self.models_dir, output_repo
)
if not self.no_upload:
logger.info("Creating repository with planned quantisations...")
self.uploader.upload_readme(output_repo, readme_path)
else:
logger.info("Skipping repository creation (--no-upload specified)")
def _execute_quantisations(
self,
model_source: ModelSource,
llama_env: Any,
f16_model_path: Path,
imatrix_path: Path | None,
output_repo: str,
) -> dict[QuantisationType, QuantisationResult]:
"""Execute all quantisation types with parallel uploads.
Returns:
dict[QuantisationType, QuantisationResult]: Quantisation results for each type.
"""
results: dict[QuantisationType, QuantisationResult] = {}
upload_futures: list[Future[None]] = []
with ThreadPoolExecutor(max_workers=1, thread_name_prefix="uploader") as upload_executor:
for quant_type in SUPPORTED_QUANTISATION_TYPES:
result = self._process_single_quantisation(
quant_type,
model_source,
llama_env,
f16_model_path,
imatrix_path,
output_repo,
results,
upload_executor,
upload_futures,
)
results[quant_type] = result
self._wait_for_uploads(upload_futures)
return results
def _process_single_quantisation(
self,
quant_type: QuantisationType,
model_source: ModelSource,
llama_env: Any,
f16_model_path: Path,
imatrix_path: Path | None,
output_repo: str,
results: dict[QuantisationType, QuantisationResult],
upload_executor: ThreadPoolExecutor,
upload_futures: list,
) -> QuantisationResult:
"""Process a single quantisation type.
Returns:
QuantisationResult: Result of the quantisation attempt.
"""
try:
logger.info(f"Starting {quant_type.value} quantisation...")
config = QUANTISATION_CONFIGS[quant_type]
# Update status to processing
result = QuantisationResult(quantisation_type=quant_type, success=False)
result.status = "processing"
results[quant_type] = result
self._update_readme_status(model_source, results, output_repo)
# Perform quantisation
context = QuantisationContext(
f16_model_path=f16_model_path,
model_source=model_source,
config=config,
llama_env=llama_env,
models_dir=self.models_dir,
imatrix_path=imatrix_path,
base_quant=self.imatrix_base,
)
result = self.quantisation_engine.quantise(context)
self._handle_quantisation_result(
result,
quant_type,
model_source,
results,
output_repo,
upload_executor,
upload_futures,
)
except Exception as e:
return self._handle_quantisation_error(
e, quant_type, model_source, results, output_repo
)
else:
return result
def _handle_quantisation_result(
self,
result: QuantisationResult,
quant_type: QuantisationType,
model_source: ModelSource,
results: dict[QuantisationType, QuantisationResult],
output_repo: str,
upload_executor: ThreadPoolExecutor,
upload_futures: list,
) -> None:
"""Handle successful or failed quantisation result."""
if result.success and result.file_path:
quant_str = getattr(result.quantisation_type, "value", result.quantisation_type)
logger.info(f"Starting parallel upload of {quant_str}...")
upload_future = upload_executor.submit(
self._upload_and_cleanup,
output_repo,
result.file_path,
quant_type,
model_source,
results,
)
upload_futures.append(upload_future)
result.file_path = None # Mark as being uploaded
result.status = "uploading"
else:
result.status = "failed"
self._update_readme_status(model_source, results, output_repo)
def _handle_quantisation_error(
self,
error: Exception,
quant_type: QuantisationType,
model_source: ModelSource,
results: dict[QuantisationType, QuantisationResult],
output_repo: str,
) -> QuantisationResult:
"""Handle quantisation processing error.
Returns:
QuantisationResult: Failed quantisation result with error information.
"""
logger.error(f"Error processing {quant_type.value}: {error}")
result = QuantisationResult(quantisation_type=quant_type, success=False)
result.status = "failed"
result.error_message = str(error)
try:
self._update_readme_status(model_source, results, output_repo)
except Exception as readme_error:
logger.error(f"Failed to update README after error: {readme_error}")
return result
def _update_readme_status(
self,
model_source: ModelSource,
results: dict[QuantisationType, QuantisationResult],
output_repo: str,
) -> None:
"""Update README with current quantisation status."""
if not self.no_upload:
updated_readme_path = self.readme_generator.generate(
model_source, results, self.models_dir, output_repo
)
self.uploader.upload_readme(output_repo, updated_readme_path)
def _wait_for_uploads(self, upload_futures: list) -> None:
"""Wait for all parallel uploads to complete."""
logger.info("Waiting for any remaining uploads to complete...")
for future in upload_futures:
try:
future.result(timeout=300) # 5 minute timeout per upload
except Exception as e:
logger.warning(f"Upload error: {e}")
def _cleanup_files(self, f16_model_path: Path, model_source: ModelSource) -> None:
"""Clean up temporary files after processing."""
if f16_model_path.exists():
logger.info(f"Removing F16 model {f16_model_path.name} to save disk space...")
f16_model_path.unlink()
if not model_source.is_gguf_repo:
self._cleanup_original_model(model_source)
def _cleanup_original_model(self, model_source: ModelSource) -> None:
"""Clean up original safetensors/PyTorch files after successful conversion."""
model_dir = self.models_dir / model_source.model_name
pytorch_files = list(model_dir.glob("pytorch_model*.bin"))
if pytorch_files:
logger.info(f"Removing {len(pytorch_files)} PyTorch model files to save disk space...")
for file in pytorch_files:
file.unlink()
logger.info("Keeping config files, tokeniser, and metadata for reference")
def _upload_and_cleanup(
self,
output_repo: str,
file_path: Path,
quant_type: QuantisationType,
model_source: ModelSource,
results: dict[QuantisationType, QuantisationResult],
) -> None:
"""Upload file and clean up (runs in background thread)."""
try:
logger.info(f"[PARALLEL] Uploading {quant_type}...")
self.uploader.upload_model_file(output_repo, file_path)
logger.info(f"[PARALLEL] Removing {file_path.name} to save disk space...")
file_path.unlink()
results[quant_type].status = "completed"
updated_readme_path = self.readme_generator.generate(
model_source, results, self.models_dir, output_repo
)
self.uploader.upload_readme(output_repo, updated_readme_path)
logger.info(f"[PARALLEL] {quant_type} upload and cleanup complete")
except Exception as e:
logger.error(f"[PARALLEL] Failed to upload {quant_type}: {e}")
results[quant_type].status = "failed"
results[quant_type].error_message = str(e)
updated_readme_path = self.readme_generator.generate(
model_source, results, self.models_dir, output_repo
)
self.uploader.upload_readme(output_repo, updated_readme_path)
raise
def _print_model_info(self, model_source: ModelSource) -> None:
"""Print model information."""
logger.info(f"Source URL: {model_source.url}")
logger.info(f"Source model: {model_source.source_model}")
logger.info(f"Original author: {model_source.original_author}")
logger.info(f"Model name: {model_source.model_name}")
logger.info(f"Your HF username: {self.uploader.get_username()}")
logger.info(f"Working directory: {self.work_dir}")
def _print_completion_summary(
self,
model_source: ModelSource,
results: dict[QuantisationType, QuantisationResult],
output_repo: str,
) -> None:
"""Print completion summary."""
successful_results = [r for r in results.values() if r.success]
if successful_results:
logger.info("Complete! Your quantised models are available at:")
logger.info(f" https://huggingface.co/{output_repo}")
logger.info("Model info:")
logger.info(f" - Source URL: {model_source.url}")
logger.info(f" - Original: {model_source.source_model}")
logger.info(
" - Method: "
f"{'Direct GGUF download' if model_source.is_gguf_repo else 'HF model conversion'}"
)
logger.info(f" - Quantised: {output_repo}")
for result in successful_results:
if result.file_size:
filename = (
f"{model_source.original_author}-{model_source.model_name}-"
f"{result.quantisation_type}.gguf"
)
logger.info(f" - {result.quantisation_type}: {filename} ({result.file_size})")
else:
logger.error(
"All quantisations failed - repository created with documentation "
"but no model files"
)
logger.error(f" Repository: https://huggingface.co/{output_repo}")

helpers/services/quantisation.py

@@ -0,0 +1,486 @@
"""Quantisation operations service.
Provides modular quantisation engine, model management, and upload capabilities
for GGUF model processing. Consolidates quantisation logic from various tools
into reusable components following SOLID principles.
"""
from __future__ import annotations
import shutil
import subprocess
from typing import TYPE_CHECKING
from helpers.logger import logger
from helpers.models.quantisation import (
ModelSource,
QuantisationContext,
QuantisationResult,
QuantisationType,
)
from helpers.services.filesystem import FilesystemService
if TYPE_CHECKING:
from pathlib import Path
from helpers.models.quantisation import LlamaCppEnvironment
from helpers.services.llama_cpp import EnvironmentManager
class QuantisationEngine:
"""Handles the actual quantisation process with configurable methods.
Provides flexible quantisation execution supporting multiple tensor
precision configurations, importance matrices, and fallback strategies.
Encapsulates llama-quantize binary interactions with real-time output.
"""
def __init__(self) -> None:
"""Initialise quantisation engine."""
self.fs = FilesystemService()
def quantise(self, context: QuantisationContext) -> QuantisationResult:
"""Perform quantisation using the specified configuration.
Executes quantisation with primary and fallback methods, handling
tensor-specific precision overrides and importance matrix guidance.
Returns:
QuantisationResult with success status and file information.
"""
logger.info(
f"⚙️ Creating {context.config.name} quantisation ({context.config.description})..."
)
output_path = context.get_output_path()
logger.info(f"🎯 Attempting {context.config.name} quantisation...")
logger.info(f"📝 Source: {context.f16_model_path}")
logger.info(f"📝 Target: {output_path}")
# Try primary method
if self._try_quantisation_method(
context, output_path, context.config.tensor_types, "method 1"
):
return self._create_success_result(context.config.name, output_path, "method 1")
# Try fallback methods
for i, fallback_method in enumerate(context.config.fallback_methods, 2):
method_name = f"method {i}"
if self._try_quantisation_method(context, output_path, fallback_method, method_name):
return self._create_success_result(context.config.name, output_path, method_name)
logger.error("All %s quantisation methods failed", context.config.name)
return QuantisationResult(
quantisation_type=QuantisationType(context.config.name),
success=False,
error_message="All quantisation methods failed",
)
def _try_quantisation_method(
self,
context: QuantisationContext,
output_path: Path,
tensor_config: dict[str, str],
method_name: str,
) -> bool:
"""Try a specific quantisation method with real-time output.
Builds and executes llama-quantize command with appropriate parameters,
streaming output for progress monitoring.
Returns:
True if quantisation successful, False otherwise.
"""
logger.info(f"🔍 Trying {method_name}...")
cmd = self._build_quantisation_command(context, output_path, tensor_config)
return self._execute_quantisation_command(cmd, method_name)
def _build_quantisation_command(
self, context: QuantisationContext, output_path: Path, tensor_config: dict[str, str]
) -> list[str]:
"""Build quantisation command with all required parameters.
Returns:
List of command arguments.
"""
cmd = [str(context.llama_env.quantise_binary)]
# Add imatrix if available
if context.imatrix_path and context.imatrix_path.exists():
cmd.extend(["--imatrix", str(context.imatrix_path)])
logger.info(f"🧮 Using imatrix: {context.imatrix_path.name}")
# Add tensor type arguments
self._add_tensor_type_arguments(cmd, tensor_config)
cmd.extend([str(context.f16_model_path), str(output_path), context.base_quant])
return cmd
def _add_tensor_type_arguments(self, cmd: list[str], tensor_config: dict[str, str]) -> None:
"""Add tensor type arguments to command."""
if not tensor_config:
return
for tensor_name, quant_type in tensor_config.items():
if tensor_name.startswith(("token-embedding-type", "output-tensor-type")):
cmd.extend([f"--{tensor_name}", quant_type])
else:
cmd.extend(["--tensor-type", f"{tensor_name}={quant_type}"])
def _execute_quantisation_command(self, cmd: list[str], method_name: str) -> bool:
"""Execute quantisation command with real-time output.
Returns:
True if quantisation successful, False otherwise.
"""
logger.info(f"💻 Running: {' '.join(cmd)}")
logger.info("⏳ Quantisation in progress... (this may take several minutes)")
try:
process = subprocess.Popen(
cmd,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
universal_newlines=True,
bufsize=1,
)
self._stream_quantisation_output(process)
return_code = process.poll()
if return_code == 0:
logger.info(f"{method_name} quantisation successful!")
return True
except Exception as e:
logger.info(f"{method_name} failed with exception: {e}")
return False
else:
logger.info(f"{method_name} failed with return code {return_code}")
return False
def _stream_quantisation_output(self, process: subprocess.Popen) -> None:
"""Stream quantisation output in real-time."""
while True:
if process.stdout is not None:
output = process.stdout.readline()
else:
break
if not output and process.poll() is not None:
break
if output:
logger.info(f"📊 {output.strip()}")
def _create_success_result(
self, quant_type: str, output_path: Path, method_used: str
) -> QuantisationResult:
"""Create successful quantisation result with file metadata.
Returns:
QuantisationResult with file path and size information.
"""
file_size = self.fs.get_file_size(output_path)
return QuantisationResult(
quantisation_type=QuantisationType(quant_type),
success=True,
file_path=output_path,
file_size=file_size,
method_used=method_used,
)
class ModelManager:
"""Handles model downloading and preparation for quantisation.
Manages both GGUF repository downloads and HuggingFace model conversions,
providing unified interface for model acquisition and preparation.
"""
def __init__(self, models_dir: Path, environment_manager: EnvironmentManager) -> None:
"""Initialise model manager with storage and environment configuration.
Sets up model storage directory and links to environment manager for
conversion script access and llama.cpp tool discovery.
"""
self.models_dir = models_dir
self.environment_manager = environment_manager
self.fs = FilesystemService()
def prepare_model(self, model_source: ModelSource, llama_env: LlamaCppEnvironment) -> Path:
"""Prepare model for quantisation and return F16 model path.
Handles both GGUF repository downloads and regular HuggingFace model
conversion workflows with automatic format detection.
Returns:
Path to F16 GGUF model ready for quantisation.
"""
model_dir = self.models_dir / model_source.model_name
if model_source.is_gguf_repo:
return self._handle_gguf_repo(model_source, model_dir)
return self._handle_regular_repo(model_source, model_dir, llama_env)
def _handle_gguf_repo(self, model_source: ModelSource, model_dir: Path) -> Path:
"""Handle GGUF repository download with pattern matching.
Downloads GGUF files matching specified patterns, prioritising
multi-part files and F16 variants.
Returns:
Path to downloaded or existing GGUF file.
"""
logger.info(f"⬇️ Downloading GGUF file from repository: {model_source.source_model}")
logger.info(f"🔍 Looking for file pattern: *{model_source.gguf_file_pattern}*")
f16_model = model_dir / f"{model_source.model_name}-f16.gguf"
if f16_model.exists():
logger.info(f"✅ Found existing F16 file: {f16_model.name}")
return f16_model
# Check for existing GGUF files
model_dir.mkdir(parents=True, exist_ok=True)
existing_gguf = self.fs.find_gguf_files(model_dir)
if existing_gguf:
logger.info(f"✅ Found existing GGUF file: {existing_gguf[0].name}")
return existing_gguf[0]
# Download with patterns
downloaded_file = self._download_gguf_with_patterns(
model_source.source_model, model_source.gguf_file_pattern, model_dir
)
if downloaded_file:
# Handle multi-part files
if "00001-of-" in downloaded_file.name:
return downloaded_file
if "-00002-of-" in downloaded_file.name or "-00003-of-" in downloaded_file.name:
base_name = downloaded_file.name.replace("-00002-of-", "-00001-of-").replace(
"-00003-of-", "-00001-of-"
)
first_part = downloaded_file.parent / base_name
if first_part.exists():
logger.info(f"🔄 Using first part: {first_part.name}")
return first_part
# Rename single file to standard name
downloaded_file.rename(f16_model)
return f16_model
# Fallback to regular conversion
logger.info("💡 Falling back to downloading full repository and converting...")
return self._handle_regular_repo(
ModelSource(**{**model_source.dict(), "is_gguf_repo": False}),
model_dir,
None,
)
def _download_gguf_with_patterns(
self, source_model: str, pattern: str | None, model_dir: Path
) -> Path | None:
"""Download GGUF file using various pattern strategies.
Tries multiple pattern variations to find and download appropriate
GGUF files, handling timeouts and temporary directories.
Returns:
Path to downloaded file, or None if all patterns fail.
"""
if pattern:
patterns = [
f"*{pattern}*",
f"*{pattern.lower()}*",
f"*{pattern.upper()}*",
"*f16*",
"*F16*",
"*fp16*",
]
else:
patterns = ["*f16*", "*F16*", "*fp16*"]
temp_dir = model_dir / "gguf_temp"
for search_pattern in patterns:
logger.info(f"🔍 Trying pattern: {search_pattern}")
temp_dir.mkdir(exist_ok=True)
try:
subprocess.run(
[
"timeout",
"300",
"huggingface-cli",
"download",
source_model,
"--include",
search_pattern,
"--local-dir",
str(temp_dir),
],
check=True,
capture_output=True,
)
# Find downloaded GGUF files
gguf_files = self.fs.find_gguf_files(temp_dir, pattern)
if gguf_files:
found_file = gguf_files[0]
logger.info(f"✅ Found GGUF file: {found_file.name}")
# Move to parent directory
final_path = model_dir / found_file.name
shutil.move(str(found_file), str(final_path))
shutil.rmtree(temp_dir)
return final_path
except subprocess.CalledProcessError:
logger.info(f"⚠️ Pattern {search_pattern} failed or timed out")
continue
finally:
if temp_dir.exists():
shutil.rmtree(temp_dir, ignore_errors=True)
return None
def _handle_regular_repo(
self,
model_source: ModelSource,
model_dir: Path,
llama_env: LlamaCppEnvironment | None,
) -> Path:
"""Handle regular HuggingFace repository conversion.
Downloads full model repository and converts to F16 GGUF format
using llama.cpp conversion scripts.
Returns:
Path to converted F16 GGUF model.
"""
logger.info(f"⬇️ Downloading source model: {model_source.source_model}")
if not model_dir.exists():
subprocess.run(
[
"huggingface-cli",
"download",
model_source.source_model,
"--local-dir",
str(model_dir),
],
check=True,
)
else:
logger.info("✅ Model already downloaded")
logger.info("🔄 Converting to GGUF F16 format...")
f16_model = model_dir / f"{model_source.model_name}-f16.gguf"
if not f16_model.exists():
if not llama_env:
llama_env = self.environment_manager.setup()
# Ensure conversion script is available
if llama_env.use_repo or not self.environment_manager.llama_cpp_dir.exists():
logger.info("Getting conversion script from llama.cpp repository...")
llama_env = self.environment_manager.setup_repository()
subprocess.run(
[
*llama_env.convert_script.split(),
str(model_dir),
"--outtype",
"f16",
"--outfile",
str(f16_model),
],
check=True,
)
else:
logger.info("✅ F16 model already exists")
return f16_model
class HuggingFaceUploader:
"""Handles uploading models and documentation to HuggingFace.
Provides methods for repository creation, file uploads, and README
updates with proper error handling and retry logic.
"""
@staticmethod
def get_username() -> str:
"""Get authenticated HuggingFace username.
Returns:
HuggingFace username from CLI authentication.
Raises:
RuntimeError: If not authenticated.
"""
try:
result = subprocess.run(
["huggingface-cli", "whoami"],
capture_output=True,
text=True,
check=True,
)
return result.stdout.strip()
except (subprocess.CalledProcessError, FileNotFoundError) as err:
msg = "Please log in to HuggingFace first: huggingface-cli login"
raise RuntimeError(msg) from err
def upload_readme(self, output_repo: str, readme_path: Path) -> None:
"""Upload or update README file to repository.
Creates repository if needed, handles existing repository updates.
"""
logger.info("Uploading README...")
try:
subprocess.run(
[
"huggingface-cli",
"upload",
output_repo,
str(readme_path),
"README.md",
"--create",
],
check=True,
capture_output=True,
)
logger.info("README uploaded")
except subprocess.CalledProcessError:
# Repository exists, update without --create
subprocess.run(
[
"huggingface-cli",
"upload",
output_repo,
str(readme_path),
"README.md",
],
check=True,
)
logger.info("README updated")
def upload_model_file(self, output_repo: str, model_path: Path) -> None:
"""Upload model file to repository.
Uploads GGUF model file to specified repository path.
"""
logger.info(f"Uploading {model_path.name}...")
subprocess.run(
[
"huggingface-cli",
"upload",
output_repo,
str(model_path),
model_path.name,
],
check=True,
)
logger.info(f"{model_path.name} uploaded")