Use proper binaries

Tom Foster 2025-08-09 10:55:42 +01:00
parent d937f2d5fa
commit 633efdc305
13 changed files with 1709 additions and 163 deletions

.gitignore vendored
View file

@ -58,3 +58,4 @@ venv.bak/
# Working directories
work/
quantisation_work/
.cache/

View file

@ -46,15 +46,15 @@ QUANTISATION_CONFIGS: dict[QuantisationType, QuantisationConfig] = {
description="Bartowski Q3_K_L: Q3_K_M base with Q5_K output",
base_type="Q3_K_M",
base_precision=3,
output_type="Q5_K",
output_type="q5_k",
),
QuantisationType.Q3_K_XL: QuantisationConfig(
name="Q3_K_XL",
description="Bartowski Q3_K_XL: Q3_K_M base with Q8_0 embeddings + Q6_K output",
base_type="Q3_K_M",
base_precision=3,
embedding_type="Q8_0",
output_type="Q6_K",
embedding_type="q8_0",
output_type="q6_k",
),
QuantisationType.Q4_K_S: QuantisationConfig(
name="Q4_K_S",
@ -78,7 +78,7 @@ QUANTISATION_CONFIGS: dict[QuantisationType, QuantisationConfig] = {
description="Bartowski Q4_K_L: Q4_K_M base with Q8_0 embeddings",
base_type="Q4_K_M",
base_precision=4,
embedding_type="Q8_0",
embedding_type="q8_0",
),
# Additional standard quantisation profiles
QuantisationType.Q5_K_S: QuantisationConfig(
@ -103,7 +103,7 @@ QUANTISATION_CONFIGS: dict[QuantisationType, QuantisationConfig] = {
description="Bartowski Q5_K_L: Q5_K_M base with Q8_0 embeddings",
base_type="Q5_K_M",
base_precision=5,
embedding_type="Q8_0",
embedding_type="q8_0",
),
QuantisationType.Q6_K: QuantisationConfig(
name="Q6_K",
@ -121,7 +121,7 @@ QUANTISATION_CONFIGS: dict[QuantisationType, QuantisationConfig] = {
description="Bartowski Q6_K_L: Q6_K base with Q8_0 output",
base_type="Q6_K",
base_precision=6,
output_type="Q8_0",
output_type="q8_0",
),
QuantisationType.Q8_0: QuantisationConfig(
name="Q8_0",

View file

@ -0,0 +1,491 @@
"""Binary manager for llama.cpp releases.
Downloads and manages llama.cpp binary releases from GitHub, handling
platform detection, version checking, and caching.
"""
from __future__ import annotations
import json
import os
import platform
import shutil
import subprocess
import tarfile
import time
import zipfile
from pathlib import Path
from typing import TYPE_CHECKING, ClassVar
from urllib.request import urlopen, urlretrieve
from helpers.logger import logger
if TYPE_CHECKING:
from typing import Any
class BinaryManager:
"""Manages llama.cpp binary downloads and updates.
Automatically downloads appropriate llama.cpp releases based on platform,
caches binaries locally, and checks for updates from GitHub releases.
"""
GITHUB_API = "https://api.github.com/repos/ggml-org/llama.cpp/releases/latest"
# Use local .cache directory in project
BINARY_DIR = Path(".cache") / "llm-gguf-tools" / "binaries"
# Platform mappings to release asset patterns
PLATFORM_PATTERNS: ClassVar[dict[tuple[str, str], list[str]]] = {
("Linux", "x86_64"): ["linux-x64", "ubuntu-x64", "linux-amd64"],
("Linux", "aarch64"): ["linux-arm64", "linux-aarch64"],
("Darwin", "x86_64"): ["macos-x64", "darwin-x64", "macos-amd64"],
("Darwin", "arm64"): ["macos-arm64", "darwin-arm64", "macos-aarch64"],
("Windows", "AMD64"): ["win-x64", "windows-x64", "win64"],
}
def __init__(self) -> None:
"""Initialise binary manager."""
self.BINARY_DIR.mkdir(parents=True, exist_ok=True)
self.version_file = self.BINARY_DIR / "version.json"
self.quantize_binary_path = self._get_binary_path("llama-quantize")
self.imatrix_binary_path = self._get_binary_path("llama-imatrix")
def _get_binary_path(self, base_name: str) -> Path:
"""Get path to binary.
Args:
base_name: Base name of binary (without extension).
Returns:
Path where binary should be located.
"""
binary_name = f"{base_name}.exe" if platform.system() == "Windows" else base_name
return self.BINARY_DIR / binary_name
def get_quantise_binary(self) -> Path | None:
"""Get llama-quantize binary, downloading if necessary.
Returns:
Path to binary if available, None if download fails.
"""
return self._get_binary("llama-quantize", self.quantize_binary_path)
def get_imatrix_binary(self) -> Path | None:
"""Get llama-imatrix binary, downloading if necessary.
Returns:
Path to binary if available, None if download fails.
"""
return self._get_binary("llama-imatrix", self.imatrix_binary_path)
def _get_binary(self, name: str, binary_path: Path) -> Path | None:
"""Get a specific binary, downloading if necessary.
Args:
name: Name of the binary.
binary_path: Path where binary should be located.
Returns:
Path to binary if available, None if download fails.
"""
# Check if we have a binary and if it needs updating
if self._should_update():
logger.info("🔄 Checking for llama.cpp updates...")
if not self._download_latest():
logger.warning("Failed to download latest llama.cpp release")
# Fall back to existing binary if available
if binary_path.exists():
logger.info(f"Using existing {name} binary")
return binary_path
return None
if binary_path.exists():
return binary_path
logger.info("📥 Downloading llama.cpp binaries...")
if self._download_latest():
return binary_path
return None
def _should_update(self) -> bool:
"""Check if binary needs updating.
Returns:
True if update needed, False otherwise.
"""
# If no binaries exist, we need to download
if not self.quantize_binary_path.exists() or not self.imatrix_binary_path.exists():
return True
# Check version file
if not self.version_file.exists():
return True
try:
with Path(self.version_file).open(encoding="utf-8") as f:
cached_version = json.load(f)
# Check if cached version is older than 7 days
if time.time() - cached_version.get("timestamp", 0) > 7 * 24 * 3600:
return True
except Exception:
return True
return False
def _download_latest(self) -> bool:
"""Download latest llama.cpp release.
Returns:
True if successful, False otherwise.
"""
try:
# Get latest release info
release_info = self._get_latest_release()
if not release_info:
return False
# Find appropriate asset for platform
asset_url = self._find_platform_asset(release_info["assets"])
if not asset_url:
logger.warning("No suitable binary found for this platform")
return False
# Download and extract
logger.info(f"📥 Downloading from: {asset_url}")
if not self._download_and_extract(asset_url):
return False
# Save version info
self._save_version_info(release_info)
logger.info("✅ Successfully downloaded llama.cpp binary")
except Exception as e:
logger.error(f"Failed to download llama.cpp: {e}")
return False
else:
return True
def _get_latest_release(self) -> dict[str, Any] | None:
"""Get latest release info from GitHub API.
Returns:
Release info dict or None if failed.
"""
try:
with urlopen(self.GITHUB_API) as response: # noqa: S310
return json.loads(response.read())
except Exception as e:
logger.error(f"Failed to fetch release info: {e}")
return None
def _find_platform_asset(self, assets: list[dict[str, Any]]) -> str | None:
"""Find appropriate asset for current platform.
Returns:
Download URL for appropriate asset or None.
"""
patterns = self._get_platform_patterns()
if not patterns:
return None
return self._select_best_asset(assets, patterns)
def _get_platform_patterns(self) -> list[str]:
"""Get platform patterns for current system.
Returns:
List of patterns to match in asset names.
"""
system = platform.system()
machine = platform.machine()
# Get specific patterns for this platform
patterns = self.PLATFORM_PATTERNS.get((system, machine), [])
if patterns:
return patterns
# Fall back to generic patterns
generic_patterns = {
"Linux": ["linux", "ubuntu"],
"Darwin": ["macos", "darwin"],
"Windows": ["win", "windows"],
}
return generic_patterns.get(system, [])
def _select_best_asset(self, assets: list[dict[str, Any]], patterns: list[str]) -> str | None:
"""Select the best asset from available options.
Returns:
Download URL for best matching asset or None.
"""
avoid_patterns = ["cuda", "rocm", "hip", "metal", "sycl"]
prefer_patterns = ["cpu", "vulkan", "avx2", "avx"]
best_asset = None
best_score = -1
for asset in assets:
name = asset["name"].lower()
# Skip GPU-specific builds
if any(pattern in name for pattern in avoid_patterns):
continue
# Check platform match
if not any(pattern in name for pattern in patterns):
continue
score = self._score_asset(name, patterns, prefer_patterns)
if score > best_score:
best_score = score
best_asset = asset
return best_asset["browser_download_url"] if best_asset else None
def _score_asset(self, name: str, patterns: list[str], prefer_patterns: list[str]) -> int:
"""Score an asset based on platform and preference matching.
Returns:
Numeric score for asset quality (higher is better).
"""
score = 0
# Platform match bonus
if any(pattern in name for pattern in patterns):
score += 10
# Preference bonuses
for pattern in prefer_patterns:
if pattern in name:
score += 5
# Archive format preference
system = platform.system()
if (system == "Windows" and name.endswith(".zip")) or (
system != "Windows" and name.endswith(".tar.gz")
):
score += 2
return score
def _download_and_extract(self, url: str) -> bool:
"""Download and extract binary archive.
Args:
url: Download URL for archive.
Returns:
True if successful, False otherwise.
"""
try:
# Download to temp file
temp_file = self.BINARY_DIR / "temp_download"
logger.info("⬇️ Downloading archive...")
urlretrieve(url, temp_file) # noqa: S310
# Extract based on file type
if url.endswith(".zip"):
with zipfile.ZipFile(temp_file, "r") as zf:
self._extract_binary_from_archive(zf)
elif url.endswith((".tar.gz", ".tgz")):
with tarfile.open(temp_file, "r:gz") as tf:
self._extract_binary_from_archive(tf)
else:
logger.error(f"Unknown archive format: {url}")
return False
# Clean up temp file
temp_file.unlink()
# Make binaries executable on Unix
if platform.system() != "Windows":
self.quantize_binary_path.chmod(0o755)
self.imatrix_binary_path.chmod(0o755)
except Exception as e:
logger.error(f"Failed to download and extract: {e}")
return False
else:
return True
def _extract_binary_from_archive(self, archive: Any) -> None:
"""Extract llama binaries and their dependencies from archive."""
target_binaries = {
"llama-quantize": ["llama-quantize", "llama-quantize.exe", "quantize", "quantize.exe"],
"llama-imatrix": ["llama-imatrix", "llama-imatrix.exe", "imatrix", "imatrix.exe"],
}
# Also extract shared libraries
shared_libs = [
"libllama.so",
"libggml-base.so",
"libggml.so",
"libllama.dll",
"libggml.dll",
]
members = self._get_archive_members(archive)
extracted = self._extract_matching_binaries(archive, members, target_binaries)
self._extract_shared_libraries(archive, members, shared_libs)
self._cleanup_extracted_directories()
self._report_missing_binaries(extracted)
def _get_archive_members(self, archive: Any) -> list[str]:
"""Get list of members from archive.
Returns:
List of member names in the archive.
"""
if isinstance(archive, zipfile.ZipFile):
return archive.namelist()
return [m.name for m in archive.getmembers()]
def _extract_matching_binaries(
self,
archive: Any,
members: list[str],
target_binaries: dict[str, list[str]],
) -> set[str]:
"""Extract binaries that match target patterns.
Returns:
Set of successfully extracted binary types.
"""
extracted = set()
for member in members:
base_name = Path(member).name
for binary_type, possible_names in target_binaries.items():
if base_name in possible_names:
self._extract_single_binary(archive, member, binary_type)
extracted.add(binary_type)
break
return extracted
def _extract_single_binary(self, archive: Any, member: str, binary_type: str) -> None:
"""Extract a single binary from archive."""
logger.info(f"📦 Extracting {Path(member).name} as {binary_type}...")
target_path = self._get_binary_path(binary_type)
if isinstance(archive, zipfile.ZipFile):
self._extract_from_zip(archive, member, target_path)
else: # tarfile
self._extract_from_tar(archive, member, target_path)
def _extract_from_zip(self, archive: zipfile.ZipFile, member: str, target_path: Path) -> None:
"""Extract binary from zip archive."""
temp_path = self.BINARY_DIR / "temp_binary"
with archive.open(member) as source, temp_path.open("wb") as target:
shutil.copyfileobj(source, target)
shutil.move(str(temp_path), str(target_path))
def _extract_from_tar(self, archive: tarfile.TarFile, member: str, target_path: Path) -> None:
"""Extract binary from tar archive."""
archive.extract(member, self.BINARY_DIR)
extracted_path = self.BINARY_DIR / member
if extracted_path != target_path:
shutil.move(str(extracted_path), str(target_path))
def _cleanup_extracted_directories(self) -> None:
"""Clean up any extracted directories."""
for item in self.BINARY_DIR.iterdir():
if item.is_dir() and item.name != "binaries":
shutil.rmtree(item)
def _extract_shared_libraries(
self, archive: Any, members: list[str], lib_patterns: list[str]
) -> None:
"""Extract shared libraries needed by the binaries.
Args:
archive: The archive object.
members: List of all archive members.
lib_patterns: Patterns to match for library files.
"""
for member in members:
base_name = Path(member).name
if any(lib in base_name for lib in lib_patterns):
logger.info(f"📚 Extracting library: {base_name}")
target_path = self.BINARY_DIR / base_name
if isinstance(archive, zipfile.ZipFile):
temp_path = self.BINARY_DIR / "temp_lib"
with archive.open(member) as source, temp_path.open("wb") as target:
shutil.copyfileobj(source, target)
shutil.move(str(temp_path), str(target_path))
else: # tarfile
archive.extract(member, self.BINARY_DIR)
extracted_path = self.BINARY_DIR / member
if extracted_path != target_path:
shutil.move(str(extracted_path), str(target_path))
# Make libraries executable on Unix
if platform.system() != "Windows":
target_path.chmod(0o755)
def _report_missing_binaries(self, extracted: set[str]) -> None:
"""Report any missing binaries."""
if "llama-quantize" not in extracted:
logger.warning("llama-quantize binary not found in archive")
if "llama-imatrix" not in extracted:
logger.warning("llama-imatrix binary not found in archive")
def _save_version_info(self, release_info: dict[str, Any]) -> None:
"""Save version information to cache.
Args:
release_info: GitHub release information.
"""
version_data = {
"version": release_info.get("tag_name", "unknown"),
"timestamp": time.time(),
"url": release_info.get("html_url", ""),
}
with Path(self.version_file).open("w", encoding="utf-8") as f:
json.dump(version_data, f, indent=2)
logger.info(f"📌 Cached version: {version_data['version']}")
def check_binary_works(self, binary_path: Path | None = None) -> bool:
"""Check if the binary actually works.
Args:
binary_path: Path to binary to check. If None, checks quantize binary.
Returns:
True if binary executes successfully, False otherwise.
"""
if binary_path is None:
binary_path = self.quantize_binary_path
if not binary_path.exists():
return False
try:
# Set LD_LIBRARY_PATH to include binary directory for shared libraries
env = os.environ.copy()
if platform.system() != "Windows":
lib_path = str(self.BINARY_DIR)
if "LD_LIBRARY_PATH" in env:
env["LD_LIBRARY_PATH"] = f"{lib_path}:{env['LD_LIBRARY_PATH']}"
else:
env["LD_LIBRARY_PATH"] = lib_path
result = subprocess.run(
[str(binary_path), "--help"],
check=False,
capture_output=True,
text=True,
timeout=5,
env=env,
)
except Exception:
return False
else:
# llama-quantize returns 1 for --help but shows usage, which means it works
return result.returncode in {0, 1} and "usage:" in result.stdout.lower()
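A minimal usage sketch of this class (the import path matches the `from helpers.services.binary_manager import BinaryManager` statement used elsewhere in this commit; treat it as illustrative rather than the project's documented API):
from helpers.services.binary_manager import BinaryManager
# Download (or reuse cached) llama.cpp binaries under .cache/llm-gguf-tools/binaries
manager = BinaryManager()
quantize_path = manager.get_quantise_binary()
if quantize_path and manager.check_binary_works(quantize_path):
    print(f"llama-quantize ready at {quantize_path}")
else:
    print("llama-quantize unavailable - download it manually from the llama.cpp releases page")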

View file

@ -34,7 +34,7 @@ class FilesystemService:
size formatting across the toolset.
Returns:
Human-readable file size string (e.g., "1.5G", "750M").
Human-readable file size string (e.g. "1.5G", "750M").
"""
try:
result = subprocess.run(

View file

@ -8,6 +8,9 @@ Uses UK English spelling conventions throughout.
from __future__ import annotations
import gc
import json
import traceback
from pathlib import Path
from typing import TYPE_CHECKING, Any, Protocol
import gguf
@ -38,8 +41,6 @@ class TensorMapper(Protocol):
if TYPE_CHECKING:
from pathlib import Path
import numpy as np
from helpers.models.conversion import ModelConfig
@ -77,6 +78,11 @@ class GGUFWriter:
self.writer.add_description(f"Converted from {model_config.architectures[0]}")
self.writer.add_file_type(gguf.LlamaFileType.ALL_F32)
# Log architecture being used
logger.info(f"Setting GGUF architecture: {self.architecture}")
if self.architecture not in {"llama", "qwen2", "gemma", "phi3", "falcon", "gpt2"}:
logger.warning(f"Architecture '{self.architecture}' may not be supported by llama.cpp")
# Model parameters from config
params = model_config.to_gguf_params()
self.writer.add_context_length(params.context_length)
@ -122,10 +128,239 @@ class GGUFWriter:
self.writer.add_eos_token_id(tokeniser_config.get("eos_token_id", 2))
self.writer.add_unk_token_id(tokeniser_config.get("unk_token_id", 0))
self.writer.add_pad_token_id(tokeniser_config.get("pad_token_id", 0))
self.writer.add_tokenizer_model(tokeniser_config.get("model_type", "llama"))
# Add BOS/EOS token addition flags if available
if "add_bos_token" in tokeniser_config:
self.writer.add_add_bos_token(tokeniser_config["add_bos_token"])
if "add_eos_token" in tokeniser_config:
self.writer.add_add_eos_token(tokeniser_config["add_eos_token"])
# Note: tokenizer_model is set by add_tokeniser_vocabulary based on actual tokenizer type
logger.info("Added tokeniser configuration")
def add_tokeniser_vocabulary(self, model_path: Path) -> None:
"""Add full tokeniser vocabulary to GGUF file.
Loads and embeds the complete tokeniser vocabulary including tokens,
merges, and scores to enable standalone model usage without external
tokeniser files. Supports BPE, Unigram, and WordPiece tokenizers.
"""
tokenizer_path = model_path / "tokenizer.json"
if not tokenizer_path.exists():
logger.warning("tokenizer.json not found, skipping vocabulary embedding")
return
try:
with Path(tokenizer_path).open(encoding="utf-8") as f:
tokenizer_data = json.load(f)
model_data = tokenizer_data.get("model", {})
model_type = model_data.get("type", "")
# Get pre-tokenizer information
pre_tokenizer = tokenizer_data.get("pre_tokenizer", {})
pre_tokenizer_type = self._get_pre_tokenizer_type(pre_tokenizer)
# Get added tokens
added_tokens = tokenizer_data.get("added_tokens", [])
if model_type == "BPE":
self._add_bpe_tokenizer(model_data, added_tokens, pre_tokenizer_type)
elif model_type == "Unigram":
self._add_unigram_tokenizer(model_data, added_tokens)
elif model_type == "WordPiece":
self._add_wordpiece_tokenizer(model_data, added_tokens)
else:
logger.warning(f"Unsupported tokenizer type: {model_type}")
# Try to add as generic tokenizer
self._add_generic_tokenizer(model_data, tokenizer_data)
except Exception as e:
logger.error(f"Failed to load tokeniser vocabulary: {e}")
logger.error(traceback.format_exc())
def _get_pre_tokenizer_type(self, pre_tokenizer: dict[str, Any]) -> str:
"""Determine pre-tokenizer type from configuration.
Returns:
Pre-tokenizer type.
"""
if not pre_tokenizer:
return "default"
# Check for various pre-tokenizer types
pre_type = pre_tokenizer.get("type", "")
if "ByteLevel" in str(pre_type):
return "llama3"
if "Metaspace" in str(pre_type):
return "default"
return "default"
def _add_bpe_tokenizer(
self, model_data: dict[str, Any], added_tokens: list[dict[str, Any]], pre_type: str
) -> None:
"""Add BPE tokenizer vocabulary to GGUF."""
vocab = model_data.get("vocab", {})
merges = model_data.get("merges", [])
if not vocab:
logger.warning("No vocabulary found in BPE tokenizer")
return
# Create token list sorted by index
max_idx = max(vocab.values()) if vocab else 0
tokens = [""] * (max_idx + 1)
for token, idx in vocab.items():
if 0 <= idx < len(tokens):
tokens[idx] = token
# Handle added tokens
for added_token in added_tokens:
token_id = added_token.get("id")
content = added_token.get("content")
if token_id is not None and content is not None:
if token_id >= len(tokens):
tokens.extend([""] * (token_id - len(tokens) + 1))
tokens[token_id] = content
# Prepare token types
token_types = []
for i, _token in enumerate(tokens):
# Check if it's a special/control token
is_special = any(
added_token.get("id") == i and added_token.get("special", False)
for added_token in added_tokens
)
if is_special:
token_types.append(gguf.TokenType.CONTROL)
else:
token_types.append(gguf.TokenType.NORMAL)
# Add to GGUF
self.writer.add_tokenizer_model("gpt2")
self.writer.add_tokenizer_pre(pre_type)
self.writer.add_token_list(tokens)
self.writer.add_token_scores([0.0] * len(tokens))
self.writer.add_token_types(token_types)
if merges:
self.writer.add_token_merges(merges)
logger.info(f"Added {len(merges)} BPE merges")
logger.info(f"Successfully embedded BPE tokeniser ({len(tokens)} tokens)")
def _add_unigram_tokenizer(
self,
model_data: dict[str, Any],
added_tokens: list[dict[str, Any]], # noqa: ARG002
) -> None:
"""Add Unigram/SentencePiece tokenizer to GGUF."""
vocab = model_data.get("vocab", [])
if not vocab:
logger.warning("No vocabulary found in Unigram tokenizer")
return
tokens = []
scores = []
token_types = []
# Process regular vocabulary
for item in vocab:
if isinstance(item, list) and len(item) >= 2:
token = item[0]
score = float(item[1]) if len(item) > 1 else 0.0
tokens.append(token)
scores.append(score)
# Determine token type
if token.startswith("<") and token.endswith(">"):
token_types.append(gguf.TokenType.CONTROL)
elif len(token) == 6 and token.startswith("<0x") and token.endswith(">"):
token_types.append(gguf.TokenType.BYTE)
else:
token_types.append(gguf.TokenType.NORMAL)
# Add to GGUF
self.writer.add_tokenizer_model("llama")
self.writer.add_tokenizer_pre("default")
self.writer.add_token_list(tokens)
self.writer.add_token_scores(scores)
self.writer.add_token_types(token_types)
logger.info(f"Successfully embedded Unigram tokeniser ({len(tokens)} tokens)")
def _add_wordpiece_tokenizer(
self,
model_data: dict[str, Any],
added_tokens: list[dict[str, Any]], # noqa: ARG002
) -> None:
"""Add WordPiece tokenizer to GGUF."""
vocab = model_data.get("vocab", {})
if not vocab:
logger.warning("No vocabulary found in WordPiece tokenizer")
return
# Create token list sorted by index
max_idx = max(vocab.values()) if vocab else 0
tokens = [""] * (max_idx + 1)
for token, idx in vocab.items():
if 0 <= idx < len(tokens):
tokens[idx] = token
# Token types (all normal for WordPiece)
token_types = [gguf.TokenType.NORMAL] * len(tokens)
# Add to GGUF
self.writer.add_tokenizer_model("bert")
self.writer.add_tokenizer_pre("default")
self.writer.add_token_list(tokens)
self.writer.add_token_scores([0.0] * len(tokens))
self.writer.add_token_types(token_types)
logger.info(f"Successfully embedded WordPiece tokeniser ({len(tokens)} tokens)")
def _add_generic_tokenizer(
self,
model_data: dict[str, Any],
tokenizer_data: dict[str, Any], # noqa: ARG002
) -> None:
"""Try to add a generic tokenizer based on available data."""
vocab = model_data.get("vocab")
if not vocab:
logger.warning("Cannot extract vocabulary from unknown tokenizer type")
return
# Try to extract tokens in a generic way
tokens = []
if isinstance(vocab, dict):
# Dictionary-style vocab
max_idx = max(vocab.values()) if vocab else 0
tokens = [""] * (max_idx + 1)
for token, idx in vocab.items():
if 0 <= idx < len(tokens):
tokens[idx] = token
elif isinstance(vocab, list):
# List-style vocab
for item in vocab:
if isinstance(item, str):
tokens.append(item)
elif isinstance(item, list) and len(item) > 0:
tokens.append(item[0])
if tokens:
self.writer.add_tokenizer_model("llama") # Default to llama
self.writer.add_tokenizer_pre("default")
self.writer.add_token_list(tokens)
self.writer.add_token_scores([0.0] * len(tokens))
self.writer.add_token_types([gguf.TokenType.NORMAL] * len(tokens))
logger.info(f"Added generic tokeniser ({len(tokens)} tokens)")
else:
logger.warning("Could not extract tokens from unknown tokenizer format")
def add_tensor(self, name: str, data: np.ndarray) -> None:
"""Add a tensor to the GGUF file.
@ -219,13 +454,20 @@ class GGUFConverter:
logger.info(f"Total tensors processed: {tensor_count}")
# Add tokeniser
# Add tokeniser configuration
try:
tok_config = ConfigParser.load_tokeniser_config(model_path)
writer_wrapper.add_tokeniser(tok_config)
logger.info("Tokeniser added")
logger.info("Tokeniser configuration added")
except Exception as e:
logger.warning(f"Could not add tokeniser: {e}")
logger.warning(f"Could not add tokeniser configuration: {e}")
# Add tokeniser vocabulary (critical for standalone usage)
try:
writer_wrapper.add_tokeniser_vocabulary(model_path)
except Exception as e:
logger.error(f"Failed to embed tokeniser vocabulary: {e}")
logger.error("Model will not work without external tokeniser files!")
# Finalise file
writer_wrapper.finalise()
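To make the vocabulary-embedding step concrete, here is a small standalone sketch of the index-ordered token list that _add_bpe_tokenizer rebuilds from a Hugging Face tokenizer.json (the file layout mirrors the code above; the helper name is illustrative only):
import json
from pathlib import Path
def bpe_tokens_in_index_order(tokenizer_json: Path) -> list[str]:
    """Rebuild the BPE token list ordered by vocab index, including added_tokens."""
    data = json.loads(tokenizer_json.read_text(encoding="utf-8"))
    vocab = data["model"]["vocab"]              # maps token string -> integer index
    tokens = [""] * (max(vocab.values()) + 1)   # dense, index-ordered list
    for token, idx in vocab.items():
        tokens[idx] = token
    for added in data.get("added_tokens", []):  # special tokens may extend the list
        idx = added["id"]
        if idx >= len(tokens):
            tokens.extend([""] * (idx - len(tokens) + 1))
        tokens[idx] = added["content"]
    return tokens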

View file

@ -7,6 +7,7 @@ spelling conventions throughout.
from __future__ import annotations
import json
import re
import shutil
import subprocess
@ -17,6 +18,7 @@ from typing import TYPE_CHECKING
from helpers.config.quantisation_configs import QUANTISATION_CONFIGS
from helpers.logger import logger
from helpers.models.quantisation import QuantisationType
from helpers.utils.config_parser import ConfigParser
if TYPE_CHECKING:
from helpers.models.quantisation import ModelSource, QuantisationResult
@ -260,14 +262,47 @@ class ReadmeGenerator:
# Get original README content
original_content = self._get_original_readme(model_source, model_dir)
# Get architecture from config.json
architecture = self._get_architecture(model_dir)
# Generate new README
readme_content = self._generate_readme_content(
model_source, results, original_content, output_repo
model_source, results, original_content, output_repo, architecture, models_dir
)
readme_path.write_text(readme_content)
return readme_path
def _get_architecture(self, model_dir: Path) -> str | None:
"""Get the architecture from the model's config.json.
Returns:
Architecture name or None if not found.
"""
config_path = model_dir / "config.json"
if not config_path.exists():
return None
try:
with config_path.open(encoding="utf-8") as f:
config = json.load(f)
# Get the architectures field - it's a list
architectures = config.get("architectures", [])
if architectures:
arch_name = architectures[0]
# Get the mapped architecture (what it will be converted to)
parser = ConfigParser()
mapped_arch = parser.get_architecture_mapping(arch_name)
logger.info(f"Architecture: {arch_name} -> {mapped_arch}")
return mapped_arch
except Exception as e:
logger.warning(f"Could not determine architecture: {e}")
return None
def _get_original_readme(self, model_source: ModelSource, model_dir: Path) -> dict[str, str]:
"""Extract original README and metadata.
@ -427,6 +462,8 @@ class ReadmeGenerator:
results: dict[QuantisationType, QuantisationResult],
original_content: dict[str, str],
output_repo: str | None = None,
architecture: str | None = None,
models_dir: Path | None = None,
) -> str:
"""Generate complete README content with quantisation details.
@ -436,22 +473,27 @@ class ReadmeGenerator:
Returns:
Complete README markdown content.
"""
# Build tags
our_tags = [
"quantised",
"gguf",
"q3_k_m",
"q3_k_l",
"q3_k_xl",
"q4_k_m",
"q4_k_l",
"q5_k_m",
"q5_k_l",
"q6_k",
"q6_k_l",
"q8_0",
"bartowski-method",
]
# Build tags based on actual successful quantisations
our_tags = ["gguf"]
# Add tags for successful quantisations only
for quant_type, result in results.items():
if hasattr(result, "status") and result.status == "completed":
if quant_type == "F16":
our_tags.append("f16")
elif hasattr(result, "quantisation_type"):
# Convert to lowercase tag format (e.g. Q3_K_M -> q3_k_m)
our_tags.append(result.quantisation_type.value.lower())
# If no quantisations succeeded but F16 is available, still add basic tags
if (
len(our_tags) == 1
and "F16" in results
and hasattr(results["F16"], "status")
and results["F16"].status in {"completed", "uploading"}
):
our_tags.append("f16")
original_tags = original_content["tags"].split(",") if original_content["tags"] else []
all_tags = sorted(set(our_tags + original_tags))
@ -476,8 +518,8 @@ GGUF quantisations of [{model_source.source_model}]({hf_url}) using
[Bartowski](https://huggingface.co/bartowski)'s method. Created with [llm-gguf-tools](https://git.tomfos.tr/tom/llm-gguf-tools)
which replicates Bartowski's quantisation profiles.
| Variant | Configuration | File Size | Status |
|---|---|---|---|
| Variant | Configuration | Status |
|---|---|---|
"""
# Add results table - group by layer config patterns
@ -500,24 +542,91 @@ which replicates Bartowski's quantisation profiles.
result = type("Result", (), {"status": "planned", "success": False})()
config = QUANTISATION_CONFIGS.get(quant_type)
file_size = self._format_file_size(result)
status = self._format_status(result, model_source, quant_type, output_repo)
# Get configuration description from the config itself
config_desc = config.get_compact_config(QUANTISATION_CONFIGS) if config else f"{quant_type} all layers"
config_desc = (
config.get_compact_config(QUANTISATION_CONFIGS)
if config
else f"{quant_type} all layers"
)
content += f"| **{quant_type.value}** | {config_desc} | {file_size} | {status} |\n"
content += f"| **{quant_type.value}** | {config_desc} | {status} |\n"
# Add F16 row at the bottom if we converted from SafeTensors
# Note: Named "f16" for compatibility, but contains mixed F16/F32 tensors
# (BF16 source tensors are converted to F32 to preserve precision)
if not model_source.is_gguf_repo and output_repo:
f16_filename = f"{model_source.original_author}-{model_source.model_name}-f16.gguf"
f16_url = f"https://huggingface.co/{output_repo}/blob/main/{f16_filename}"
# Get F16 result from results dict (if tracking it)
f16_result = results.get("F16")
# Get file size
f16_size = "-"
if f16_result and hasattr(f16_result, "file_size"):
f16_size = f16_result.file_size
elif models_dir:
# Try to get from actual file
f16_path = models_dir / model_source.model_name / f16_filename
if f16_path.exists():
size_bytes = f16_path.stat().st_size
size_gb = size_bytes / GIBIBYTE
f16_size = f"{size_gb:.1f}GB"
# Format status based on upload state
if f16_result and hasattr(f16_result, "status"):
if f16_result.status == "uploading":
f16_status = f"⬆️ Uploading... ({f16_size})"
elif f16_result.status == "completed":
f16_status = f"[✅ {f16_size}]({f16_url})"
else:
f16_status = "⏳ Queued"
else:
# Default to available if no status tracking
f16_status = f"[✅ {f16_size}]({f16_url})"
content += f"| **F16** | Full precision GGUF (F16/F32 mixed) | {f16_status} |\n"
content += """
**Key:** `E` = Embeddings, `O` = Output, `A` = Attention, `F` = FFN
See [Bartowski Analysis](https://git.tomfos.tr/tom/llm-gguf-tools/src/branch/main/docs/bartowski_analysis.md)
for detailed quantisation strategies and [Documentation](https://git.tomfos.tr/tom/llm-gguf-tools/src/branch/main/docs/)
for more on the tools and methods I use.
"""
# Add warning for unsupported architectures
if architecture:
supported_archs = {
"llama",
"qwen2",
"gemma",
"phi3",
"falcon",
"gpt2",
"gptj",
"gptneox",
"mpt",
"baichuan",
"stablelm",
}
if architecture not in supported_archs:
content += (
f"⚠️ **Note:** This model uses the `{architecture}` architecture, which is not "
"yet supported by llama.cpp for quantisation. If quantisations failed, this is "
"why - llama.cpp cannot quantise architectures it doesn't recognise. The F16 "
"GGUF file is provided as a full-precision fallback (requires ~2x model size "
f"in VRAM). For `{architecture}` support, check with your inference software "
"or wait for llama.cpp updates.\n\n"
)
content += (
"See [Bartowski Analysis](https://git.tomfos.tr/tom/llm-gguf-tools/src/branch/main/docs/"
"bartowski_analysis.md) for detailed quantisation strategies and "
"[Documentation](https://git.tomfos.tr/tom/llm-gguf-tools/src/branch/main/docs/) "
"for more on the tools and methods I use.\n\n"
)
# Add original content
if original_content["readme"]:
content += "## Original Model Card\n\n---\n\n" + original_content["readme"]
@ -570,6 +679,15 @@ for more on the tools and methods I use.
if hasattr(result, "status") and result.status in status_map:
base_status = status_map[result.status]
# Check for architecture not supported error
if (
result.status == "failed"
and hasattr(result, "error_message")
and result.error_message
and "architecture not supported" in str(result.error_message).lower()
):
return "⚠️ Skipped"
if result.status == "uploading" and hasattr(result, "file_size") and result.file_size:
return f"{base_status} ({result.file_size})"
if result.status == "completed" or (hasattr(result, "success") and result.success):

View file

@ -0,0 +1,258 @@
"""Importance matrix generation service.
Generates importance matrices using llama-imatrix binary with calibration
data for improved quantisation quality.
"""
from __future__ import annotations
import os
import platform
import subprocess
from pathlib import Path
from typing import TYPE_CHECKING
from helpers.logger import logger
from helpers.services.binary_manager import BinaryManager
if TYPE_CHECKING:
from helpers.models.quantisation import ModelSource
class IMatrixGenerator:
"""Generates importance matrices for quantisation guidance.
Uses llama-imatrix binary to compute importance matrices from
calibration data, which helps preserve model quality during
quantisation by identifying critical weights.
"""
# Default calibration data location
CALIBRATION_DATA = Path("resources") / "imatrix_data.txt"
def __init__(self) -> None:
"""Initialise imatrix generator."""
self.binary_manager = BinaryManager()
self.imatrix_binary = self._get_imatrix_binary()
def _get_imatrix_binary(self) -> Path | None:
"""Get llama-imatrix binary, downloading if necessary.
Returns:
Path to binary if found, None otherwise.
"""
# First check local directory for manual placement
local_binary = Path("./llama-imatrix")
if local_binary.exists():
logger.info(f"Using local llama-imatrix binary: {local_binary}")
return local_binary
# Download from GitHub releases
binary_path = self.binary_manager.get_imatrix_binary()
if binary_path and self.binary_manager.check_binary_works(binary_path):
logger.info(f"Using llama-imatrix binary: {binary_path}")
return binary_path
logger.warning("llama-imatrix binary not available")
return None
def can_generate(self) -> bool:
"""Check if imatrix generation is available.
Returns:
True if binary and calibration data are available.
"""
return self.imatrix_binary is not None and self.CALIBRATION_DATA.exists()
def generate_imatrix(
self,
f16_model_path: Path,
output_path: Path,
calibration_data: Path | None = None,
) -> bool:
"""Generate importance matrix for a model.
Returns:
True if generation successful, False otherwise.
"""
validation_error = self._validate_generation_inputs(f16_model_path, calibration_data)
if validation_error:
logger.error(validation_error)
return False
cal_data = calibration_data or self.CALIBRATION_DATA
cmd = self._build_imatrix_command(f16_model_path, cal_data, output_path)
self._log_generation_start(f16_model_path, cal_data, output_path)
return self._execute_imatrix_generation(cmd, output_path)
def _validate_generation_inputs(
self,
f16_model_path: Path,
calibration_data: Path | None,
) -> str | None:
"""Validate inputs for imatrix generation.
Returns:
Error message if validation fails, None if valid.
"""
if not self.imatrix_binary:
return "llama-imatrix binary not available"
if not f16_model_path.exists():
return f"Model file not found: {f16_model_path}"
cal_data = calibration_data or self.CALIBRATION_DATA
if not cal_data.exists():
return f"Calibration data not found: {cal_data}"
return None
def _build_imatrix_command(
self,
f16_model_path: Path,
cal_data: Path,
output_path: Path,
) -> list[str]:
"""Build command for imatrix generation.
Returns:
Command list ready for subprocess execution.
"""
return [
str(self.imatrix_binary),
"-m",
str(f16_model_path),
"-f",
str(cal_data),
"-o",
str(output_path),
"--chunks",
"128", # Process in chunks for stability
]
def _log_generation_start(
self,
f16_model_path: Path,
cal_data: Path,
output_path: Path,
) -> None:
"""Log the start of imatrix generation."""
logger.info("🧮 Generating importance matrix...")
logger.info(f"📊 Model: {f16_model_path.name}")
logger.info(f"📝 Calibration data: {cal_data.name}")
logger.info(f"💾 Output: {output_path.name}")
def _execute_imatrix_generation(self, cmd: list[str], output_path: Path) -> bool:
"""Execute the imatrix generation process.
Returns:
True if generation completed successfully, False otherwise.
"""
# Set LD_LIBRARY_PATH for shared libraries
env = os.environ.copy()
if platform.system() != "Windows":
lib_path = str(self.binary_manager.BINARY_DIR)
if "LD_LIBRARY_PATH" in env:
env["LD_LIBRARY_PATH"] = f"{lib_path}:{env['LD_LIBRARY_PATH']}"
else:
env["LD_LIBRARY_PATH"] = lib_path
try:
process = subprocess.Popen(
cmd,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
universal_newlines=True,
bufsize=1,
env=env,
)
self._stream_process_output(process)
return self._handle_process_completion(process, output_path)
except Exception as e:
logger.error(f"❌ Imatrix generation failed: {e}")
return False
def _stream_process_output(self, process: subprocess.Popen[str]) -> None:
"""Stream output from the running process."""
while True:
if process.stdout is not None:
output = process.stdout.readline()
else:
break
if not output and process.poll() is not None:
break
if output:
# Filter progress updates for cleaner output
line = output.strip()
if line and not line.startswith("["):
logger.info(f" {line}")
def _handle_process_completion(self, process: subprocess.Popen[str], output_path: Path) -> bool:
"""Handle completion of the imatrix generation process.
Returns:
True if process completed successfully and output exists, False otherwise.
"""
return_code = process.poll()
if return_code != 0:
logger.error(f"❌ Imatrix generation failed with return code {return_code}")
return False
if not output_path.exists():
logger.error("Generation completed but output file not found")
return False
size_mb = output_path.stat().st_size / (1024 * 1024)
logger.info(f"✅ Generated imatrix: {output_path.name} ({size_mb:.1f} MB)")
return True
def prompt_for_generation(
self,
model_source: ModelSource,
model_dir: Path,
f16_model_path: Path,
) -> Path | None:
"""Prompt user to generate imatrix.
Args:
model_source: Model source information.
model_dir: Model directory.
f16_model_path: Path to F16 model.
Returns:
Path to generated imatrix or None if skipped.
"""
if not self.can_generate():
logger.info("⚠️ Imatrix generation not available (missing binary or calibration data)")
return None
logger.info("\n" + "=" * 70)
logger.info("📊 Importance Matrix Generation")
logger.info("=" * 70)
logger.info(
"\nImportance matrices improve quantisation quality by identifying"
"\ncritical weights in the model. This process takes 5-10 minutes"
"\nbut significantly improves the quality of smaller quantisations."
)
logger.info(f"\nModel: {model_source.model_name}")
logger.info(f"Calibration data: {self.CALIBRATION_DATA.name}")
response = input("\n❓ Generate importance matrix? (Y/n): ").strip().lower()
if response == "n":
logger.info("Skipping imatrix generation")
return None
# Generate imatrix
output_path = model_dir / "imatrix.dat"
logger.info("\n⏳ Generating importance matrix (this may take 5-10 minutes)...")
if self.generate_imatrix(f16_model_path, output_path):
return output_path
logger.warning("Failed to generate imatrix, continuing without it")
return None
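A hedged usage sketch of the generator (the module path matches the `from helpers.services.imatrix_generator import IMatrixGenerator` import added later in this commit; the file paths are placeholders):
from pathlib import Path
from helpers.services.imatrix_generator import IMatrixGenerator
generator = IMatrixGenerator()
if generator.can_generate():  # requires llama-imatrix plus resources/imatrix_data.txt
    ok = generator.generate_imatrix(
        f16_model_path=Path("work/example-f16.gguf"),  # placeholder paths
        output_path=Path("work/imatrix.dat"),
    )
    print("imatrix written" if ok else "imatrix generation failed")
else:
    print("imatrix generation unavailable")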

View file

@ -1,82 +1,294 @@
"""Importance matrix (imatrix) management service.
"""Direct llama.cpp binary execution service.
Manages detection and use of existing importance matrix files for
quantisation guidance. Provides user prompts for supplying pre-computed
imatrix files from external sources.
Provides direct execution of llama.cpp quantisation binary with proper
tensor-specific override support for L and XL variants.
"""
from __future__ import annotations
import os
import platform
import subprocess
from pathlib import Path
from typing import TYPE_CHECKING
from helpers.logger import logger
from helpers.services.binary_manager import BinaryManager
from helpers.services.filesystem import FilesystemService
if TYPE_CHECKING:
from pathlib import Path
from helpers.models.quantisation import QuantisationConfig
class IMatrixManager:
"""Handles importance matrix file management for quantisation.
class QuantisationExecutor:
"""Executes llama.cpp quantisation with tensor overrides.
Locates existing importance matrix files or prompts users to provide
pre-computed matrices from external sources. These matrices guide
quantisation decisions to preserve model quality.
Provides direct binary execution with proper command-line flags for
tensor-specific overrides, supporting Bartowski-style L and XL variants.
"""
def __init__(self) -> None:
"""Initialise IMatrixManager."""
"""Initialise quantisation executor."""
self.fs = FilesystemService()
self.binary_manager = BinaryManager()
self.quantise_binary = self._get_quantise_binary()
self.last_error: str | None = None # Track last error type
def _get_quantise_binary(self) -> Path | None:
"""Get llama-quantize binary, downloading if necessary.
Returns:
Path to binary if found, None otherwise.
"""
# First check local directory for manual placement
local_binary = Path("./llama-quantize")
if local_binary.exists():
logger.info(f"Using local llama-quantize binary: {local_binary}")
return local_binary
# Download from GitHub releases
binary_path = self.binary_manager.get_quantise_binary()
if binary_path and self.binary_manager.check_binary_works(binary_path):
logger.info(f"Using llama-quantize binary: {binary_path}")
return binary_path
logger.error("Failed to obtain llama-quantize binary")
logger.info(
"You can manually download it from: https://github.com/ggml-org/llama.cpp/releases"
)
return None
def execute_quantisation(
self,
input_path: Path,
output_path: Path,
config: QuantisationConfig,
imatrix_path: Path | None = None,
) -> bool:
"""Execute quantisation using llama.cpp binary.
Builds and executes llama-quantize command with proper tensor override
flags for L and XL variants.
Returns:
True if quantisation successful, False otherwise.
"""
if not self.quantise_binary:
logger.error("llama-quantize binary not available")
return False
# Build command
cmd = self._build_quantisation_command(input_path, output_path, config, imatrix_path)
# Execute with real-time output
return self._execute_command(cmd)
def _build_quantisation_command(
self,
input_path: Path,
output_path: Path,
config: QuantisationConfig,
imatrix_path: Path | None,
) -> list[str]:
"""Build llama-quantize command with tensor overrides.
Returns:
Command arguments as list.
"""
cmd = [str(self.quantise_binary)]
# Add imatrix if available
if imatrix_path:
cmd.extend(["--imatrix", str(imatrix_path)])
if imatrix_path.exists():
logger.info(f"🧮 Using imatrix: {imatrix_path.name}")
# Add tensor-specific overrides for L and XL variants
if config.embedding_type:
# Use directly from config - already in correct format
cmd.extend(["--token-embedding-type", config.embedding_type.lower()])
logger.info(f"⚙️ Token embedding type: {config.embedding_type}")
if config.output_type:
# Use directly from config - already in correct format
cmd.extend(["--output-tensor-type", config.output_type.lower()])
logger.info(f"⚙️ Output tensor type: {config.output_type}")
# Note: Per-layer tensor overrides could be added here if needed in future
# For now, embedding and output overrides handle the L/XL variants
# Get base quantisation type
base_quant = self._get_base_quantisation_type(config.name)
# Add input, output, and base quantisation type
cmd.extend([str(input_path), str(output_path), base_quant])
return cmd
def _get_base_quantisation_type(self, config_name: str) -> str:
"""Get base quantisation type for a config.
Maps custom variants (Q3_K_L, Q3_K_XL) to their base types (Q3_K_M).
Returns:
Base quantisation type string.
"""
# Mapping of custom variants to base types
variant_mapping = {
"Q3_K_L": "Q3_K_M",
"Q3_K_XL": "Q3_K_M",
"Q4_K_L": "Q4_K_M",
"Q4_K_XL": "Q4_K_M",
"Q5_K_L": "Q5_K_M",
"Q5_K_XL": "Q5_K_M",
"Q6_K_L": "Q6_K",
"Q6_K_XL": "Q6_K",
}
return variant_mapping.get(config_name, config_name)
def _execute_command(self, cmd: list[str]) -> bool:
"""Execute command with real-time output streaming.
Returns:
True if successful, False otherwise.
"""
logger.info(f"💻 Running: {' '.join(cmd)}")
logger.info("⏳ Quantisation in progress... (this may take several minutes)")
# Set LD_LIBRARY_PATH for shared libraries
env = os.environ.copy()
if platform.system() != "Windows":
lib_path = str(self.binary_manager.BINARY_DIR)
if "LD_LIBRARY_PATH" in env:
env["LD_LIBRARY_PATH"] = f"{lib_path}:{env['LD_LIBRARY_PATH']}"
else:
env["LD_LIBRARY_PATH"] = lib_path
# Track output for architecture detection
output_lines = []
architecture_error = False
try:
process = subprocess.Popen(
cmd,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
universal_newlines=True,
bufsize=1,
env=env,
)
# Stream output
while True:
if process.stdout is not None:
output = process.stdout.readline()
else:
break
if not output and process.poll() is not None:
break
if output:
output_stripped = output.strip()
logger.info(f"📊 {output_stripped}")
output_lines.append(output_stripped)
# Check for architecture-related errors
if any(
phrase in output_stripped.lower()
for phrase in [
"unsupported architecture",
"unknown architecture",
"architecture not supported",
"model architecture",
"llama_model_load: error loading model",
]
):
architecture_error = True
return_code = process.poll()
if return_code == 0:
logger.info("✅ Quantisation successful!")
return True
# Check if this was an architecture error
if architecture_error or return_code == 1:
# Look for architecture info in recent output
for line in output_lines[-10:]: # Check last 10 lines
if "architecture" in line.lower():
logger.error("❌ Architecture not supported by llama.cpp")
logger.error(" so cannot be quantised with current llama.cpp but")
logger.error(" F16 GGUF file can be used for inference if supported")
# Store this for the orchestrator to detect
self.last_error = "unsupported_architecture"
return False
logger.error(f"❌ Quantisation failed with return code {return_code}")
except Exception as e:
logger.error(f"❌ Quantisation failed with exception: {e}")
return False
else:
return False
class IMatrixHandler:
"""Handles importance matrix file management.
Manages detection and use of existing importance matrix files for
quantisation guidance.
"""
def __init__(self) -> None:
"""Initialise IMatrixHandler."""
self.fs = FilesystemService()
def find_imatrix(self, model_dir: Path) -> Path | None:
"""Find or prompt for importance matrix file.
Searches for existing imatrix files first, then provides interactive
prompts for user-supplied matrices. See docs/imatrix_data.md for
instructions on generating imatrix files.
"""Find existing imatrix file in model directory.
Returns:
Path to imatrix file, or None if not available.
Path to imatrix file if found, None otherwise.
"""
imatrix_path = model_dir / "imatrix.dat"
# Check for existing imatrix
if imatrix_path.exists():
logger.info(f"Found existing imatrix: {imatrix_path.name}")
file_size = self.fs.get_file_size(imatrix_path)
logger.info(f"Found existing imatrix: {imatrix_path.name} ({file_size})")
return imatrix_path
# Try user-provided imatrix
return self._prompt_for_user_imatrix(model_dir, imatrix_path)
return None
def _prompt_for_user_imatrix(self, model_dir: Path, imatrix_path: Path) -> Path | None:
def prompt_for_user_imatrix(self, model_dir: Path) -> Path | None:
"""Prompt user for existing imatrix file.
Returns:
Path to user-provided imatrix, or None if not available.
"""
imatrix_path = model_dir / "imatrix.dat"
logger.info(f"Model directory: {model_dir}")
logger.info(f"Looking for imatrix file at: {imatrix_path}")
logger.info("\n" + "=" * 70)
logger.info("📊 No existing imatrix file found")
logger.info("\nYou have two options:")
logger.info(" 1. Provide a pre-computed imatrix file")
logger.info(" (💡 see docs/imatrix_data.md to generate your own)")
logger.info(" 2. Skip imatrix usage (lower quality quantisation)")
logger.info("=" * 70)
logger.info(
"Tip: You can download pre-computed imatrix files from Bartowski's repositories!"
)
logger.info(
" Example: https://huggingface.co/bartowski/MODEL-NAME-GGUF/resolve/main/MODEL-NAME.imatrix"
)
response = input("\n❓ Do you have an imatrix file to provide? (y/N): ").strip().lower()
response = (
input("\n❓ Do you have an imatrix file to place in the model directory? (y/N): ")
.strip()
.lower()
)
if response != "y":
logger.info("Continuing without imatrix (quantisation quality may be lower)")
logger.info(" See docs/imatrix_data.md for instructions on generating imatrix files") # noqa: RUF001
return None
logger.info(f"\nPlease place your imatrix.dat file in: {model_dir}")
input("⏳ Press Enter when you've placed the file (or Ctrl+C to cancel)...")
logger.info(f"Please place your imatrix.dat file in: {model_dir}")
input("⏳ Press Enter when you've placed the imatrix.dat file (or Ctrl+C to cancel)...")
if imatrix_path.exists():
file_size = self.fs.get_file_size(imatrix_path)
logger.info(f"✅ Found imatrix file! ({file_size})")
logger.info(f"Found imatrix file! ({file_size})")
return imatrix_path
logger.warning("No imatrix.dat file found - continuing without imatrix")

View file

@ -86,8 +86,8 @@ class LlamaCppPythonAPI:
raise RuntimeError(msg)
# Normalise the config name to extract base type
# E.g., "Q4_K_L" or "Q4_K_XL" -> "Q4_K_M" (default for Q4_K)
# E.g., "Q4_K_M_XXL" -> "Q4_K_M"
# e.g. "Q4_K_L" or "Q4_K_XL" -> "Q4_K_M" (default for Q4_K)
# e.g. "Q4_K_M_XXL" -> "Q4_K_M"
config_upper = config_name.upper()
# Direct mapping for exact matches
@ -224,7 +224,7 @@ class LlamaCppPythonAPI:
Args:
input_path: Path to input GGUF model.
output_path: Path for output quantised model.
base_type: Base quantisation type (e.g., "Q4_K_M", "Q6_K").
base_type: Base quantisation type (e.g. "Q4_K_M", "Q6_K").
embedding_type: Override for token embeddings (None = use base).
output_type: Override for output/lm_head layers (None = use base).
imatrix_path: Optional importance matrix file.
@ -470,7 +470,7 @@ class LlamaCppPythonAPI:
"""Log current resource usage state.
Args:
phase: Description of current phase (e.g., "before", "after").
phase: Description of current phase (e.g. "before", "after").
Returns:
Current memory usage in GB.

View file

@ -31,12 +31,14 @@ from helpers.models.quantisation import (
QuantisationType,
)
from helpers.services.huggingface import ReadmeGenerator
from helpers.services.llama_cpp import IMatrixManager
from helpers.services.imatrix_generator import IMatrixGenerator
from helpers.services.llama_cpp import IMatrixHandler
from helpers.services.quantisation import HuggingFaceUploader, ModelManager, QuantisationEngine
from helpers.utils.tensor_mapping import URLParser
if TYPE_CHECKING:
from types import FrameType
from typing import Any
@dataclass(slots=True)
@ -55,7 +57,8 @@ class QuantisationOrchestrator:
# Service dependencies with factory defaults
url_parser: URLParser = field(default_factory=URLParser)
quantisation_engine: QuantisationEngine = field(default_factory=QuantisationEngine)
imatrix_manager: IMatrixManager = field(default_factory=IMatrixManager)
imatrix_handler: IMatrixHandler = field(default_factory=IMatrixHandler)
imatrix_generator: IMatrixGenerator = field(default_factory=IMatrixGenerator)
readme_generator: ReadmeGenerator = field(default_factory=ReadmeGenerator)
uploader: HuggingFaceUploader = field(default_factory=HuggingFaceUploader)
@ -172,18 +175,28 @@ class QuantisationOrchestrator:
self.models_dir.mkdir(parents=True, exist_ok=True)
f16_model_path = self.model_manager.prepare_model(model_source)
imatrix_path = None
if self.use_imatrix:
logger.info("Checking for importance matrix (imatrix)...")
imatrix_path = self.imatrix_manager.find_imatrix(
self.models_dir / model_source.model_name
)
output_repo = (
f"{self.uploader.get_username()}/"
f"{model_source.original_author}-{model_source.model_name}-GGUF"
)
imatrix_path = None
if self.use_imatrix:
logger.info("Checking for importance matrix (imatrix)...")
model_dir = self.models_dir / model_source.model_name
imatrix_path = self.imatrix_handler.find_imatrix(model_dir)
# If no imatrix found, offer to generate or provide one
if not imatrix_path:
# First offer to generate
imatrix_path = self.imatrix_generator.prompt_for_generation(
model_source, model_dir, f16_model_path
)
# If generation was skipped, offer to provide existing one
if not imatrix_path:
imatrix_path = self.imatrix_handler.prompt_for_user_imatrix(model_dir)
return model_source, f16_model_path, imatrix_path, output_repo
def _create_initial_repository(self, model_source: ModelSource, output_repo: str) -> None:
@ -222,10 +235,63 @@ class QuantisationOrchestrator:
types_list = [qt.value for qt in quantisation_types]
logger.info(f"Processing {len(quantisation_types)} quantisation types: {types_list}")
# Track F16 in results for status display (if we converted from SafeTensors)
if not model_source.is_gguf_repo:
# Get F16 file size
f16_size = "-"
if f16_model_path.exists():
size_bytes = f16_model_path.stat().st_size
size_gb = size_bytes / (1024**3)
f16_size = f"{size_gb:.1f}GB"
# Create a simple object for F16 tracking (not a QuantisationResult)
# since F16 isn't a quantisation type in our enum
f16_result = type(
"F16Result",
(),
{
"quantisation_type": "F16",
"success": True,
"status": "planned",
"file_path": f16_model_path,
"file_size": f16_size,
},
)()
results["F16"] = f16_result
# Process with parallel uploads - quantise sequentially but upload in background
upload_futures = []
upload_futures: list[Any] = []
architecture_unsupported = False
with ThreadPoolExecutor(max_workers=2, thread_name_prefix="upload") as upload_executor:
# Start F16 upload first if we have one
if not model_source.is_gguf_repo and not self.no_upload and "F16" in results:
f16_result = results["F16"]
if f16_result.file_path and f16_result.file_path.exists():
logger.info("Starting parallel upload of F16 GGUF...")
f16_result.status = "uploading"
self._update_readme_status(model_source, results, output_repo)
upload_future = upload_executor.submit(
self._upload_f16_and_cleanup,
output_repo,
f16_result.file_path,
model_source,
results,
)
upload_futures.append(upload_future)
for i, quant_type in enumerate(quantisation_types, 1):
# Skip remaining quantisations if architecture is unsupported
if architecture_unsupported:
logger.info(f"Skipping {quant_type.value} - architecture not supported")
results[quant_type] = QuantisationResult(
quantisation_type=quant_type,
success=False,
status="failed",
error_message="Architecture not supported by llama.cpp",
)
continue
logger.info(
f"Processing quantisation {i}/{len(quantisation_types)}: {quant_type.value}"
)
@ -247,6 +313,30 @@ class QuantisationOrchestrator:
results[quant_type] = result
logger.debug(f"DEBUG: Quantisation {quant_type.value} completed")
# Check if this failed due to unsupported architecture
if (
not result.success
and hasattr(self.quantisation_engine.executor, "last_error")
and self.quantisation_engine.executor.last_error
== "unsupported_architecture"
):
logger.warning(
"Architecture not supported - skipping remaining quantisations"
)
architecture_unsupported = True
# Update the current result to also show as skipped
result.error_message = "Architecture not supported by llama.cpp"
# Update README immediately to show remaining quantisations as skipped
for remaining_quant_type in quantisation_types[i:]:
if remaining_quant_type not in results:
results[remaining_quant_type] = QuantisationResult(
quantisation_type=remaining_quant_type,
success=False,
status="failed",
error_message="Architecture not supported by llama.cpp",
)
self._update_readme_status(model_source, results, output_repo)
# Force cleanup between quantisations
gc.collect()
logger.debug("DEBUG: Garbage collection completed")
@ -269,6 +359,14 @@ class QuantisationOrchestrator:
# Wait for all uploads to complete before returning
self._wait_for_uploads(upload_futures)
# Final README update to ensure all statuses are accurate
if not self.no_upload and upload_futures:
logger.info("Updating README with final status...")
final_readme = self.readme_generator.generate(
model_source, results, self.models_dir, output_repo
)
self.uploader.upload_readme(output_repo, final_readme)
return results
def _process_single_quantisation(
@ -505,12 +603,26 @@ class QuantisationOrchestrator:
def _wait_for_uploads(self, upload_futures: list) -> None:
"""Wait for all parallel uploads to complete."""
logger.info("Waiting for any remaining uploads to complete...")
if not upload_futures:
return
logger.info(f"Waiting for {len(upload_futures)} uploads to complete...")
completed = 0
failed = 0
for future in upload_futures:
try:
future.result(timeout=300) # 5 minute timeout per upload
completed += 1
logger.info(f"Upload progress: {completed}/{len(upload_futures)} completed")
except Exception as e:
logger.warning(f"Upload error: {e}")
failed += 1
logger.warning(f"Upload error ({completed + failed}/{len(upload_futures)}): {e}")
if failed > 0:
logger.warning(f"Upload summary: {completed} succeeded, {failed} failed")
else:
logger.info(f"All {completed} uploads completed successfully")
def _cleanup_files(self, f16_model_path: Path, model_source: ModelSource) -> None:
"""Clean up temporary files after processing."""
@@ -573,6 +685,45 @@ class QuantisationOrchestrator:
)
# Don't re-raise - let other uploads continue
def _upload_f16_and_cleanup(
self,
output_repo: str,
file_path: Path,
model_source: ModelSource,
results: dict[str, QuantisationResult],
) -> None:
"""Upload F16 file and clean up (runs in background thread)."""
try:
logger.info(f"[PARALLEL] Starting upload of F16 GGUF ({file_path.name})")
self.uploader.upload_model_file(output_repo, file_path)
logger.info("[PARALLEL] Upload of F16 GGUF completed successfully")
# Don't delete F16 yet - we still need it for quantisations
# It will be deleted in _cleanup_files after all quantisations complete
results["F16"].status = "completed"
updated_readme_path = self.readme_generator.generate(
model_source, results, self.models_dir, output_repo
)
self.uploader.upload_readme(output_repo, updated_readme_path)
logger.info("[PARALLEL] F16 upload complete")
except Exception as e:
logger.error(f"[PARALLEL] Failed to upload F16: {e}")
results["F16"].status = "failed"
results["F16"].error_message = str(e)
try:
updated_readme_path = self.readme_generator.generate(
model_source, results, self.models_dir, output_repo
)
self.uploader.upload_readme(output_repo, updated_readme_path)
except Exception as readme_error:
logger.error(
f"[PARALLEL] Failed to update README after F16 upload error: {readme_error}"
)
# Don't re-raise - let other uploads continue
def _print_model_info(self, model_source: ModelSource) -> None:
"""Print model information."""
logger.info(f"Source URL: {model_source.url}")

View file

@@ -22,7 +22,7 @@ from helpers.models.quantisation import (
)
from helpers.services.filesystem import FilesystemService
from helpers.services.gguf import GGUFConverter
from helpers.services.llama_python import LlamaCppPythonAPI
from helpers.services.llama_cpp import QuantisationExecutor
from helpers.utils.config_parser import ConfigParser
from helpers.utils.tensor_mapping import TensorMapper
@@ -32,30 +32,28 @@ class QuantisationEngine:
Provides flexible quantisation execution supporting multiple tensor
precision configurations, importance matrices, and fallback strategies.
Uses llama-cpp-python API for direct quantisation without subprocess overhead.
Uses direct llama.cpp binary execution with proper tensor overrides.
"""
def __init__(self) -> None:
"""Initialise quantisation engine."""
self.fs = FilesystemService()
self.python_api = LlamaCppPythonAPI()
self.executor = QuantisationExecutor()
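For context on the binary path the new executor wraps: llama-quantize takes per-tensor overrides for the token embeddings and the output head, which is how the Bartowski-style L/XL variants are produced. A hedged sketch of the kind of command involved; the actual QuantisationExecutor is not part of this diff, and the flag spellings (--token-embedding-type, --output-tensor-type, --imatrix) should be verified against the bundled llama-quantize build:

from pathlib import Path

def build_quantise_command(
    binary: Path,
    f16_model: Path,
    output: Path,
    base_type: str = "Q3_K_M",
    embedding_type: str | None = "q8_0",  # override for token embeddings
    output_type: str | None = "q6_k",     # override for the output head
    imatrix: Path | None = None,
) -> list[str]:
    """Assemble an illustrative llama-quantize invocation with tensor overrides."""
    cmd = [str(binary)]
    if imatrix is not None:
        cmd += ["--imatrix", str(imatrix)]
    if embedding_type is not None:
        cmd += ["--token-embedding-type", embedding_type]
    if output_type is not None:
        cmd += ["--output-tensor-type", output_type]
    cmd += [str(f16_model), str(output), base_type]
    return cmd

# e.g. Q3_K_XL = Q3_K_M base, Q8_0 embeddings, Q6_K output:
# subprocess.run(build_quantise_command(Path("llama-quantize"),
#     Path("model-f16.gguf"), Path("model-Q3_K_XL.gguf")), check=True)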
def quantise(self, context: QuantisationContext) -> QuantisationResult:
"""Perform quantisation using the specified configuration.
Executes quantisation using Python API. Since llama-cpp-python is a
required dependency, we can rely on it being available.
Executes quantisation using direct llama.cpp binary with proper
tensor override flags for L and XL variants.
Returns:
QuantisationResult with success status and file information.
"""
logger.debug(f"DEBUG: Starting quantisation for {context.config.name}")
logger.info(
f"⚙️ Creating {context.config.name} quantisation ({context.config.description})..."
)
output_path = context.get_output_path()
logger.debug(f"DEBUG: Output path: {output_path}")
# Check input file exists and is readable
if not context.f16_model_path.exists():
@@ -67,34 +65,20 @@ class QuantisationEngine:
error_message=error_msg,
)
# Check if we have enough disk space (rough estimate)
try:
input_size = context.f16_model_path.stat().st_size
logger.debug(f"DEBUG: Input file size: {input_size / (1024**3):.2f} GB")
# This is a rough check - actual available space calculation is more complex
logger.debug(f"DEBUG: Output directory: {output_path.parent}")
except Exception as e:
logger.warning(f"⚠️ Could not check disk space: {e}")
logger.info(f"🎯 Attempting {context.config.name} quantisation...")
logger.debug(f"DEBUG: Source: {context.f16_model_path}")
logger.debug(f"DEBUG: Target: {output_path}")
logger.debug(f"DEBUG: imatrix: {context.imatrix_path}")
logger.info(f"📝 Source: {context.f16_model_path}")
logger.info(f"📝 Target: {output_path}")
try:
# Use Python API for quantisation
logger.info("🐍 Using Python API for quantisation...")
logger.debug("DEBUG: Calling python_api.quantise_model...")
# Use direct binary execution for quantisation
logger.info("🔧 Using llama.cpp binary for quantisation...")
success = self.python_api.quantise_model(
success = self.executor.execute_quantisation(
context.f16_model_path, output_path, context.config, context.imatrix_path
)
logger.debug(f"DEBUG: Python API returned: {success}")
if success:
logger.debug("DEBUG: Quantisation successful, creating success result")
return self._create_success_result(context.config.name, output_path, "Python API")
return self._create_success_result(context.config.name, output_path, "llama.cpp")
logger.error(f"{context.config.name} quantisation failed")
return QuantisationResult(
@@ -175,7 +159,7 @@ class ModelManager:
logger.info(f"⬇️ Downloading GGUF file from repository: {model_source.source_model}")
logger.info(f"🔍 Looking for file pattern: *{model_source.gguf_file_pattern}*")
f16_model = model_dir / f"{model_source.model_name}-f16.gguf"
f16_model = model_dir / f"{model_source.original_author}-{model_source.model_name}-f16.gguf"
if f16_model.exists():
logger.info(f"✅ Found existing F16 file: {f16_model.name}")
@@ -339,9 +323,17 @@ class ModelManager:
Raises:
RuntimeError: If download fails.
"""
# Ensure the model directory and .huggingface subdirectory exist
model_dir.mkdir(parents=True, exist_ok=True)
huggingface_dir = model_dir / ".huggingface"
huggingface_dir.mkdir(parents=True, exist_ok=True)
try:
logger.debug(f"DEBUG: Downloading full repository: {source_model}")
result = subprocess.run(
logger.info(f"⬇️ Downloading full repository: {source_model}")
logger.info("📊 Progress will be shown below...")
# Use subprocess.Popen to stream output in real-time
process = subprocess.Popen(
[
"huggingface-cli",
"download",
@@ -349,13 +341,34 @@ class ModelManager:
"--local-dir",
str(model_dir),
],
check=True,
capture_output=True,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
text=True,
bufsize=1, # Line buffered
universal_newlines=True,
)
logger.debug(
f"DEBUG: Repository download completed with return code {result.returncode}"
)
# Stream output line by line
for line in process.stdout:
# Log download progress lines
if line.strip():
# Check if it's a progress line (contains %)
if "%" in line or "Downloading" in line or "Fetching" in line:
# Use info level for progress lines
logger.info(f" {line.strip()}")
else:
# Use debug for other output
logger.debug(f" {line.strip()}")
# Wait for process to complete
return_code = process.wait()
if return_code != 0:
msg = f"Repository download failed with return code {return_code}"
raise RuntimeError(msg)
logger.info("✅ Repository download completed successfully")
except subprocess.CalledProcessError as e:
logger.error(f"❌ Failed to download repository {source_model}")
logger.error(f"Return code: {e.returncode}")
@@ -386,7 +399,7 @@ class ModelManager:
RuntimeError: If conversion fails.
"""
logger.info("🔄 Converting to GGUF F16 format...")
f16_model = model_dir / f"{model_source.model_name}-f16.gguf"
f16_model = model_dir / f"{model_source.original_author}-{model_source.model_name}-f16.gguf"
if f16_model.exists():
logger.info("✅ F16 model already exists")
@@ -414,6 +427,28 @@ class ModelManager:
if arch != arch_name:
logger.info(f"📝 Architecture mapping: {arch_name}{arch}")
# Check if architecture is supported by llama.cpp
supported_archs = {
"llama",
"qwen2",
"gemma",
"phi3",
"falcon",
"gpt2",
"gptj",
"gptneox",
"mpt",
"baichuan",
"stablelm",
}
if arch not in supported_archs:
logger.warning("=" * 70)
logger.warning(f"⚠️ Architecture '{arch_name}' may not be supported by llama.cpp")
logger.warning(f"⚠️ The GGUF will be created with architecture: '{arch}'")
logger.warning("⚠️ Check if your inference software supports this architecture.")
logger.warning("=" * 70)
# Convert using GGUFConverter
tensor_mapper = TensorMapper()
success = GGUFConverter.convert_safetensors(

View file

@@ -107,28 +107,44 @@ class ConfigParser:
@staticmethod
def get_architecture_mapping(architecture: str) -> str:
"""Map architecture names to known GGUF architectures.
"""Get the GGUF architecture name for a model.
Provides fallback mappings for architectures not directly supported
by GGUF format, translating them to similar known architectures. This
enables broader model compatibility whilst maintaining GGUF standards.
Returns the original architecture name to preserve model identity.
Only maps architectures that are truly compatible.
Returns:
GGUF-compatible architecture name with appropriate fallback to llama.
Architecture name for GGUF, preserving original when possible.
"""
# Architecture mappings to known GGUF types
mappings = {
"DotsOCRForCausalLM": "qwen2", # Similar architecture
"GptOssForCausalLM": "llama", # Use llama as fallback
"MistralForCausalLM": "llama", # Mistral is llama-like
"Qwen2ForCausalLM": "qwen2",
# Only map architectures that are ACTUALLY the same
# DO NOT map incompatible architectures
known_compatible = {
"LlamaForCausalLM": "llama",
"MistralForCausalLM": "llama", # Mistral IS llama-compatible
"Qwen2ForCausalLM": "qwen2",
"GemmaForCausalLM": "gemma",
"Phi3ForCausalLM": "phi3",
# Add more mappings as needed
"FalconForCausalLM": "falcon",
"GPT2LMHeadModel": "gpt2",
"GPTJForCausalLM": "gptj",
"GPTNeoXForCausalLM": "gptneox",
"MPTForCausalLM": "mpt",
"BaichuanForCausalLM": "baichuan",
"StableLMEpochForCausalLM": "stablelm",
}
return mappings.get(architecture, "llama") # Default to llama
if architecture in known_compatible:
return known_compatible[architecture]
# For unknown architectures, preserve the original name
# This will make it clear the model needs proper support
# Remove common suffixes to get cleaner architecture name
arch_name = architecture
for suffix in ["ForCausalLM", "LMHeadModel", "ForConditionalGeneration"]:
if arch_name.endswith(suffix):
arch_name = arch_name[: -len(suffix)]
break
return arch_name.lower()
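Behaviour of the rewritten mapping, as a quick usage example:

from helpers.utils.config_parser import ConfigParser

assert ConfigParser.get_architecture_mapping("MistralForCausalLM") == "llama"
assert ConfigParser.get_architecture_mapping("Qwen2ForCausalLM") == "qwen2"
# Unknown architectures keep their identity instead of silently becoming "llama"
assert ConfigParser.get_architecture_mapping("DotsOCRForCausalLM") == "dotsocr"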
@staticmethod
def load_tokeniser_config(model_path: Path) -> dict[str, Any]:
@@ -155,11 +171,33 @@ class ConfigParser:
config = fs.load_json_config(tokeniser_config_path)
# Extract token IDs with defaults
# Try to find special token IDs from added_tokens_decoder
added_tokens = config.get("added_tokens_decoder", {})
eos_token_id = config.get("eos_token_id")
bos_token_id = config.get("bos_token_id")
# If not directly specified, search in added_tokens_decoder
if eos_token_id is None:
for token_id, token_info in added_tokens.items():
if token_info.get("content") == "<|endoftext|>":
eos_token_id = int(token_id)
break
if bos_token_id is None:
for token_id, token_info in added_tokens.items():
if token_info.get("content") in {"<|im_start|>", "<s>", "<|startoftext|>"}:
bos_token_id = int(token_id)
break
# Extract token IDs with better defaults
return {
"bos_token_id": config.get("bos_token_id", 1),
"eos_token_id": config.get("eos_token_id", 2),
"bos_token_id": bos_token_id if bos_token_id is not None else 1,
"eos_token_id": eos_token_id if eos_token_id is not None else 2,
"unk_token_id": config.get("unk_token_id", 0),
"pad_token_id": config.get("pad_token_id", 0),
"pad_token_id": config.get(
"pad_token_id", eos_token_id if eos_token_id is not None else 0
),
"model_type": config.get("model_type", "llama"),
"add_bos_token": config.get("add_bos_token", True),
"add_eos_token": config.get("add_eos_token", False),
}
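The effect of the new fallback, sketched against a hypothetical tokenizer_config.json that only declares its special tokens via added_tokens_decoder (the token IDs are illustrative):

sample = {
    "added_tokens_decoder": {
        "151643": {"content": "<|endoftext|>"},
        "151644": {"content": "<|im_start|>"},
    },
    "model_type": "qwen2",
}

# Reproduce the lookup performed above on the sample config
eos = next(
    (int(i) for i, t in sample["added_tokens_decoder"].items() if t["content"] == "<|endoftext|>"),
    None,
)
bos = next(
    (
        int(i)
        for i, t in sample["added_tokens_decoder"].items()
        if t["content"] in {"<|im_start|>", "<s>", "<|startoftext|>"}
    ),
    None,
)
assert (bos, eos) == (151644, 151643)  # pad_token_id would then fall back to eos (151643)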

40
uv.lock generated
View file

@@ -496,26 +496,26 @@ wheels = [
[[package]]
name = "uv"
version = "0.8.6"
version = "0.8.8"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/b5/3b/1140dbbca9fb3ca32be38e01c670a5980a4ee4874366d70438317876d40a/uv-0.8.6.tar.gz", hash = "sha256:4d4e042f6bd9f143094051a05de758684028f451e563846cbc0c6f505b530cca", size = 3463644, upload-time = "2025-08-07T15:43:34.206Z" }
sdist = { url = "https://files.pythonhosted.org/packages/9c/d0/4cd8ac2c7938da78c8e9ca791205f80e74b0f5a680f2a2d50323d54961d0/uv-0.8.8.tar.gz", hash = "sha256:6880e96cd994e53445d364206ddb4b2fff89fd2fbc74a74bef4a6f86384b07d9", size = 3477036, upload-time = "2025-08-09T00:26:00.883Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/71/64/a96f40f95626c6e353e66f6bc5a5ca7c1399e95caf0dcb56cae38754e073/uv-0.8.6-py3-none-linux_armv6l.whl", hash = "sha256:d96ff3a1d06a6a00ed94dfb2996228153b3b5bfc892174b7556216ab872a91b1", size = 18437310, upload-time = "2025-08-07T15:42:49.611Z" },
{ url = "https://files.pythonhosted.org/packages/41/30/b2fed99d5a6b16410669f223767f6d65bc6595858622f5f36386892ed963/uv-0.8.6-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:fdceb1ef554df0ddc620bfe83fdcf740829e489c62f78ba1f089abd62c71c63e", size = 18615884, upload-time = "2025-08-07T15:42:53.452Z" },
{ url = "https://files.pythonhosted.org/packages/d7/82/a53684eadb9cb169eab32ab71f2bdaf7c382819d6de44d4e8df91ca14a00/uv-0.8.6-py3-none-macosx_11_0_arm64.whl", hash = "sha256:7c1f48279ff61940143c78b969094e13324988eabcfcd4799f4350d9d36c1d48", size = 17173005, upload-time = "2025-08-07T15:42:55.571Z" },
{ url = "https://files.pythonhosted.org/packages/e7/4a/2890d9ccaf4b383fea43ae6362252870dcd97dda7412f34f20d80ccf7a39/uv-0.8.6-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.musllinux_1_1_aarch64.whl", hash = "sha256:1913f5627c57076c88dd38b0173bdb006ae9b8dbd92b1798a1acc9d744c1a7cc", size = 17813305, upload-time = "2025-08-07T15:42:57.998Z" },
{ url = "https://files.pythonhosted.org/packages/9b/c3/33a10049728ffbcde673b75b9a73cd61bfab5e1598d935d1f1b2556b07a4/uv-0.8.6-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:7796acc3c5b84d5ee5e10cc6cf92eb61c19f6551855d0aa89ef5925e4a371fbf", size = 18159834, upload-time = "2025-08-07T15:43:00.207Z" },
{ url = "https://files.pythonhosted.org/packages/81/28/ff884f7007a6b9d0e3368dbe4ae7d28acacbaaf1b3a583640e5af6dc5360/uv-0.8.6-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:57a98367bfad38e870e1a8a6626464796ffcee6e937d429fbd7b25ddf46bb36f", size = 18954223, upload-time = "2025-08-07T15:43:03.577Z" },
{ url = "https://files.pythonhosted.org/packages/78/1d/a4ed2da913ecacc1c976e97dff905979c13359834eeeac8bbaf5ed0b2fca/uv-0.8.6-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:2ac28509db2e52613a59264bdb150d13274ed13e5b305f7e274da8cd83033985", size = 20215802, upload-time = "2025-08-07T15:43:06.181Z" },
{ url = "https://files.pythonhosted.org/packages/2c/12/c9ca1cc8bdbecd54db4a7c1a44808f15271da60838dfa9f180ce8171407a/uv-0.8.6-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:deab2ce32d2dd7a1c0de459aa23470c60feb0ea24e67c9c5c5988d8bf4eb4a09", size = 19898210, upload-time = "2025-08-07T15:43:09.008Z" },
{ url = "https://files.pythonhosted.org/packages/c0/15/e10347768b2929ae9c65abbfd0867a736e6227f6d63da1f86fe6bdcbcdca/uv-0.8.6-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5b201ebc1c5c76c3a415fa4edcb25a0e06263d2255319d6d52275c775e926e23", size = 19247208, upload-time = "2025-08-07T15:43:11.578Z" },
{ url = "https://files.pythonhosted.org/packages/62/8d/dc290df05d1820d003f30e2fb7853496eec43bcb986c5e35aaea2f5343d3/uv-0.8.6-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6acdc77099906ba64bc1b725bef973c10905d7e9596d1b25f271db772bc9e8e4", size = 19261881, upload-time = "2025-08-07T15:43:13.815Z" },
{ url = "https://files.pythonhosted.org/packages/20/bd/6c3b9c87e4ed323f72de6ece7d51a6179091f0ff6e0c9c6ed29e28efe17c/uv-0.8.6-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:4e81380549151e34ae96d56499438444ba58591ca9f2fc6ba0a867152601849e", size = 18037135, upload-time = "2025-08-07T15:43:15.941Z" },
{ url = "https://files.pythonhosted.org/packages/7d/e1/b3e825ad9cc3f03f0f3e232286f91aef985d8029db69fd7091c2f332212b/uv-0.8.6-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:c9de4adac36a62e4bddd959ce65fb4bb09b0cbfd95946d50390f2a9c186ecb9c", size = 19040739, upload-time = "2025-08-07T15:43:18.092Z" },
{ url = "https://files.pythonhosted.org/packages/c5/14/921e2e7b2a4be0bac17f9d04a126546b89828bb33aa56368af7f00538fe3/uv-0.8.6-py3-none-musllinux_1_1_armv7l.whl", hash = "sha256:993af2c295856c5ca053678a8dadc11ce2f85485513ed1568c16e98d5dfa88bf", size = 18060742, upload-time = "2025-08-07T15:43:20.39Z" },
{ url = "https://files.pythonhosted.org/packages/81/54/0b1ecc64353725b62f02d3739a67a567faa70c76c4ea19a21253df1c4d99/uv-0.8.6-py3-none-musllinux_1_1_i686.whl", hash = "sha256:132e73f1e9fe05edc6c06c00416f7c721c48298786fd7293be6c584793170bbc", size = 18430300, upload-time = "2025-08-07T15:43:22.797Z" },
{ url = "https://files.pythonhosted.org/packages/da/be/a1a249eacb9b1e397292106250490ec1546a90c0e19de19f0b36f52aecea/uv-0.8.6-py3-none-musllinux_1_1_x86_64.whl", hash = "sha256:ee67acf1b211be2cfbeaec16cde13c8325810d32ff85963a9dedd1f9d7c61ef7", size = 19407124, upload-time = "2025-08-07T15:43:25.915Z" },
{ url = "https://files.pythonhosted.org/packages/11/18/552bb94bb931ea9d09a0e98e5c3d8cefc8c8db25549af88d1484e52d6cdd/uv-0.8.6-py3-none-win32.whl", hash = "sha256:e35cc1ef79d3dce2b6aeffbfb280d02d5ad741d4ca07874bdf0a4d85c841d9de", size = 18324229, upload-time = "2025-08-07T15:43:28.029Z" },
{ url = "https://files.pythonhosted.org/packages/fd/df/b7d1171579e2cc821aafc38a86393104e5426ac1ebc4e95be79ac705a11f/uv-0.8.6-py3-none-win_amd64.whl", hash = "sha256:37227aaf1e41c7eda3d7f0028e747a2a2eed3f3506b0adc121a4366e8281115b", size = 20279856, upload-time = "2025-08-07T15:43:30.07Z" },
{ url = "https://files.pythonhosted.org/packages/09/1b/2629d605e101db6a52397e6ea8859a51af0207cf254051b2a621c683ee07/uv-0.8.6-py3-none-win_arm64.whl", hash = "sha256:0b524de39f317bd8733c38cf100b6f8091d44e06b23f7752523ad1ad1454ede3", size = 18839643, upload-time = "2025-08-07T15:43:32.332Z" },
{ url = "https://files.pythonhosted.org/packages/08/d5/49e188db80f3d8b1969bdbcb8a5468a3796827f15d773241204f206a9ff6/uv-0.8.8-py3-none-linux_armv6l.whl", hash = "sha256:fcdbee030de120478db1a4bb3e3bbf04eec572527ea9107ecf064a808259b6c9", size = 18470316, upload-time = "2025-08-09T00:25:11.956Z" },
{ url = "https://files.pythonhosted.org/packages/01/50/add1afadccd141d0d72b54e5146f8181fcc6efd1567a17c5b1edec444010/uv-0.8.8-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:461e8fb83931755cf0596bf1b8ccbfe02765e81a0d392c495c07685d6b6591f9", size = 18468770, upload-time = "2025-08-09T00:25:15.391Z" },
{ url = "https://files.pythonhosted.org/packages/8c/ac/3c6dc8781d37ef9854f412322caffac2978dd3fa1bf806f7daebcfebf2be/uv-0.8.8-py3-none-macosx_11_0_arm64.whl", hash = "sha256:58056e5ccebb0a1aad27bd89d0ccc5b65c086d5a7f6b0ac16a9dde030b63cf14", size = 17200419, upload-time = "2025-08-09T00:25:18.264Z" },
{ url = "https://files.pythonhosted.org/packages/a1/9e/c30ea1f634673d234999985984afbe96c3d2a4381986e36df0bb46c0f21b/uv-0.8.8-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.musllinux_1_1_aarch64.whl", hash = "sha256:5b4c56a620137f562e1d7b09eac6c9d4adeb876aefc51be27973257fcb426c9d", size = 17779351, upload-time = "2025-08-09T00:25:20.891Z" },
{ url = "https://files.pythonhosted.org/packages/2f/89/f2885c6e97a265b4b18050df6285f56c81b603a867a63fcd8f2caa04d95c/uv-0.8.8-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5fc33adb91c4e3db550648aa30c2b97e8e4d8b8842ead7784a9e76dae3cb14dc", size = 18139292, upload-time = "2025-08-09T00:25:23.352Z" },
{ url = "https://files.pythonhosted.org/packages/38/5f/98dad16987919e7dc02f2566026a263ea6307bf57e8de0008dde4717d9cf/uv-0.8.8-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:19a82d6738d3aa58e6646b9d6c343d103abf0c4caf97a68d16a8cab55282e4be", size = 18932468, upload-time = "2025-08-09T00:25:25.691Z" },
{ url = "https://files.pythonhosted.org/packages/56/99/52d0d9f53cc5df11b1a459e743bd7b2f4660d49f125a63640eb85ce993e0/uv-0.8.8-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:9dce4de70098cb5b98feea9ef0b8f7db5d6b9deea003a926bc044a793872d719", size = 20251614, upload-time = "2025-08-09T00:25:28.122Z" },
{ url = "https://files.pythonhosted.org/packages/9e/b1/0698099a905b4a07b8fa9d6838e0680de707216ccf003433ca1b4afff224/uv-0.8.8-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1038324c178d2d7407a4005c4c3294cbad6a02368ba5a85242308de62a6f4e12", size = 19916222, upload-time = "2025-08-09T00:25:30.732Z" },
{ url = "https://files.pythonhosted.org/packages/7f/29/8384e0f3f3536ef376d94b7ab177753179906a6c2f5bab893e3fb9525b45/uv-0.8.8-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3bd016beea3935f9148b3d2482e3d60dee36f0260f9e99d4f57acfd978c1142a", size = 19238516, upload-time = "2025-08-09T00:25:33.637Z" },
{ url = "https://files.pythonhosted.org/packages/0e/f1/6c107deccd6e66eb1c46776d8cef4ca9274aac73cec1b14453fe85e18a54/uv-0.8.8-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d0a2b5ebc96aba2b0bf54283d2906b40f32949298cbc6ec48648097ddeac5c5d", size = 19232295, upload-time = "2025-08-09T00:25:37.154Z" },
{ url = "https://files.pythonhosted.org/packages/c5/96/9f5e935cd970102c67ce2a753ac721665fb4477c262e86afa0ab385cefff/uv-0.8.8-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:e529dc0a1be5e896d299e4eae4599fa68909f8cb3e6c5ee1a46f66c9048e3334", size = 18046917, upload-time = "2025-08-09T00:25:39.72Z" },
{ url = "https://files.pythonhosted.org/packages/32/75/97f371add0a02e5e37156ac0fea908ab4a1160fdf716d0e6c257b6767122/uv-0.8.8-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:5d58d986c3b6a9ce0fb48cd48b3aee6cb1b1057f928d598432e75a4fcaa370f4", size = 18949133, upload-time = "2025-08-09T00:25:42.139Z" },
{ url = "https://files.pythonhosted.org/packages/1a/1b/ea988ae9d8c5531454ea6904290e229624c9ea830a5c37b91ec74ebde9a4/uv-0.8.8-py3-none-musllinux_1_1_armv7l.whl", hash = "sha256:e117e1230559058fd286292dd5839e8e82d1aaf05763bf4a496e91fe07b69fa1", size = 18080018, upload-time = "2025-08-09T00:25:44.645Z" },
{ url = "https://files.pythonhosted.org/packages/ff/14/3b16af331b79ae826d00a73e98f26f7f660dabedc0f82acb99069601b355/uv-0.8.8-py3-none-musllinux_1_1_i686.whl", hash = "sha256:372934fd94193c98dec59bd379cf39e73f906ae6162cbfb66686f32afd75fa0f", size = 18437896, upload-time = "2025-08-09T00:25:49.162Z" },
{ url = "https://files.pythonhosted.org/packages/1c/b6/c866684da5571dbf42e9a60b6587a62adc8a2eb592f07411d3b29cb09871/uv-0.8.8-py3-none-musllinux_1_1_x86_64.whl", hash = "sha256:9330c924faa9df00a5e78b54561ecf4e5eac1211066f027620dbe85bd6f479ce", size = 19341221, upload-time = "2025-08-09T00:25:51.444Z" },
{ url = "https://files.pythonhosted.org/packages/49/ea/55a0eff462b2ec5a6327dd87c401c53306406c830fa8f2cabd2af79dd97f/uv-0.8.8-py3-none-win32.whl", hash = "sha256:65113735aa3427d3897e2f537da1331d1391735c6eecb9b820da6a15fd2f6738", size = 18244601, upload-time = "2025-08-09T00:25:53.696Z" },
{ url = "https://files.pythonhosted.org/packages/bf/c0/f56ddb1b2276405618e3d2522018c962c010fc71f97f385d01b7e1dcd8df/uv-0.8.8-py3-none-win_amd64.whl", hash = "sha256:66189ca0b4051396aa19a6f036351477656073d0fd01618051faca699e1b3cdc", size = 20233481, upload-time = "2025-08-09T00:25:56.247Z" },
{ url = "https://files.pythonhosted.org/packages/ac/1a/70dc4c730c19f3af40be9450b98b801e03cd6d16609743013f7258f69a29/uv-0.8.8-py3-none-win_arm64.whl", hash = "sha256:1d829486e88ebbf7895306ff09a8b6014d3af7a18e27d751979ee37bf3a27832", size = 18786215, upload-time = "2025-08-09T00:25:58.941Z" },
]