Use proper binaries

Tom Foster 2025-08-09 10:55:42 +01:00
parent d937f2d5fa
commit 633efdc305
13 changed files with 1709 additions and 163 deletions

.gitignore vendored
View file

@ -58,3 +58,4 @@ venv.bak/
# Working directories
work/
quantisation_work/
.cache/

View file

@ -46,15 +46,15 @@ QUANTISATION_CONFIGS: dict[QuantisationType, QuantisationConfig] = {
description="Bartowski Q3_K_L: Q3_K_M base with Q5_K output",
base_type="Q3_K_M",
base_precision=3,
output_type="Q5_K",
output_type="q5_k",
),
QuantisationType.Q3_K_XL: QuantisationConfig(
name="Q3_K_XL",
description="Bartowski Q3_K_XL: Q3_K_M base with Q8_0 embeddings + Q6_K output",
base_type="Q3_K_M",
base_precision=3,
embedding_type="Q8_0",
output_type="Q6_K",
embedding_type="q8_0",
output_type="q6_k",
),
QuantisationType.Q4_K_S: QuantisationConfig(
name="Q4_K_S",
@ -78,7 +78,7 @@ QUANTISATION_CONFIGS: dict[QuantisationType, QuantisationConfig] = {
description="Bartowski Q4_K_L: Q4_K_M base with Q8_0 embeddings",
base_type="Q4_K_M",
base_precision=4,
embedding_type="Q8_0",
embedding_type="q8_0",
),
# Additional standard quantisation profiles
QuantisationType.Q5_K_S: QuantisationConfig(
@ -103,7 +103,7 @@ QUANTISATION_CONFIGS: dict[QuantisationType, QuantisationConfig] = {
description="Bartowski Q5_K_L: Q5_K_M base with Q8_0 embeddings",
base_type="Q5_K_M",
base_precision=5,
embedding_type="Q8_0",
embedding_type="q8_0",
),
QuantisationType.Q6_K: QuantisationConfig(
name="Q6_K",
@ -121,7 +121,7 @@ QUANTISATION_CONFIGS: dict[QuantisationType, QuantisationConfig] = {
description="Bartowski Q6_K_L: Q6_K base with Q8_0 output",
base_type="Q6_K",
base_precision=6,
output_type="Q8_0",
output_type="q8_0",
),
QuantisationType.Q8_0: QuantisationConfig(
name="Q8_0",

View file

@ -0,0 +1,491 @@
"""Binary manager for llama.cpp releases.
Downloads and manages llama.cpp binary releases from GitHub, handling
platform detection, version checking, and caching.
"""
from __future__ import annotations
import json
import os
import platform
import shutil
import subprocess
import tarfile
import time
import zipfile
from pathlib import Path
from typing import TYPE_CHECKING, ClassVar
from urllib.request import urlopen, urlretrieve
from helpers.logger import logger
if TYPE_CHECKING:
from typing import Any
class BinaryManager:
"""Manages llama.cpp binary downloads and updates.
Automatically downloads appropriate llama.cpp releases based on platform,
caches binaries locally, and checks for updates from GitHub releases.
"""
GITHUB_API = "https://api.github.com/repos/ggml-org/llama.cpp/releases/latest"
# Use local .cache directory in project
BINARY_DIR = Path(".cache") / "llm-gguf-tools" / "binaries"
# Platform mappings to release asset patterns
PLATFORM_PATTERNS: ClassVar[dict[tuple[str, str], list[str]]] = {
("Linux", "x86_64"): ["linux-x64", "ubuntu-x64", "linux-amd64"],
("Linux", "aarch64"): ["linux-arm64", "linux-aarch64"],
("Darwin", "x86_64"): ["macos-x64", "darwin-x64", "macos-amd64"],
("Darwin", "arm64"): ["macos-arm64", "darwin-arm64", "macos-aarch64"],
("Windows", "AMD64"): ["win-x64", "windows-x64", "win64"],
}
def __init__(self) -> None:
"""Initialise binary manager."""
self.BINARY_DIR.mkdir(parents=True, exist_ok=True)
self.version_file = self.BINARY_DIR / "version.json"
self.quantize_binary_path = self._get_binary_path("llama-quantize")
self.imatrix_binary_path = self._get_binary_path("llama-imatrix")
def _get_binary_path(self, base_name: str) -> Path:
"""Get path to binary.
Args:
base_name: Base name of binary (without extension).
Returns:
Path where binary should be located.
"""
binary_name = f"{base_name}.exe" if platform.system() == "Windows" else base_name
return self.BINARY_DIR / binary_name
def get_quantise_binary(self) -> Path | None:
"""Get llama-quantize binary, downloading if necessary.
Returns:
Path to binary if available, None if download fails.
"""
return self._get_binary("llama-quantize", self.quantize_binary_path)
def get_imatrix_binary(self) -> Path | None:
"""Get llama-imatrix binary, downloading if necessary.
Returns:
Path to binary if available, None if download fails.
"""
return self._get_binary("llama-imatrix", self.imatrix_binary_path)
def _get_binary(self, name: str, binary_path: Path) -> Path | None:
"""Get a specific binary, downloading if necessary.
Args:
name: Name of the binary.
binary_path: Path where binary should be located.
Returns:
Path to binary if available, None if download fails.
"""
# Check if we have a binary and if it needs updating
if self._should_update():
logger.info("🔄 Checking for llama.cpp updates...")
if not self._download_latest():
logger.warning("Failed to download latest llama.cpp release")
# Fall back to existing binary if available
if binary_path.exists():
logger.info(f"Using existing {name} binary")
return binary_path
return None
if binary_path.exists():
return binary_path
logger.info("📥 Downloading llama.cpp binaries...")
if self._download_latest():
return binary_path
return None
def _should_update(self) -> bool:
"""Check if binary needs updating.
Returns:
True if update needed, False otherwise.
"""
# If no binaries exist, we need to download
if not self.quantize_binary_path.exists() or not self.imatrix_binary_path.exists():
return True
# Check version file
if not self.version_file.exists():
return True
try:
with Path(self.version_file).open(encoding="utf-8") as f:
cached_version = json.load(f)
# Check if cached version is older than 7 days
if time.time() - cached_version.get("timestamp", 0) > 7 * 24 * 3600:
return True
except Exception:
return True
return False
def _download_latest(self) -> bool:
"""Download latest llama.cpp release.
Returns:
True if successful, False otherwise.
"""
try:
# Get latest release info
release_info = self._get_latest_release()
if not release_info:
return False
# Find appropriate asset for platform
asset_url = self._find_platform_asset(release_info["assets"])
if not asset_url:
logger.warning("No suitable binary found for this platform")
return False
# Download and extract
logger.info(f"📥 Downloading from: {asset_url}")
if not self._download_and_extract(asset_url):
return False
# Save version info
self._save_version_info(release_info)
logger.info("✅ Successfully downloaded llama.cpp binary")
except Exception as e:
logger.error(f"Failed to download llama.cpp: {e}")
return False
else:
return True
def _get_latest_release(self) -> dict[str, Any] | None:
"""Get latest release info from GitHub API.
Returns:
Release info dict or None if failed.
"""
try:
with urlopen(self.GITHUB_API) as response: # noqa: S310
return json.loads(response.read())
except Exception as e:
logger.error(f"Failed to fetch release info: {e}")
return None
def _find_platform_asset(self, assets: list[dict[str, Any]]) -> str | None:
"""Find appropriate asset for current platform.
Returns:
Download URL for appropriate asset or None.
"""
patterns = self._get_platform_patterns()
if not patterns:
return None
return self._select_best_asset(assets, patterns)
def _get_platform_patterns(self) -> list[str]:
"""Get platform patterns for current system.
Returns:
List of patterns to match in asset names.
"""
system = platform.system()
machine = platform.machine()
# Get specific patterns for this platform
patterns = self.PLATFORM_PATTERNS.get((system, machine), [])
if patterns:
return patterns
# Fall back to generic patterns
generic_patterns = {
"Linux": ["linux", "ubuntu"],
"Darwin": ["macos", "darwin"],
"Windows": ["win", "windows"],
}
return generic_patterns.get(system, [])
def _select_best_asset(self, assets: list[dict[str, Any]], patterns: list[str]) -> str | None:
"""Select the best asset from available options.
Returns:
Download URL for best matching asset or None.
"""
avoid_patterns = ["cuda", "rocm", "hip", "metal", "sycl"]
prefer_patterns = ["cpu", "vulkan", "avx2", "avx"]
best_asset = None
best_score = -1
for asset in assets:
name = asset["name"].lower()
# Skip GPU-specific builds
if any(pattern in name for pattern in avoid_patterns):
continue
# Check platform match
if not any(pattern in name for pattern in patterns):
continue
score = self._score_asset(name, patterns, prefer_patterns)
if score > best_score:
best_score = score
best_asset = asset
return best_asset["browser_download_url"] if best_asset else None
def _score_asset(self, name: str, patterns: list[str], prefer_patterns: list[str]) -> int:
"""Score an asset based on platform and preference matching.
Returns:
Numeric score for asset quality (higher is better).
"""
score = 0
# Platform match bonus
if any(pattern in name for pattern in patterns):
score += 10
# Preference bonuses
for pattern in prefer_patterns:
if pattern in name:
score += 5
# Archive format preference
system = platform.system()
if (system == "Windows" and name.endswith(".zip")) or (
system != "Windows" and name.endswith(".tar.gz")
):
score += 2
return score
def _download_and_extract(self, url: str) -> bool:
"""Download and extract binary archive.
Args:
url: Download URL for archive.
Returns:
True if successful, False otherwise.
"""
try:
# Download to temp file
temp_file = self.BINARY_DIR / "temp_download"
logger.info("⬇️ Downloading archive...")
urlretrieve(url, temp_file) # noqa: S310
# Extract based on file type
if url.endswith(".zip"):
with zipfile.ZipFile(temp_file, "r") as zf:
self._extract_binary_from_archive(zf)
elif url.endswith((".tar.gz", ".tgz")):
with tarfile.open(temp_file, "r:gz") as tf:
self._extract_binary_from_archive(tf)
else:
logger.error(f"Unknown archive format: {url}")
return False
# Clean up temp file
temp_file.unlink()
# Make binaries executable on Unix
if platform.system() != "Windows":
self.quantize_binary_path.chmod(0o755)
self.imatrix_binary_path.chmod(0o755)
except Exception as e:
logger.error(f"Failed to download and extract: {e}")
return False
else:
return True
def _extract_binary_from_archive(self, archive: Any) -> None:
"""Extract llama binaries and their dependencies from archive."""
target_binaries = {
"llama-quantize": ["llama-quantize", "llama-quantize.exe", "quantize", "quantize.exe"],
"llama-imatrix": ["llama-imatrix", "llama-imatrix.exe", "imatrix", "imatrix.exe"],
}
# Also extract shared libraries
shared_libs = [
"libllama.so",
"libggml-base.so",
"libggml.so",
"libllama.dll",
"libggml.dll",
]
members = self._get_archive_members(archive)
extracted = self._extract_matching_binaries(archive, members, target_binaries)
self._extract_shared_libraries(archive, members, shared_libs)
self._cleanup_extracted_directories()
self._report_missing_binaries(extracted)
def _get_archive_members(self, archive: Any) -> list[str]:
"""Get list of members from archive.
Returns:
List of member names in the archive.
"""
if isinstance(archive, zipfile.ZipFile):
return archive.namelist()
return [m.name for m in archive.getmembers()]
def _extract_matching_binaries(
self,
archive: Any,
members: list[str],
target_binaries: dict[str, list[str]],
) -> set[str]:
"""Extract binaries that match target patterns.
Returns:
Set of successfully extracted binary types.
"""
extracted = set()
for member in members:
base_name = Path(member).name
for binary_type, possible_names in target_binaries.items():
if base_name in possible_names:
self._extract_single_binary(archive, member, binary_type)
extracted.add(binary_type)
break
return extracted
def _extract_single_binary(self, archive: Any, member: str, binary_type: str) -> None:
"""Extract a single binary from archive."""
logger.info(f"📦 Extracting {Path(member).name} as {binary_type}...")
target_path = self._get_binary_path(binary_type)
if isinstance(archive, zipfile.ZipFile):
self._extract_from_zip(archive, member, target_path)
else: # tarfile
self._extract_from_tar(archive, member, target_path)
def _extract_from_zip(self, archive: zipfile.ZipFile, member: str, target_path: Path) -> None:
"""Extract binary from zip archive."""
temp_path = self.BINARY_DIR / "temp_binary"
with archive.open(member) as source, temp_path.open("wb") as target:
shutil.copyfileobj(source, target)
shutil.move(str(temp_path), str(target_path))
def _extract_from_tar(self, archive: tarfile.TarFile, member: str, target_path: Path) -> None:
"""Extract binary from tar archive."""
archive.extract(member, self.BINARY_DIR)
extracted_path = self.BINARY_DIR / member
if extracted_path != target_path:
shutil.move(str(extracted_path), str(target_path))
def _cleanup_extracted_directories(self) -> None:
"""Clean up any extracted directories."""
for item in self.BINARY_DIR.iterdir():
if item.is_dir() and item.name != "binaries":
shutil.rmtree(item)
def _extract_shared_libraries(
self, archive: Any, members: list[str], lib_patterns: list[str]
) -> None:
"""Extract shared libraries needed by the binaries.
Args:
archive: The archive object.
members: List of all archive members.
lib_patterns: Patterns to match for library files.
"""
for member in members:
base_name = Path(member).name
if any(lib in base_name for lib in lib_patterns):
logger.info(f"📚 Extracting library: {base_name}")
target_path = self.BINARY_DIR / base_name
if isinstance(archive, zipfile.ZipFile):
temp_path = self.BINARY_DIR / "temp_lib"
with archive.open(member) as source, temp_path.open("wb") as target:
shutil.copyfileobj(source, target)
shutil.move(str(temp_path), str(target_path))
else: # tarfile
archive.extract(member, self.BINARY_DIR)
extracted_path = self.BINARY_DIR / member
if extracted_path != target_path:
shutil.move(str(extracted_path), str(target_path))
# Make libraries executable on Unix
if platform.system() != "Windows":
target_path.chmod(0o755)
def _report_missing_binaries(self, extracted: set[str]) -> None:
"""Report any missing binaries."""
if "llama-quantize" not in extracted:
logger.warning("llama-quantize binary not found in archive")
if "llama-imatrix" not in extracted:
logger.warning("llama-imatrix binary not found in archive")
def _save_version_info(self, release_info: dict[str, Any]) -> None:
"""Save version information to cache.
Args:
release_info: GitHub release information.
"""
version_data = {
"version": release_info.get("tag_name", "unknown"),
"timestamp": time.time(),
"url": release_info.get("html_url", ""),
}
with Path(self.version_file).open("w", encoding="utf-8") as f:
json.dump(version_data, f, indent=2)
logger.info(f"📌 Cached version: {version_data['version']}")
def check_binary_works(self, binary_path: Path | None = None) -> bool:
"""Check if the binary actually works.
Args:
binary_path: Path to binary to check. If None, checks quantize binary.
Returns:
True if binary executes successfully, False otherwise.
"""
if binary_path is None:
binary_path = self.quantize_binary_path
if not binary_path.exists():
return False
try:
# Set LD_LIBRARY_PATH to include binary directory for shared libraries
env = os.environ.copy()
if platform.system() != "Windows":
lib_path = str(self.BINARY_DIR)
if "LD_LIBRARY_PATH" in env:
env["LD_LIBRARY_PATH"] = f"{lib_path}:{env['LD_LIBRARY_PATH']}"
else:
env["LD_LIBRARY_PATH"] = lib_path
result = subprocess.run(
[str(binary_path), "--help"],
check=False,
capture_output=True,
text=True,
timeout=5,
env=env,
)
except Exception:
return False
else:
# llama-quantize returns 1 for --help but shows usage, which means it works
return result.returncode in {0, 1} and "usage:" in result.stdout.lower()
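A minimal usage sketch of this class (the import path matches the `from helpers.services.binary_manager import BinaryManager` statement used elsewhere in this commit; treat it as illustrative rather than the project's documented API):
from helpers.services.binary_manager import BinaryManager
# Download (or reuse cached) llama.cpp binaries under .cache/llm-gguf-tools/binaries
manager = BinaryManager()
quantize_path = manager.get_quantise_binary()
if quantize_path and manager.check_binary_works(quantize_path):
    print(f"llama-quantize ready at {quantize_path}")
else:
    print("llama-quantize unavailable - download it manually from the llama.cpp releases page")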

View file

@ -34,7 +34,7 @@ class FilesystemService:
size formatting across the toolset.
Returns:
Human-readable file size string (e.g., "1.5G", "750M").
Human-readable file size string (e.g. "1.5G", "750M").
"""
try:
result = subprocess.run(

View file

@ -8,6 +8,9 @@ Uses UK English spelling conventions throughout.
from __future__ import annotations
import gc
import json
import traceback
from pathlib import Path
from typing import TYPE_CHECKING, Any, Protocol
import gguf
@ -38,8 +41,6 @@ class TensorMapper(Protocol):
if TYPE_CHECKING:
from pathlib import Path
import numpy as np
from helpers.models.conversion import ModelConfig
@ -77,6 +78,11 @@ class GGUFWriter:
self.writer.add_description(f"Converted from {model_config.architectures[0]}")
self.writer.add_file_type(gguf.LlamaFileType.ALL_F32)
# Log architecture being used
logger.info(f"Setting GGUF architecture: {self.architecture}")
if self.architecture not in {"llama", "qwen2", "gemma", "phi3", "falcon", "gpt2"}:
logger.warning(f"Architecture '{self.architecture}' may not be supported by llama.cpp")
# Model parameters from config
params = model_config.to_gguf_params()
self.writer.add_context_length(params.context_length)
@ -122,10 +128,239 @@ class GGUFWriter:
self.writer.add_eos_token_id(tokeniser_config.get("eos_token_id", 2))
self.writer.add_unk_token_id(tokeniser_config.get("unk_token_id", 0))
self.writer.add_pad_token_id(tokeniser_config.get("pad_token_id", 0))
self.writer.add_tokenizer_model(tokeniser_config.get("model_type", "llama"))
# Add BOS/EOS token addition flags if available
if "add_bos_token" in tokeniser_config:
self.writer.add_add_bos_token(tokeniser_config["add_bos_token"])
if "add_eos_token" in tokeniser_config:
self.writer.add_add_eos_token(tokeniser_config["add_eos_token"])
# Note: tokenizer_model is set by add_tokeniser_vocabulary based on actual tokenizer type
logger.info("Added tokeniser configuration")
def add_tokeniser_vocabulary(self, model_path: Path) -> None:
"""Add full tokeniser vocabulary to GGUF file.
Loads and embeds the complete tokeniser vocabulary including tokens,
merges, and scores to enable standalone model usage without external
tokeniser files. Supports BPE, Unigram, and WordPiece tokenizers.
"""
tokenizer_path = model_path / "tokenizer.json"
if not tokenizer_path.exists():
logger.warning("tokenizer.json not found, skipping vocabulary embedding")
return
try:
with Path(tokenizer_path).open(encoding="utf-8") as f:
tokenizer_data = json.load(f)
model_data = tokenizer_data.get("model", {})
model_type = model_data.get("type", "")
# Get pre-tokenizer information
pre_tokenizer = tokenizer_data.get("pre_tokenizer", {})
pre_tokenizer_type = self._get_pre_tokenizer_type(pre_tokenizer)
# Get added tokens
added_tokens = tokenizer_data.get("added_tokens", [])
if model_type == "BPE":
self._add_bpe_tokenizer(model_data, added_tokens, pre_tokenizer_type)
elif model_type == "Unigram":
self._add_unigram_tokenizer(model_data, added_tokens)
elif model_type == "WordPiece":
self._add_wordpiece_tokenizer(model_data, added_tokens)
else:
logger.warning(f"Unsupported tokenizer type: {model_type}")
# Try to add as generic tokenizer
self._add_generic_tokenizer(model_data, tokenizer_data)
except Exception as e:
logger.error(f"Failed to load tokeniser vocabulary: {e}")
logger.error(traceback.format_exc())
def _get_pre_tokenizer_type(self, pre_tokenizer: dict[str, Any]) -> str:
"""Determine pre-tokenizer type from configuration.
Returns:
Pre-tokenizer type.
"""
if not pre_tokenizer:
return "default"
# Check for various pre-tokenizer types
pre_type = pre_tokenizer.get("type", "")
if "ByteLevel" in str(pre_type):
return "llama3"
if "Metaspace" in str(pre_type):
return "default"
return "default"
def _add_bpe_tokenizer(
self, model_data: dict[str, Any], added_tokens: list[dict[str, Any]], pre_type: str
) -> None:
"""Add BPE tokenizer vocabulary to GGUF."""
vocab = model_data.get("vocab", {})
merges = model_data.get("merges", [])
if not vocab:
logger.warning("No vocabulary found in BPE tokenizer")
return
# Create token list sorted by index
max_idx = max(vocab.values()) if vocab else 0
tokens = [""] * (max_idx + 1)
for token, idx in vocab.items():
if 0 <= idx < len(tokens):
tokens[idx] = token
# Handle added tokens
for added_token in added_tokens:
token_id = added_token.get("id")
content = added_token.get("content")
if token_id is not None and content is not None:
if token_id >= len(tokens):
tokens.extend([""] * (token_id - len(tokens) + 1))
tokens[token_id] = content
# Prepare token types
token_types = []
for i, _token in enumerate(tokens):
# Check if it's a special/control token
is_special = any(
added_token.get("id") == i and added_token.get("special", False)
for added_token in added_tokens
)
if is_special:
token_types.append(gguf.TokenType.CONTROL)
else:
token_types.append(gguf.TokenType.NORMAL)
# Add to GGUF
self.writer.add_tokenizer_model("gpt2")
self.writer.add_tokenizer_pre(pre_type)
self.writer.add_token_list(tokens)
self.writer.add_token_scores([0.0] * len(tokens))
self.writer.add_token_types(token_types)
if merges:
self.writer.add_token_merges(merges)
logger.info(f"Added {len(merges)} BPE merges")
logger.info(f"Successfully embedded BPE tokeniser ({len(tokens)} tokens)")
def _add_unigram_tokenizer(
self,
model_data: dict[str, Any],
added_tokens: list[dict[str, Any]], # noqa: ARG002
) -> None:
"""Add Unigram/SentencePiece tokenizer to GGUF."""
vocab = model_data.get("vocab", [])
if not vocab:
logger.warning("No vocabulary found in Unigram tokenizer")
return
tokens = []
scores = []
token_types = []
# Process regular vocabulary
for item in vocab:
if isinstance(item, list) and len(item) >= 2:
token = item[0]
score = float(item[1]) if len(item) > 1 else 0.0
tokens.append(token)
scores.append(score)
# Determine token type
if token.startswith("<") and token.endswith(">"):
token_types.append(gguf.TokenType.CONTROL)
elif len(token) == 6 and token.startswith("<0x") and token.endswith(">"):
token_types.append(gguf.TokenType.BYTE)
else:
token_types.append(gguf.TokenType.NORMAL)
# Add to GGUF
self.writer.add_tokenizer_model("llama")
self.writer.add_tokenizer_pre("default")
self.writer.add_token_list(tokens)
self.writer.add_token_scores(scores)
self.writer.add_token_types(token_types)
logger.info(f"Successfully embedded Unigram tokeniser ({len(tokens)} tokens)")
def _add_wordpiece_tokenizer(
self,
model_data: dict[str, Any],
added_tokens: list[dict[str, Any]], # noqa: ARG002
) -> None:
"""Add WordPiece tokenizer to GGUF."""
vocab = model_data.get("vocab", {})
if not vocab:
logger.warning("No vocabulary found in WordPiece tokenizer")
return
# Create token list sorted by index
max_idx = max(vocab.values()) if vocab else 0
tokens = [""] * (max_idx + 1)
for token, idx in vocab.items():
if 0 <= idx < len(tokens):
tokens[idx] = token
# Token types (all normal for WordPiece)
token_types = [gguf.TokenType.NORMAL] * len(tokens)
# Add to GGUF
self.writer.add_tokenizer_model("bert")
self.writer.add_tokenizer_pre("default")
self.writer.add_token_list(tokens)
self.writer.add_token_scores([0.0] * len(tokens))
self.writer.add_token_types(token_types)
logger.info(f"Successfully embedded WordPiece tokeniser ({len(tokens)} tokens)")
def _add_generic_tokenizer(
self,
model_data: dict[str, Any],
tokenizer_data: dict[str, Any], # noqa: ARG002
) -> None:
"""Try to add a generic tokenizer based on available data."""
vocab = model_data.get("vocab")
if not vocab:
logger.warning("Cannot extract vocabulary from unknown tokenizer type")
return
# Try to extract tokens in a generic way
tokens = []
if isinstance(vocab, dict):
# Dictionary-style vocab
max_idx = max(vocab.values()) if vocab else 0
tokens = [""] * (max_idx + 1)
for token, idx in vocab.items():
if 0 <= idx < len(tokens):
tokens[idx] = token
elif isinstance(vocab, list):
# List-style vocab
for item in vocab:
if isinstance(item, str):
tokens.append(item)
elif isinstance(item, list) and len(item) > 0:
tokens.append(item[0])
if tokens:
self.writer.add_tokenizer_model("llama") # Default to llama
self.writer.add_tokenizer_pre("default")
self.writer.add_token_list(tokens)
self.writer.add_token_scores([0.0] * len(tokens))
self.writer.add_token_types([gguf.TokenType.NORMAL] * len(tokens))
logger.info(f"Added generic tokeniser ({len(tokens)} tokens)")
else:
logger.warning("Could not extract tokens from unknown tokenizer format")
def add_tensor(self, name: str, data: np.ndarray) -> None:
"""Add a tensor to the GGUF file.
@ -219,13 +454,20 @@ class GGUFConverter:
logger.info(f"Total tensors processed: {tensor_count}")
# Add tokeniser
# Add tokeniser configuration
try:
tok_config = ConfigParser.load_tokeniser_config(model_path)
writer_wrapper.add_tokeniser(tok_config)
logger.info("Tokeniser added")
logger.info("Tokeniser configuration added")
except Exception as e:
logger.warning(f"Could not add tokeniser: {e}")
logger.warning(f"Could not add tokeniser configuration: {e}")
# Add tokeniser vocabulary (critical for standalone usage)
try:
writer_wrapper.add_tokeniser_vocabulary(model_path)
except Exception as e:
logger.error(f"Failed to embed tokeniser vocabulary: {e}")
logger.error("Model will not work without external tokeniser files!")
# Finalise file
writer_wrapper.finalise()
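To make the vocabulary-embedding step concrete, here is a small standalone sketch of the index-ordered token list that _add_bpe_tokenizer rebuilds from a Hugging Face tokenizer.json (the file layout mirrors the code above; the helper name is illustrative only):
import json
from pathlib import Path
def bpe_tokens_in_index_order(tokenizer_json: Path) -> list[str]:
    """Rebuild the BPE token list ordered by vocab index, including added_tokens."""
    data = json.loads(tokenizer_json.read_text(encoding="utf-8"))
    vocab = data["model"]["vocab"]              # maps token string -> integer index
    tokens = [""] * (max(vocab.values()) + 1)   # dense, index-ordered list
    for token, idx in vocab.items():
        tokens[idx] = token
    for added in data.get("added_tokens", []):  # special tokens may extend the list
        idx = added["id"]
        if idx >= len(tokens):
            tokens.extend([""] * (idx - len(tokens) + 1))
        tokens[idx] = added["content"]
    return tokens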

View file

@ -7,6 +7,7 @@ spelling conventions throughout.
from __future__ import annotations
import json
import re
import shutil
import subprocess
@ -17,6 +18,7 @@ from typing import TYPE_CHECKING
from helpers.config.quantisation_configs import QUANTISATION_CONFIGS
from helpers.logger import logger
from helpers.models.quantisation import QuantisationType
from helpers.utils.config_parser import ConfigParser
if TYPE_CHECKING:
from helpers.models.quantisation import ModelSource, QuantisationResult
@ -260,14 +262,47 @@ class ReadmeGenerator:
# Get original README content
original_content = self._get_original_readme(model_source, model_dir)
# Get architecture from config.json
architecture = self._get_architecture(model_dir)
# Generate new README
readme_content = self._generate_readme_content(
model_source, results, original_content, output_repo
model_source, results, original_content, output_repo, architecture, models_dir
)
readme_path.write_text(readme_content)
return readme_path
def _get_architecture(self, model_dir: Path) -> str | None:
"""Get the architecture from the model's config.json.
Returns:
Architecture name or None if not found.
"""
config_path = model_dir / "config.json"
if not config_path.exists():
return None
try:
with config_path.open(encoding="utf-8") as f:
config = json.load(f)
# Get the architectures field - it's a list
architectures = config.get("architectures", [])
if architectures:
arch_name = architectures[0]
# Get the mapped architecture (what it will be converted to)
parser = ConfigParser()
mapped_arch = parser.get_architecture_mapping(arch_name)
logger.info(f"Architecture: {arch_name} -> {mapped_arch}")
return mapped_arch
except Exception as e:
logger.warning(f"Could not determine architecture: {e}")
return None
def _get_original_readme(self, model_source: ModelSource, model_dir: Path) -> dict[str, str]:
"""Extract original README and metadata.
@ -427,6 +462,8 @@ class ReadmeGenerator:
results: dict[QuantisationType, QuantisationResult],
original_content: dict[str, str],
output_repo: str | None = None,
architecture: str | None = None,
models_dir: Path | None = None,
) -> str:
"""Generate complete README content with quantisation details.
@ -436,22 +473,27 @@ class ReadmeGenerator:
Returns:
Complete README markdown content.
"""
# Build tags
our_tags = [
"quantised",
"gguf",
"q3_k_m",
"q3_k_l",
"q3_k_xl",
"q4_k_m",
"q4_k_l",
"q5_k_m",
"q5_k_l",
"q6_k",
"q6_k_l",
"q8_0",
"bartowski-method",
]
# Build tags based on actual successful quantisations
our_tags = ["gguf"]
# Add tags for successful quantisations only
for quant_type, result in results.items():
if hasattr(result, "status") and result.status == "completed":
if quant_type == "F16":
our_tags.append("f16")
elif hasattr(result, "quantisation_type"):
# Convert to lowercase tag format (e.g. Q3_K_M -> q3_k_m)
our_tags.append(result.quantisation_type.value.lower())
# If no quantisations succeeded but F16 is available, still add basic tags
if (
len(our_tags) == 1
and "F16" in results
and hasattr(results["F16"], "status")
and results["F16"].status in {"completed", "uploading"}
):
our_tags.append("f16")
original_tags = original_content["tags"].split(",") if original_content["tags"] else []
all_tags = sorted(set(our_tags + original_tags))
@ -476,8 +518,8 @@ GGUF quantisations of [{model_source.source_model}]({hf_url}) using
[Bartowski](https://huggingface.co/bartowski)'s method. Created with [llm-gguf-tools](https://git.tomfos.tr/tom/llm-gguf-tools)
which replicates Bartowski's quantisation profiles.
| Variant | Configuration | File Size | Status |
|---|---|---|---|
| Variant | Configuration | Status |
|---|---|---|
"""
# Add results table - group by layer config patterns
@ -500,24 +542,91 @@ which replicates Bartowski's quantisation profiles.
result = type("Result", (), {"status": "planned", "success": False})()
config = QUANTISATION_CONFIGS.get(quant_type)
file_size = self._format_file_size(result)
status = self._format_status(result, model_source, quant_type, output_repo)
# Get configuration description from the config itself
config_desc = config.get_compact_config(QUANTISATION_CONFIGS) if config else f"{quant_type} all layers"
config_desc = (
config.get_compact_config(QUANTISATION_CONFIGS)
if config
else f"{quant_type} all layers"
)
content += f"| **{quant_type.value}** | {config_desc} | {file_size} | {status} |\n"
content += f"| **{quant_type.value}** | {config_desc} | {status} |\n"
# Add F16 row at the bottom if we converted from SafeTensors
# Note: Named "f16" for compatibility, but contains mixed F16/F32 tensors
# (BF16 source tensors are converted to F32 to preserve precision)
if not model_source.is_gguf_repo and output_repo:
f16_filename = f"{model_source.original_author}-{model_source.model_name}-f16.gguf"
f16_url = f"https://huggingface.co/{output_repo}/blob/main/{f16_filename}"
# Get F16 result from results dict (if tracking it)
f16_result = results.get("F16")
# Get file size
f16_size = "-"
if f16_result and hasattr(f16_result, "file_size"):
f16_size = f16_result.file_size
elif models_dir:
# Try to get from actual file
f16_path = models_dir / model_source.model_name / f16_filename
if f16_path.exists():
size_bytes = f16_path.stat().st_size
size_gb = size_bytes / GIBIBYTE
f16_size = f"{size_gb:.1f}GB"
# Format status based on upload state
if f16_result and hasattr(f16_result, "status"):
if f16_result.status == "uploading":
f16_status = f"⬆️ Uploading... ({f16_size})"
elif f16_result.status == "completed":
f16_status = f"[✅ {f16_size}]({f16_url})"
else:
f16_status = "⏳ Queued"
else:
# Default to available if no status tracking
f16_status = f"[✅ {f16_size}]({f16_url})"
content += f"| **F16** | Full precision GGUF (F16/F32 mixed) | {f16_status} |\n"
content += """
**Key:** `E` = Embeddings, `O` = Output, `A` = Attention, `F` = FFN
See [Bartowski Analysis](https://git.tomfos.tr/tom/llm-gguf-tools/src/branch/main/docs/bartowski_analysis.md)
for detailed quantisation strategies and [Documentation](https://git.tomfos.tr/tom/llm-gguf-tools/src/branch/main/docs/)
for more on the tools and methods I use.
"""
# Add warning for unsupported architectures
if architecture:
supported_archs = {
"llama",
"qwen2",
"gemma",
"phi3",
"falcon",
"gpt2",
"gptj",
"gptneox",
"mpt",
"baichuan",
"stablelm",
}
if architecture not in supported_archs:
content += (
f"⚠️ **Note:** This model uses the `{architecture}` architecture, which is not "
"yet supported by llama.cpp for quantisation. If quantisations failed, this is "
"why - llama.cpp cannot quantise architectures it doesn't recognise. The F16 "
"GGUF file is provided as a full-precision fallback (requires ~2x model size "
f"in VRAM). For `{architecture}` support, check with your inference software "
"or wait for llama.cpp updates.\n\n"
)
content += (
"See [Bartowski Analysis](https://git.tomfos.tr/tom/llm-gguf-tools/src/branch/main/docs/"
"bartowski_analysis.md) for detailed quantisation strategies and "
"[Documentation](https://git.tomfos.tr/tom/llm-gguf-tools/src/branch/main/docs/) "
"for more on the tools and methods I use.\n\n"
)
# Add original content
if original_content["readme"]:
content += "## Original Model Card\n\n---\n\n" + original_content["readme"]
@ -570,6 +679,15 @@ for more on the tools and methods I use.
if hasattr(result, "status") and result.status in status_map:
base_status = status_map[result.status]
# Check for architecture not supported error
if (
result.status == "failed"
and hasattr(result, "error_message")
and result.error_message
and "architecture not supported" in str(result.error_message).lower()
):
return "⚠️ Skipped"
if result.status == "uploading" and hasattr(result, "file_size") and result.file_size:
return f"{base_status} ({result.file_size})"
if result.status == "completed" or (hasattr(result, "success") and result.success):

View file

@ -0,0 +1,258 @@
"""Importance matrix generation service.
Generates importance matrices using llama-imatrix binary with calibration
data for improved quantisation quality.
"""
from __future__ import annotations
import os
import platform
import subprocess
from pathlib import Path
from typing import TYPE_CHECKING
from helpers.logger import logger
from helpers.services.binary_manager import BinaryManager
if TYPE_CHECKING:
from helpers.models.quantisation import ModelSource
class IMatrixGenerator:
"""Generates importance matrices for quantisation guidance.
Uses llama-imatrix binary to compute importance matrices from
calibration data, which helps preserve model quality during
quantisation by identifying critical weights.
"""
# Default calibration data location
CALIBRATION_DATA = Path("resources") / "imatrix_data.txt"
def __init__(self) -> None:
"""Initialise imatrix generator."""
self.binary_manager = BinaryManager()
self.imatrix_binary = self._get_imatrix_binary()
def _get_imatrix_binary(self) -> Path | None:
"""Get llama-imatrix binary, downloading if necessary.
Returns:
Path to binary if found, None otherwise.
"""
# First check local directory for manual placement
local_binary = Path("./llama-imatrix")
if local_binary.exists():
logger.info(f"Using local llama-imatrix binary: {local_binary}")
return local_binary
# Download from GitHub releases
binary_path = self.binary_manager.get_imatrix_binary()
if binary_path and self.binary_manager.check_binary_works(binary_path):
logger.info(f"Using llama-imatrix binary: {binary_path}")
return binary_path
logger.warning("llama-imatrix binary not available")
return None
def can_generate(self) -> bool:
"""Check if imatrix generation is available.
Returns:
True if binary and calibration data are available.
"""
return self.imatrix_binary is not None and self.CALIBRATION_DATA.exists()
def generate_imatrix(
self,
f16_model_path: Path,
output_path: Path,
calibration_data: Path | None = None,
) -> bool:
"""Generate importance matrix for a model.
Returns:
True if generation successful, False otherwise.
"""
validation_error = self._validate_generation_inputs(f16_model_path, calibration_data)
if validation_error:
logger.error(validation_error)
return False
cal_data = calibration_data or self.CALIBRATION_DATA
cmd = self._build_imatrix_command(f16_model_path, cal_data, output_path)
self._log_generation_start(f16_model_path, cal_data, output_path)
return self._execute_imatrix_generation(cmd, output_path)
def _validate_generation_inputs(
self,
f16_model_path: Path,
calibration_data: Path | None,
) -> str | None:
"""Validate inputs for imatrix generation.
Returns:
Error message if validation fails, None if valid.
"""
if not self.imatrix_binary:
return "llama-imatrix binary not available"
if not f16_model_path.exists():
return f"Model file not found: {f16_model_path}"
cal_data = calibration_data or self.CALIBRATION_DATA
if not cal_data.exists():
return f"Calibration data not found: {cal_data}"
return None
def _build_imatrix_command(
self,
f16_model_path: Path,
cal_data: Path,
output_path: Path,
) -> list[str]:
"""Build command for imatrix generation.
Returns:
Command list ready for subprocess execution.
"""
return [
str(self.imatrix_binary),
"-m",
str(f16_model_path),
"-f",
str(cal_data),
"-o",
str(output_path),
"--chunks",
"128", # Process in chunks for stability
]
def _log_generation_start(
self,
f16_model_path: Path,
cal_data: Path,
output_path: Path,
) -> None:
"""Log the start of imatrix generation."""
logger.info("🧮 Generating importance matrix...")
logger.info(f"📊 Model: {f16_model_path.name}")
logger.info(f"📝 Calibration data: {cal_data.name}")
logger.info(f"💾 Output: {output_path.name}")
def _execute_imatrix_generation(self, cmd: list[str], output_path: Path) -> bool:
"""Execute the imatrix generation process.
Returns:
True if generation completed successfully, False otherwise.
"""
# Set LD_LIBRARY_PATH for shared libraries
env = os.environ.copy()
if platform.system() != "Windows":
lib_path = str(self.binary_manager.BINARY_DIR)
if "LD_LIBRARY_PATH" in env:
env["LD_LIBRARY_PATH"] = f"{lib_path}:{env['LD_LIBRARY_PATH']}"
else:
env["LD_LIBRARY_PATH"] = lib_path
try:
process = subprocess.Popen(
cmd,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
universal_newlines=True,
bufsize=1,
env=env,
)
self._stream_process_output(process)
return self._handle_process_completion(process, output_path)
except Exception as e:
logger.error(f"❌ Imatrix generation failed: {e}")
return False
def _stream_process_output(self, process: subprocess.Popen[str]) -> None:
"""Stream output from the running process."""
while True:
if process.stdout is not None:
output = process.stdout.readline()
else:
break
if not output and process.poll() is not None:
break
if output:
# Filter progress updates for cleaner output
line = output.strip()
if line and not line.startswith("["):
logger.info(f" {line}")
def _handle_process_completion(self, process: subprocess.Popen[str], output_path: Path) -> bool:
"""Handle completion of the imatrix generation process.
Returns:
True if process completed successfully and output exists, False otherwise.
"""
return_code = process.poll()
if return_code != 0:
logger.error(f"❌ Imatrix generation failed with return code {return_code}")
return False
if not output_path.exists():
logger.error("Generation completed but output file not found")
return False
size_mb = output_path.stat().st_size / (1024 * 1024)
logger.info(f"✅ Generated imatrix: {output_path.name} ({size_mb:.1f} MB)")
return True
def prompt_for_generation(
self,
model_source: ModelSource,
model_dir: Path,
f16_model_path: Path,
) -> Path | None:
"""Prompt user to generate imatrix.
Args:
model_source: Model source information.
model_dir: Model directory.
f16_model_path: Path to F16 model.
Returns:
Path to generated imatrix or None if skipped.
"""
if not self.can_generate():
logger.info("⚠️ Imatrix generation not available (missing binary or calibration data)")
return None
logger.info("\n" + "=" * 70)
logger.info("📊 Importance Matrix Generation")
logger.info("=" * 70)
logger.info(
"\nImportance matrices improve quantisation quality by identifying"
"\ncritical weights in the model. This process takes 5-10 minutes"
"\nbut significantly improves the quality of smaller quantisations."
)
logger.info(f"\nModel: {model_source.model_name}")
logger.info(f"Calibration data: {self.CALIBRATION_DATA.name}")
response = input("\n❓ Generate importance matrix? (Y/n): ").strip().lower()
if response == "n":
logger.info("Skipping imatrix generation")
return None
# Generate imatrix
output_path = model_dir / "imatrix.dat"
logger.info("\n⏳ Generating importance matrix (this may take 5-10 minutes)...")
if self.generate_imatrix(f16_model_path, output_path):
return output_path
logger.warning("Failed to generate imatrix, continuing without it")
return None
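A hedged usage sketch of the generator (the module path matches the `from helpers.services.imatrix_generator import IMatrixGenerator` import added later in this commit; the file paths are placeholders):
from pathlib import Path
from helpers.services.imatrix_generator import IMatrixGenerator
generator = IMatrixGenerator()
if generator.can_generate():  # requires llama-imatrix plus resources/imatrix_data.txt
    ok = generator.generate_imatrix(
        f16_model_path=Path("work/example-f16.gguf"),  # placeholder paths
        output_path=Path("work/imatrix.dat"),
    )
    print("imatrix written" if ok else "imatrix generation failed")
else:
    print("imatrix generation unavailable")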

View file

@ -1,82 +1,294 @@
"""Importance matrix (imatrix) management service.
"""Direct llama.cpp binary execution service.
Manages detection and use of existing importance matrix files for
quantisation guidance. Provides user prompts for supplying pre-computed
imatrix files from external sources.
Provides direct execution of llama.cpp quantisation binary with proper
tensor-specific override support for L and XL variants.
"""
from __future__ import annotations
import os
import platform
import subprocess
from pathlib import Path
from typing import TYPE_CHECKING
from helpers.logger import logger
from helpers.services.binary_manager import BinaryManager
from helpers.services.filesystem import FilesystemService
if TYPE_CHECKING:
from pathlib import Path
from helpers.models.quantisation import QuantisationConfig
class IMatrixManager:
"""Handles importance matrix file management for quantisation.
class QuantisationExecutor:
"""Executes llama.cpp quantisation with tensor overrides.
Locates existing importance matrix files or prompts users to provide
pre-computed matrices from external sources. These matrices guide
quantisation decisions to preserve model quality.
Provides direct binary execution with proper command-line flags for
tensor-specific overrides, supporting Bartowski-style L and XL variants.
"""
def __init__(self) -> None:
"""Initialise IMatrixManager."""
"""Initialise quantisation executor."""
self.fs = FilesystemService()
self.binary_manager = BinaryManager()
self.quantise_binary = self._get_quantise_binary()
self.last_error: str | None = None # Track last error type
def _get_quantise_binary(self) -> Path | None:
"""Get llama-quantize binary, downloading if necessary.
Returns:
Path to binary if found, None otherwise.
"""
# First check local directory for manual placement
local_binary = Path("./llama-quantize")
if local_binary.exists():
logger.info(f"Using local llama-quantize binary: {local_binary}")
return local_binary
# Download from GitHub releases
binary_path = self.binary_manager.get_quantise_binary()
if binary_path and self.binary_manager.check_binary_works(binary_path):
logger.info(f"Using llama-quantize binary: {binary_path}")
return binary_path
logger.error("Failed to obtain llama-quantize binary")
logger.info(
"You can manually download it from: https://github.com/ggml-org/llama.cpp/releases"
)
return None
def execute_quantisation(
self,
input_path: Path,
output_path: Path,
config: QuantisationConfig,
imatrix_path: Path | None = None,
) -> bool:
"""Execute quantisation using llama.cpp binary.
Builds and executes llama-quantize command with proper tensor override
flags for L and XL variants.
Returns:
True if quantisation successful, False otherwise.
"""
if not self.quantise_binary:
logger.error("llama-quantize binary not available")
return False
# Build command
cmd = self._build_quantisation_command(input_path, output_path, config, imatrix_path)
# Execute with real-time output
return self._execute_command(cmd)
def _build_quantisation_command(
self,
input_path: Path,
output_path: Path,
config: QuantisationConfig,
imatrix_path: Path | None,
) -> list[str]:
"""Build llama-quantize command with tensor overrides.
Returns:
Command arguments as list.
"""
cmd = [str(self.quantise_binary)]
# Add imatrix if available
if imatrix_path:
cmd.extend(["--imatrix", str(imatrix_path)])
if imatrix_path.exists():
logger.info(f"🧮 Using imatrix: {imatrix_path.name}")
# Add tensor-specific overrides for L and XL variants
if config.embedding_type:
# Use directly from config - already in correct format
cmd.extend(["--token-embedding-type", config.embedding_type.lower()])
logger.info(f"⚙️ Token embedding type: {config.embedding_type}")
if config.output_type:
# Use directly from config - already in correct format
cmd.extend(["--output-tensor-type", config.output_type.lower()])
logger.info(f"⚙️ Output tensor type: {config.output_type}")
# Note: Per-layer tensor overrides could be added here if needed in future
# For now, embedding and output overrides handle the L/XL variants
# Get base quantisation type
base_quant = self._get_base_quantisation_type(config.name)
# Add input, output, and base quantisation type
cmd.extend([str(input_path), str(output_path), base_quant])
return cmd
def _get_base_quantisation_type(self, config_name: str) -> str:
"""Get base quantisation type for a config.
Maps custom variants (Q3_K_L, Q3_K_XL) to their base types (Q3_K_M).
Returns:
Base quantisation type string.
"""
# Mapping of custom variants to base types
variant_mapping = {
"Q3_K_L": "Q3_K_M",
"Q3_K_XL": "Q3_K_M",
"Q4_K_L": "Q4_K_M",
"Q4_K_XL": "Q4_K_M",
"Q5_K_L": "Q5_K_M",
"Q5_K_XL": "Q5_K_M",
"Q6_K_L": "Q6_K",
"Q6_K_XL": "Q6_K",
}
return variant_mapping.get(config_name, config_name)
def _execute_command(self, cmd: list[str]) -> bool:
"""Execute command with real-time output streaming.
Returns:
True if successful, False otherwise.
"""
logger.info(f"💻 Running: {' '.join(cmd)}")
logger.info("⏳ Quantisation in progress... (this may take several minutes)")
# Set LD_LIBRARY_PATH for shared libraries
env = os.environ.copy()
if platform.system() != "Windows":
lib_path = str(self.binary_manager.BINARY_DIR)
if "LD_LIBRARY_PATH" in env:
env["LD_LIBRARY_PATH"] = f"{lib_path}:{env['LD_LIBRARY_PATH']}"
else:
env["LD_LIBRARY_PATH"] = lib_path
# Track output for architecture detection
output_lines = []
architecture_error = False
try:
process = subprocess.Popen(
cmd,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
universal_newlines=True,
bufsize=1,
env=env,
)
# Stream output
while True:
if process.stdout is not None:
output = process.stdout.readline()
else:
break
if not output and process.poll() is not None:
break
if output:
output_stripped = output.strip()
logger.info(f"📊 {output_stripped}")
output_lines.append(output_stripped)
# Check for architecture-related errors
if any(
phrase in output_stripped.lower()
for phrase in [
"unsupported architecture",
"unknown architecture",
"architecture not supported",
"model architecture",
"llama_model_load: error loading model",
]
):
architecture_error = True
return_code = process.poll()
if return_code == 0:
logger.info("✅ Quantisation successful!")
return True
# Check if this was an architecture error
if architecture_error or return_code == 1:
# Look for architecture info in recent output
for line in output_lines[-10:]: # Check last 10 lines
if "architecture" in line.lower():
logger.error("❌ Architecture not supported by llama.cpp")
logger.error(" so cannot be quantised with current llama.cpp but")
logger.error(" F16 GGUF file can be used for inference if supported")
# Store this for the orchestrator to detect
self.last_error = "unsupported_architecture"
return False
logger.error(f"❌ Quantisation failed with return code {return_code}")
except Exception as e:
logger.error(f"❌ Quantisation failed with exception: {e}")
return False
else:
return False
class IMatrixHandler:
"""Handles importance matrix file management.
Manages detection and use of existing importance matrix files for
quantisation guidance.
"""
def __init__(self) -> None:
"""Initialise IMatrixHandler."""
self.fs = FilesystemService()
def find_imatrix(self, model_dir: Path) -> Path | None:
"""Find or prompt for importance matrix file.
Searches for existing imatrix files first, then provides interactive
prompts for user-supplied matrices. See docs/imatrix_data.md for
instructions on generating imatrix files.
"""Find existing imatrix file in model directory.
Returns:
Path to imatrix file, or None if not available.
Path to imatrix file if found, None otherwise.
"""
imatrix_path = model_dir / "imatrix.dat"
# Check for existing imatrix
if imatrix_path.exists():
logger.info(f"Found existing imatrix: {imatrix_path.name}")
file_size = self.fs.get_file_size(imatrix_path)
logger.info(f"Found existing imatrix: {imatrix_path.name} ({file_size})")
return imatrix_path
# Try user-provided imatrix
return self._prompt_for_user_imatrix(model_dir, imatrix_path)
return None
def _prompt_for_user_imatrix(self, model_dir: Path, imatrix_path: Path) -> Path | None:
def prompt_for_user_imatrix(self, model_dir: Path) -> Path | None:
"""Prompt user for existing imatrix file.
Returns:
Path to user-provided imatrix, or None if not available.
"""
imatrix_path = model_dir / "imatrix.dat"
logger.info(f"Model directory: {model_dir}")
logger.info(f"Looking for imatrix file at: {imatrix_path}")
logger.info("\n" + "=" * 70)
logger.info("📊 No existing imatrix file found")
logger.info("\nYou have two options:")
logger.info(" 1. Provide a pre-computed imatrix file")
logger.info(" (💡 see docs/imatrix_data.md to generate your own)")
logger.info(" 2. Skip imatrix usage (lower quality quantisation)")
logger.info("=" * 70)
logger.info(
"Tip: You can download pre-computed imatrix files from Bartowski's repositories!"
)
logger.info(
" Example: https://huggingface.co/bartowski/MODEL-NAME-GGUF/resolve/main/MODEL-NAME.imatrix"
)
response = input("\n❓ Do you have an imatrix file to provide? (y/N): ").strip().lower()
response = (
input("\n❓ Do you have an imatrix file to place in the model directory? (y/N): ")
.strip()
.lower()
)
if response != "y":
logger.info("Continuing without imatrix (quantisation quality may be lower)")
logger.info(" See docs/imatrix_data.md for instructions on generating imatrix files") # noqa: RUF001
return None
logger.info(f"\nPlease place your imatrix.dat file in: {model_dir}")
input("⏳ Press Enter when you've placed the file (or Ctrl+C to cancel)...")
logger.info(f"Please place your imatrix.dat file in: {model_dir}")
input("⏳ Press Enter when you've placed the imatrix.dat file (or Ctrl+C to cancel)...")
if imatrix_path.exists():
file_size = self.fs.get_file_size(imatrix_path)
logger.info(f"✅ Found imatrix file! ({file_size})")
logger.info(f"Found imatrix file! ({file_size})")
return imatrix_path
logger.warning("No imatrix.dat file found - continuing without imatrix")

View file

@ -86,8 +86,8 @@ class LlamaCppPythonAPI:
raise RuntimeError(msg)
# Normalise the config name to extract base type
# E.g., "Q4_K_L" or "Q4_K_XL" -> "Q4_K_M" (default for Q4_K)
# E.g., "Q4_K_M_XXL" -> "Q4_K_M"
# e.g. "Q4_K_L" or "Q4_K_XL" -> "Q4_K_M" (default for Q4_K)
# e.g. "Q4_K_M_XXL" -> "Q4_K_M"
config_upper = config_name.upper()
# Direct mapping for exact matches
@ -224,7 +224,7 @@ class LlamaCppPythonAPI:
Args:
input_path: Path to input GGUF model.
output_path: Path for output quantised model.
base_type: Base quantisation type (e.g., "Q4_K_M", "Q6_K").
base_type: Base quantisation type (e.g. "Q4_K_M", "Q6_K").
embedding_type: Override for token embeddings (None = use base).
output_type: Override for output/lm_head layers (None = use base).
imatrix_path: Optional importance matrix file.
@ -470,7 +470,7 @@ class LlamaCppPythonAPI:
"""Log current resource usage state.
Args:
phase: Description of current phase (e.g., "before", "after").
phase: Description of current phase (e.g. "before", "after").
Returns:
Current memory usage in GB.

View file

@ -31,12 +31,14 @@ from helpers.models.quantisation import (
QuantisationType,
)
from helpers.services.huggingface import ReadmeGenerator
from helpers.services.llama_cpp import IMatrixManager
from helpers.services.imatrix_generator import IMatrixGenerator
from helpers.services.llama_cpp import IMatrixHandler
from helpers.services.quantisation import HuggingFaceUploader, ModelManager, QuantisationEngine
from helpers.utils.tensor_mapping import URLParser
if TYPE_CHECKING:
from types import FrameType
from typing import Any
@dataclass(slots=True)
@ -55,7 +57,8 @@ class QuantisationOrchestrator:
# Service dependencies with factory defaults
url_parser: URLParser = field(default_factory=URLParser)
quantisation_engine: QuantisationEngine = field(default_factory=QuantisationEngine)
imatrix_manager: IMatrixManager = field(default_factory=IMatrixManager)
imatrix_handler: IMatrixHandler = field(default_factory=IMatrixHandler)
imatrix_generator: IMatrixGenerator = field(default_factory=IMatrixGenerator)
readme_generator: ReadmeGenerator = field(default_factory=ReadmeGenerator)
uploader: HuggingFaceUploader = field(default_factory=HuggingFaceUploader)
@ -172,18 +175,28 @@ class QuantisationOrchestrator:
self.models_dir.mkdir(parents=True, exist_ok=True)
f16_model_path = self.model_manager.prepare_model(model_source)
imatrix_path = None
if self.use_imatrix:
logger.info("Checking for importance matrix (imatrix)...")
imatrix_path = self.imatrix_manager.find_imatrix(
self.models_dir / model_source.model_name
)
output_repo = (
f"{self.uploader.get_username()}/"
f"{model_source.original_author}-{model_source.model_name}-GGUF"
)
imatrix_path = None
if self.use_imatrix:
logger.info("Checking for importance matrix (imatrix)...")
model_dir = self.models_dir / model_source.model_name
imatrix_path = self.imatrix_handler.find_imatrix(model_dir)
# If no imatrix found, offer to generate or provide one
if not imatrix_path:
# First offer to generate
imatrix_path = self.imatrix_generator.prompt_for_generation(
model_source, model_dir, f16_model_path
)
# If generation was skipped, offer to provide existing one
if not imatrix_path:
imatrix_path = self.imatrix_handler.prompt_for_user_imatrix(model_dir)
return model_source, f16_model_path, imatrix_path, output_repo
def _create_initial_repository(self, model_source: ModelSource, output_repo: str) -> None:
@ -222,10 +235,63 @@ class QuantisationOrchestrator:
types_list = [qt.value for qt in quantisation_types]
logger.info(f"Processing {len(quantisation_types)} quantisation types: {types_list}")
# Track F16 in results for status display (if we converted from SafeTensors)
if not model_source.is_gguf_repo:
# Get F16 file size
f16_size = "-"
if f16_model_path.exists():
size_bytes = f16_model_path.stat().st_size
size_gb = size_bytes / (1024**3)
f16_size = f"{size_gb:.1f}GB"
# Create a simple object for F16 tracking (not a QuantisationResult)
# since F16 isn't a quantisation type in our enum
f16_result = type(
"F16Result",
(),
{
"quantisation_type": "F16",
"success": True,
"status": "planned",
"file_path": f16_model_path,
"file_size": f16_size,
},
)()
results["F16"] = f16_result
# Process with parallel uploads - quantise sequentially but upload in background
upload_futures = []
upload_futures: list[Any] = []
architecture_unsupported = False
with ThreadPoolExecutor(max_workers=2, thread_name_prefix="upload") as upload_executor:
# Start F16 upload first if we have one
if not model_source.is_gguf_repo and not self.no_upload and "F16" in results:
f16_result = results["F16"]
if f16_result.file_path and f16_result.file_path.exists():
logger.info("Starting parallel upload of F16 GGUF...")
f16_result.status = "uploading"
self._update_readme_status(model_source, results, output_repo)
upload_future = upload_executor.submit(
self._upload_f16_and_cleanup,
output_repo,
f16_result.file_path,
model_source,
results,
)
upload_futures.append(upload_future)
for i, quant_type in enumerate(quantisation_types, 1):
# Skip remaining quantisations if architecture is unsupported
if architecture_unsupported:
logger.info(f"Skipping {quant_type.value} - architecture not supported")
results[quant_type] = QuantisationResult(
quantisation_type=quant_type,
success=False,
status="failed",
error_message="Architecture not supported by llama.cpp",
)
continue
logger.info(
f"Processing quantisation {i}/{len(quantisation_types)}: {quant_type.value}"
)
@ -247,6 +313,30 @@ class QuantisationOrchestrator:
results[quant_type] = result
logger.debug(f"DEBUG: Quantisation {quant_type.value} completed")
# Check if this failed due to unsupported architecture
if (
not result.success
and hasattr(self.quantisation_engine.executor, "last_error")
and self.quantisation_engine.executor.last_error
== "unsupported_architecture"
):
logger.warning(
"Architecture not supported - skipping remaining quantisations"
)
architecture_unsupported = True
# Update the current result to also show as skipped
result.error_message = "Architecture not supported by llama.cpp"
# Update README immediately to show remaining quantisations as skipped
for remaining_quant_type in quantisation_types[i:]:
if remaining_quant_type not in results:
results[remaining_quant_type] = QuantisationResult(
quantisation_type=remaining_quant_type,
success=False,
status="failed",
error_message="Architecture not supported by llama.cpp",
)
self._update_readme_status(model_source, results, output_repo)
# Force cleanup between quantisations
gc.collect()
logger.debug("DEBUG: Garbage collection completed")
@ -269,6 +359,14 @@ class QuantisationOrchestrator:
# Wait for all uploads to complete before returning
self._wait_for_uploads(upload_futures)
# Final README update to ensure all statuses are accurate
if not self.no_upload and upload_futures:
logger.info("Updating README with final status...")
final_readme = self.readme_generator.generate(
model_source, results, self.models_dir, output_repo
)
self.uploader.upload_readme(output_repo, final_readme)
return results
def _process_single_quantisation(
@ -505,12 +603,26 @@ class QuantisationOrchestrator:
def _wait_for_uploads(self, upload_futures: list) -> None:
"""Wait for all parallel uploads to complete."""
logger.info("Waiting for any remaining uploads to complete...")
if not upload_futures:
return
logger.info(f"Waiting for {len(upload_futures)} uploads to complete...")
completed = 0
failed = 0
for future in upload_futures:
try:
future.result(timeout=300) # 5 minute timeout per upload
completed += 1
logger.info(f"Upload progress: {completed}/{len(upload_futures)} completed")
except Exception as e:
logger.warning(f"Upload error: {e}")
failed += 1
logger.warning(f"Upload error ({completed + failed}/{len(upload_futures)}): {e}")
if failed > 0:
logger.warning(f"Upload summary: {completed} succeeded, {failed} failed")
else:
logger.info(f"All {completed} uploads completed successfully")
def _cleanup_files(self, f16_model_path: Path, model_source: ModelSource) -> None:
"""Clean up temporary files after processing."""
@@ -573,6 +685,45 @@ class QuantisationOrchestrator:
)
# Don't re-raise - let other uploads continue
def _upload_f16_and_cleanup(
self,
output_repo: str,
file_path: Path,
model_source: ModelSource,
results: dict[str, QuantisationResult],
) -> None:
"""Upload F16 file and clean up (runs in background thread)."""
try:
logger.info(f"[PARALLEL] Starting upload of F16 GGUF ({file_path.name})")
self.uploader.upload_model_file(output_repo, file_path)
logger.info("[PARALLEL] Upload of F16 GGUF completed successfully")
# Don't delete F16 yet - we still need it for quantisations
# It will be deleted in _cleanup_files after all quantisations complete
results["F16"].status = "completed"
updated_readme_path = self.readme_generator.generate(
model_source, results, self.models_dir, output_repo
)
self.uploader.upload_readme(output_repo, updated_readme_path)
logger.info("[PARALLEL] F16 upload complete")
except Exception as e:
logger.error(f"[PARALLEL] Failed to upload F16: {e}")
results["F16"].status = "failed"
results["F16"].error_message = str(e)
try:
updated_readme_path = self.readme_generator.generate(
model_source, results, self.models_dir, output_repo
)
self.uploader.upload_readme(output_repo, updated_readme_path)
except Exception as readme_error:
logger.error(
f"[PARALLEL] Failed to update README after F16 upload error: {readme_error}"
)
# Don't re-raise - let other uploads continue
def _print_model_info(self, model_source: ModelSource) -> None:
"""Print model information."""
logger.info(f"Source URL: {model_source.url}")

View file

@@ -22,7 +22,7 @@ from helpers.models.quantisation import (
)
from helpers.services.filesystem import FilesystemService
from helpers.services.gguf import GGUFConverter
from helpers.services.llama_python import LlamaCppPythonAPI
from helpers.services.llama_cpp import QuantisationExecutor
from helpers.utils.config_parser import ConfigParser
from helpers.utils.tensor_mapping import TensorMapper
@@ -32,30 +32,28 @@ class QuantisationEngine:
Provides flexible quantisation execution supporting multiple tensor
precision configurations, importance matrices, and fallback strategies.
Uses llama-cpp-python API for direct quantisation without subprocess overhead.
Uses direct llama.cpp binary execution with proper tensor overrides.
"""
def __init__(self) -> None:
"""Initialise quantisation engine."""
self.fs = FilesystemService()
self.python_api = LlamaCppPythonAPI()
self.executor = QuantisationExecutor()
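For context on the binary path the new executor wraps: llama-quantize takes per-tensor overrides for the token embeddings and the output head, which is how the Bartowski-style L/XL variants are produced. A hedged sketch of the kind of command involved; the actual QuantisationExecutor is not part of this diff, and the flag spellings (--token-embedding-type, --output-tensor-type, --imatrix) should be verified against the bundled llama-quantize build:

from pathlib import Path

def build_quantise_command(
    binary: Path,
    f16_model: Path,
    output: Path,
    base_type: str = "Q3_K_M",
    embedding_type: str | None = "q8_0",  # override for token embeddings
    output_type: str | None = "q6_k",     # override for the output head
    imatrix: Path | None = None,
) -> list[str]:
    """Assemble an illustrative llama-quantize invocation with tensor overrides."""
    cmd = [str(binary)]
    if imatrix is not None:
        cmd += ["--imatrix", str(imatrix)]
    if embedding_type is not None:
        cmd += ["--token-embedding-type", embedding_type]
    if output_type is not None:
        cmd += ["--output-tensor-type", output_type]
    cmd += [str(f16_model), str(output), base_type]
    return cmd

# e.g. Q3_K_XL = Q3_K_M base, Q8_0 embeddings, Q6_K output:
# subprocess.run(build_quantise_command(Path("llama-quantize"),
#     Path("model-f16.gguf"), Path("model-Q3_K_XL.gguf")), check=True)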
def quantise(self, context: QuantisationContext) -> QuantisationResult:
"""Perform quantisation using the specified configuration.
Executes quantisation using Python API. Since llama-cpp-python is a
required dependency, we can rely on it being available.
Executes quantisation using direct llama.cpp binary with proper
tensor override flags for L and XL variants.
Returns:
QuantisationResult with success status and file information.
"""
logger.debug(f"DEBUG: Starting quantisation for {context.config.name}")
logger.info(
f"⚙️ Creating {context.config.name} quantisation ({context.config.description})..."
)
output_path = context.get_output_path()
logger.debug(f"DEBUG: Output path: {output_path}")
# Check input file exists and is readable
if not context.f16_model_path.exists():
@@ -67,34 +65,20 @@ class QuantisationEngine:
error_message=error_msg,
)
# Check if we have enough disk space (rough estimate)
try:
input_size = context.f16_model_path.stat().st_size
logger.debug(f"DEBUG: Input file size: {input_size / (1024**3):.2f} GB")
# This is a rough check - actual available space calculation is more complex
logger.debug(f"DEBUG: Output directory: {output_path.parent}")
except Exception as e:
logger.warning(f"⚠️ Could not check disk space: {e}")
logger.info(f"🎯 Attempting {context.config.name} quantisation...")
logger.debug(f"DEBUG: Source: {context.f16_model_path}")
logger.debug(f"DEBUG: Target: {output_path}")
logger.debug(f"DEBUG: imatrix: {context.imatrix_path}")
logger.info(f"📝 Source: {context.f16_model_path}")
logger.info(f"📝 Target: {output_path}")
try:
# Use Python API for quantisation
logger.info("🐍 Using Python API for quantisation...")
logger.debug("DEBUG: Calling python_api.quantise_model...")
# Use direct binary execution for quantisation
logger.info("🔧 Using llama.cpp binary for quantisation...")
success = self.python_api.quantise_model(
success = self.executor.execute_quantisation(
context.f16_model_path, output_path, context.config, context.imatrix_path
)
logger.debug(f"DEBUG: Python API returned: {success}")
if success:
logger.debug("DEBUG: Quantisation successful, creating success result")
return self._create_success_result(context.config.name, output_path, "Python API")
return self._create_success_result(context.config.name, output_path, "llama.cpp")
logger.error(f"{context.config.name} quantisation failed")
return QuantisationResult(
@@ -175,7 +159,7 @@ class ModelManager:
logger.info(f"⬇️ Downloading GGUF file from repository: {model_source.source_model}")
logger.info(f"🔍 Looking for file pattern: *{model_source.gguf_file_pattern}*")
f16_model = model_dir / f"{model_source.model_name}-f16.gguf"
f16_model = model_dir / f"{model_source.original_author}-{model_source.model_name}-f16.gguf"
if f16_model.exists():
logger.info(f"✅ Found existing F16 file: {f16_model.name}")
@@ -339,9 +323,17 @@ class ModelManager:
Raises:
RuntimeError: If download fails.
"""
# Ensure the model directory and .huggingface subdirectory exist
model_dir.mkdir(parents=True, exist_ok=True)
huggingface_dir = model_dir / ".huggingface"
huggingface_dir.mkdir(parents=True, exist_ok=True)
try:
logger.debug(f"DEBUG: Downloading full repository: {source_model}")
result = subprocess.run(
logger.info(f"⬇️ Downloading full repository: {source_model}")
logger.info("📊 Progress will be shown below...")
# Use subprocess.Popen to stream output in real-time
process = subprocess.Popen(
[
"huggingface-cli",
"download",
@@ -349,13 +341,34 @@ class ModelManager:
"--local-dir",
str(model_dir),
],
check=True,
capture_output=True,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
text=True,
bufsize=1, # Line buffered
universal_newlines=True,
)
logger.debug(
f"DEBUG: Repository download completed with return code {result.returncode}"
)
# Stream output line by line
for line in process.stdout:
# Log download progress lines
if line.strip():
# Check if it's a progress line (contains %)
if "%" in line or "Downloading" in line or "Fetching" in line:
# Use info level for progress lines
logger.info(f" {line.strip()}")
else:
# Use debug for other output
logger.debug(f" {line.strip()}")
# Wait for process to complete
return_code = process.wait()
if return_code != 0:
msg = f"Repository download failed with return code {return_code}"
raise RuntimeError(msg)
logger.info("✅ Repository download completed successfully")
except subprocess.CalledProcessError as e:
logger.error(f"❌ Failed to download repository {source_model}")
logger.error(f"Return code: {e.returncode}")
@@ -386,7 +399,7 @@ class ModelManager:
RuntimeError: If conversion fails.
"""
logger.info("🔄 Converting to GGUF F16 format...")
f16_model = model_dir / f"{model_source.model_name}-f16.gguf"
f16_model = model_dir / f"{model_source.original_author}-{model_source.model_name}-f16.gguf"
if f16_model.exists():
logger.info("✅ F16 model already exists")
@@ -414,6 +427,28 @@ class ModelManager:
if arch != arch_name:
logger.info(f"📝 Architecture mapping: {arch_name}{arch}")
# Check if architecture is supported by llama.cpp
supported_archs = {
"llama",
"qwen2",
"gemma",
"phi3",
"falcon",
"gpt2",
"gptj",
"gptneox",
"mpt",
"baichuan",
"stablelm",
}
if arch not in supported_archs:
logger.warning("=" * 70)
logger.warning(f"⚠️ Architecture '{arch_name}' may not be supported by llama.cpp")
logger.warning(f"⚠️ The GGUF will be created with architecture: '{arch}'")
logger.warning("⚠️ Check if your inference software supports this architecture.")
logger.warning("=" * 70)
# Convert using GGUFConverter
tensor_mapper = TensorMapper()
success = GGUFConverter.convert_safetensors(

View file

@@ -107,28 +107,44 @@ class ConfigParser:
@staticmethod
def get_architecture_mapping(architecture: str) -> str:
"""Map architecture names to known GGUF architectures.
"""Get the GGUF architecture name for a model.
Provides fallback mappings for architectures not directly supported
by GGUF format, translating them to similar known architectures. This
enables broader model compatibility whilst maintaining GGUF standards.
Returns the original architecture name to preserve model identity.
Only maps architectures that are truly compatible.
Returns:
GGUF-compatible architecture name with appropriate fallback to llama.
Architecture name for GGUF, preserving original when possible.
"""
# Architecture mappings to known GGUF types
mappings = {
"DotsOCRForCausalLM": "qwen2", # Similar architecture
"GptOssForCausalLM": "llama", # Use llama as fallback
"MistralForCausalLM": "llama", # Mistral is llama-like
"Qwen2ForCausalLM": "qwen2",
# Only map architectures that are ACTUALLY the same
# DO NOT map incompatible architectures
known_compatible = {
"LlamaForCausalLM": "llama",
"MistralForCausalLM": "llama", # Mistral IS llama-compatible
"Qwen2ForCausalLM": "qwen2",
"GemmaForCausalLM": "gemma",
"Phi3ForCausalLM": "phi3",
# Add more mappings as needed
"FalconForCausalLM": "falcon",
"GPT2LMHeadModel": "gpt2",
"GPTJForCausalLM": "gptj",
"GPTNeoXForCausalLM": "gptneox",
"MPTForCausalLM": "mpt",
"BaichuanForCausalLM": "baichuan",
"StableLMEpochForCausalLM": "stablelm",
}
return mappings.get(architecture, "llama") # Default to llama
if architecture in known_compatible:
return known_compatible[architecture]
# For unknown architectures, preserve the original name
# This will make it clear the model needs proper support
# Remove common suffixes to get cleaner architecture name
arch_name = architecture
for suffix in ["ForCausalLM", "LMHeadModel", "ForConditionalGeneration"]:
if arch_name.endswith(suffix):
arch_name = arch_name[: -len(suffix)]
break
return arch_name.lower()
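Behaviour of the rewritten mapping, as a quick usage example:

from helpers.utils.config_parser import ConfigParser

assert ConfigParser.get_architecture_mapping("MistralForCausalLM") == "llama"
assert ConfigParser.get_architecture_mapping("Qwen2ForCausalLM") == "qwen2"
# Unknown architectures keep their identity instead of silently becoming "llama"
assert ConfigParser.get_architecture_mapping("DotsOCRForCausalLM") == "dotsocr"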
@staticmethod
def load_tokeniser_config(model_path: Path) -> dict[str, Any]:
@@ -155,11 +171,33 @@ class ConfigParser:
config = fs.load_json_config(tokeniser_config_path)
# Extract token IDs with defaults
# Try to find special token IDs from added_tokens_decoder
added_tokens = config.get("added_tokens_decoder", {})
eos_token_id = config.get("eos_token_id")
bos_token_id = config.get("bos_token_id")
# If not directly specified, search in added_tokens_decoder
if eos_token_id is None:
for token_id, token_info in added_tokens.items():
if token_info.get("content") == "<|endoftext|>":
eos_token_id = int(token_id)
break
if bos_token_id is None:
for token_id, token_info in added_tokens.items():
if token_info.get("content") in {"<|im_start|>", "<s>", "<|startoftext|>"}:
bos_token_id = int(token_id)
break
# Extract token IDs with better defaults
return {
"bos_token_id": config.get("bos_token_id", 1),
"eos_token_id": config.get("eos_token_id", 2),
"bos_token_id": bos_token_id if bos_token_id is not None else 1,
"eos_token_id": eos_token_id if eos_token_id is not None else 2,
"unk_token_id": config.get("unk_token_id", 0),
"pad_token_id": config.get("pad_token_id", 0),
"pad_token_id": config.get(
"pad_token_id", eos_token_id if eos_token_id is not None else 0
),
"model_type": config.get("model_type", "llama"),
"add_bos_token": config.get("add_bos_token", True),
"add_eos_token": config.get("add_eos_token", False),
}
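The effect of the new fallback, sketched against a hypothetical tokenizer_config.json that only declares its special tokens via added_tokens_decoder (the token IDs are illustrative):

sample = {
    "added_tokens_decoder": {
        "151643": {"content": "<|endoftext|>"},
        "151644": {"content": "<|im_start|>"},
    },
    "model_type": "qwen2",
}

# Reproduce the lookup performed above on the sample config
eos = next(
    (int(i) for i, t in sample["added_tokens_decoder"].items() if t["content"] == "<|endoftext|>"),
    None,
)
bos = next(
    (
        int(i)
        for i, t in sample["added_tokens_decoder"].items()
        if t["content"] in {"<|im_start|>", "<s>", "<|startoftext|>"}
    ),
    None,
)
assert (bos, eos) == (151644, 151643)  # pad_token_id would then fall back to eos (151643)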

40
uv.lock generated
View file

@@ -496,26 +496,26 @@ wheels = [
[[package]]
name = "uv"
version = "0.8.6"
version = "0.8.8"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/b5/3b/1140dbbca9fb3ca32be38e01c670a5980a4ee4874366d70438317876d40a/uv-0.8.6.tar.gz", hash = "sha256:4d4e042f6bd9f143094051a05de758684028f451e563846cbc0c6f505b530cca", size = 3463644, upload-time = "2025-08-07T15:43:34.206Z" }
sdist = { url = "https://files.pythonhosted.org/packages/9c/d0/4cd8ac2c7938da78c8e9ca791205f80e74b0f5a680f2a2d50323d54961d0/uv-0.8.8.tar.gz", hash = "sha256:6880e96cd994e53445d364206ddb4b2fff89fd2fbc74a74bef4a6f86384b07d9", size = 3477036, upload-time = "2025-08-09T00:26:00.883Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/71/64/a96f40f95626c6e353e66f6bc5a5ca7c1399e95caf0dcb56cae38754e073/uv-0.8.6-py3-none-linux_armv6l.whl", hash = "sha256:d96ff3a1d06a6a00ed94dfb2996228153b3b5bfc892174b7556216ab872a91b1", size = 18437310, upload-time = "2025-08-07T15:42:49.611Z" },
{ url = "https://files.pythonhosted.org/packages/41/30/b2fed99d5a6b16410669f223767f6d65bc6595858622f5f36386892ed963/uv-0.8.6-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:fdceb1ef554df0ddc620bfe83fdcf740829e489c62f78ba1f089abd62c71c63e", size = 18615884, upload-time = "2025-08-07T15:42:53.452Z" },
{ url = "https://files.pythonhosted.org/packages/d7/82/a53684eadb9cb169eab32ab71f2bdaf7c382819d6de44d4e8df91ca14a00/uv-0.8.6-py3-none-macosx_11_0_arm64.whl", hash = "sha256:7c1f48279ff61940143c78b969094e13324988eabcfcd4799f4350d9d36c1d48", size = 17173005, upload-time = "2025-08-07T15:42:55.571Z" },
{ url = "https://files.pythonhosted.org/packages/e7/4a/2890d9ccaf4b383fea43ae6362252870dcd97dda7412f34f20d80ccf7a39/uv-0.8.6-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.musllinux_1_1_aarch64.whl", hash = "sha256:1913f5627c57076c88dd38b0173bdb006ae9b8dbd92b1798a1acc9d744c1a7cc", size = 17813305, upload-time = "2025-08-07T15:42:57.998Z" },
{ url = "https://files.pythonhosted.org/packages/9b/c3/33a10049728ffbcde673b75b9a73cd61bfab5e1598d935d1f1b2556b07a4/uv-0.8.6-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:7796acc3c5b84d5ee5e10cc6cf92eb61c19f6551855d0aa89ef5925e4a371fbf", size = 18159834, upload-time = "2025-08-07T15:43:00.207Z" },
{ url = "https://files.pythonhosted.org/packages/81/28/ff884f7007a6b9d0e3368dbe4ae7d28acacbaaf1b3a583640e5af6dc5360/uv-0.8.6-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:57a98367bfad38e870e1a8a6626464796ffcee6e937d429fbd7b25ddf46bb36f", size = 18954223, upload-time = "2025-08-07T15:43:03.577Z" },
{ url = "https://files.pythonhosted.org/packages/78/1d/a4ed2da913ecacc1c976e97dff905979c13359834eeeac8bbaf5ed0b2fca/uv-0.8.6-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:2ac28509db2e52613a59264bdb150d13274ed13e5b305f7e274da8cd83033985", size = 20215802, upload-time = "2025-08-07T15:43:06.181Z" },
{ url = "https://files.pythonhosted.org/packages/2c/12/c9ca1cc8bdbecd54db4a7c1a44808f15271da60838dfa9f180ce8171407a/uv-0.8.6-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:deab2ce32d2dd7a1c0de459aa23470c60feb0ea24e67c9c5c5988d8bf4eb4a09", size = 19898210, upload-time = "2025-08-07T15:43:09.008Z" },
{ url = "https://files.pythonhosted.org/packages/c0/15/e10347768b2929ae9c65abbfd0867a736e6227f6d63da1f86fe6bdcbcdca/uv-0.8.6-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5b201ebc1c5c76c3a415fa4edcb25a0e06263d2255319d6d52275c775e926e23", size = 19247208, upload-time = "2025-08-07T15:43:11.578Z" },
{ url = "https://files.pythonhosted.org/packages/62/8d/dc290df05d1820d003f30e2fb7853496eec43bcb986c5e35aaea2f5343d3/uv-0.8.6-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6acdc77099906ba64bc1b725bef973c10905d7e9596d1b25f271db772bc9e8e4", size = 19261881, upload-time = "2025-08-07T15:43:13.815Z" },
{ url = "https://files.pythonhosted.org/packages/20/bd/6c3b9c87e4ed323f72de6ece7d51a6179091f0ff6e0c9c6ed29e28efe17c/uv-0.8.6-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:4e81380549151e34ae96d56499438444ba58591ca9f2fc6ba0a867152601849e", size = 18037135, upload-time = "2025-08-07T15:43:15.941Z" },
{ url = "https://files.pythonhosted.org/packages/7d/e1/b3e825ad9cc3f03f0f3e232286f91aef985d8029db69fd7091c2f332212b/uv-0.8.6-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:c9de4adac36a62e4bddd959ce65fb4bb09b0cbfd95946d50390f2a9c186ecb9c", size = 19040739, upload-time = "2025-08-07T15:43:18.092Z" },
{ url = "https://files.pythonhosted.org/packages/c5/14/921e2e7b2a4be0bac17f9d04a126546b89828bb33aa56368af7f00538fe3/uv-0.8.6-py3-none-musllinux_1_1_armv7l.whl", hash = "sha256:993af2c295856c5ca053678a8dadc11ce2f85485513ed1568c16e98d5dfa88bf", size = 18060742, upload-time = "2025-08-07T15:43:20.39Z" },
{ url = "https://files.pythonhosted.org/packages/81/54/0b1ecc64353725b62f02d3739a67a567faa70c76c4ea19a21253df1c4d99/uv-0.8.6-py3-none-musllinux_1_1_i686.whl", hash = "sha256:132e73f1e9fe05edc6c06c00416f7c721c48298786fd7293be6c584793170bbc", size = 18430300, upload-time = "2025-08-07T15:43:22.797Z" },
{ url = "https://files.pythonhosted.org/packages/da/be/a1a249eacb9b1e397292106250490ec1546a90c0e19de19f0b36f52aecea/uv-0.8.6-py3-none-musllinux_1_1_x86_64.whl", hash = "sha256:ee67acf1b211be2cfbeaec16cde13c8325810d32ff85963a9dedd1f9d7c61ef7", size = 19407124, upload-time = "2025-08-07T15:43:25.915Z" },
{ url = "https://files.pythonhosted.org/packages/11/18/552bb94bb931ea9d09a0e98e5c3d8cefc8c8db25549af88d1484e52d6cdd/uv-0.8.6-py3-none-win32.whl", hash = "sha256:e35cc1ef79d3dce2b6aeffbfb280d02d5ad741d4ca07874bdf0a4d85c841d9de", size = 18324229, upload-time = "2025-08-07T15:43:28.029Z" },
{ url = "https://files.pythonhosted.org/packages/fd/df/b7d1171579e2cc821aafc38a86393104e5426ac1ebc4e95be79ac705a11f/uv-0.8.6-py3-none-win_amd64.whl", hash = "sha256:37227aaf1e41c7eda3d7f0028e747a2a2eed3f3506b0adc121a4366e8281115b", size = 20279856, upload-time = "2025-08-07T15:43:30.07Z" },
{ url = "https://files.pythonhosted.org/packages/09/1b/2629d605e101db6a52397e6ea8859a51af0207cf254051b2a621c683ee07/uv-0.8.6-py3-none-win_arm64.whl", hash = "sha256:0b524de39f317bd8733c38cf100b6f8091d44e06b23f7752523ad1ad1454ede3", size = 18839643, upload-time = "2025-08-07T15:43:32.332Z" },
{ url = "https://files.pythonhosted.org/packages/08/d5/49e188db80f3d8b1969bdbcb8a5468a3796827f15d773241204f206a9ff6/uv-0.8.8-py3-none-linux_armv6l.whl", hash = "sha256:fcdbee030de120478db1a4bb3e3bbf04eec572527ea9107ecf064a808259b6c9", size = 18470316, upload-time = "2025-08-09T00:25:11.956Z" },
{ url = "https://files.pythonhosted.org/packages/01/50/add1afadccd141d0d72b54e5146f8181fcc6efd1567a17c5b1edec444010/uv-0.8.8-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:461e8fb83931755cf0596bf1b8ccbfe02765e81a0d392c495c07685d6b6591f9", size = 18468770, upload-time = "2025-08-09T00:25:15.391Z" },
{ url = "https://files.pythonhosted.org/packages/8c/ac/3c6dc8781d37ef9854f412322caffac2978dd3fa1bf806f7daebcfebf2be/uv-0.8.8-py3-none-macosx_11_0_arm64.whl", hash = "sha256:58056e5ccebb0a1aad27bd89d0ccc5b65c086d5a7f6b0ac16a9dde030b63cf14", size = 17200419, upload-time = "2025-08-09T00:25:18.264Z" },
{ url = "https://files.pythonhosted.org/packages/a1/9e/c30ea1f634673d234999985984afbe96c3d2a4381986e36df0bb46c0f21b/uv-0.8.8-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.musllinux_1_1_aarch64.whl", hash = "sha256:5b4c56a620137f562e1d7b09eac6c9d4adeb876aefc51be27973257fcb426c9d", size = 17779351, upload-time = "2025-08-09T00:25:20.891Z" },
{ url = "https://files.pythonhosted.org/packages/2f/89/f2885c6e97a265b4b18050df6285f56c81b603a867a63fcd8f2caa04d95c/uv-0.8.8-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5fc33adb91c4e3db550648aa30c2b97e8e4d8b8842ead7784a9e76dae3cb14dc", size = 18139292, upload-time = "2025-08-09T00:25:23.352Z" },
{ url = "https://files.pythonhosted.org/packages/38/5f/98dad16987919e7dc02f2566026a263ea6307bf57e8de0008dde4717d9cf/uv-0.8.8-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:19a82d6738d3aa58e6646b9d6c343d103abf0c4caf97a68d16a8cab55282e4be", size = 18932468, upload-time = "2025-08-09T00:25:25.691Z" },
{ url = "https://files.pythonhosted.org/packages/56/99/52d0d9f53cc5df11b1a459e743bd7b2f4660d49f125a63640eb85ce993e0/uv-0.8.8-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:9dce4de70098cb5b98feea9ef0b8f7db5d6b9deea003a926bc044a793872d719", size = 20251614, upload-time = "2025-08-09T00:25:28.122Z" },
{ url = "https://files.pythonhosted.org/packages/9e/b1/0698099a905b4a07b8fa9d6838e0680de707216ccf003433ca1b4afff224/uv-0.8.8-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1038324c178d2d7407a4005c4c3294cbad6a02368ba5a85242308de62a6f4e12", size = 19916222, upload-time = "2025-08-09T00:25:30.732Z" },
{ url = "https://files.pythonhosted.org/packages/7f/29/8384e0f3f3536ef376d94b7ab177753179906a6c2f5bab893e3fb9525b45/uv-0.8.8-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3bd016beea3935f9148b3d2482e3d60dee36f0260f9e99d4f57acfd978c1142a", size = 19238516, upload-time = "2025-08-09T00:25:33.637Z" },
{ url = "https://files.pythonhosted.org/packages/0e/f1/6c107deccd6e66eb1c46776d8cef4ca9274aac73cec1b14453fe85e18a54/uv-0.8.8-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d0a2b5ebc96aba2b0bf54283d2906b40f32949298cbc6ec48648097ddeac5c5d", size = 19232295, upload-time = "2025-08-09T00:25:37.154Z" },
{ url = "https://files.pythonhosted.org/packages/c5/96/9f5e935cd970102c67ce2a753ac721665fb4477c262e86afa0ab385cefff/uv-0.8.8-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:e529dc0a1be5e896d299e4eae4599fa68909f8cb3e6c5ee1a46f66c9048e3334", size = 18046917, upload-time = "2025-08-09T00:25:39.72Z" },
{ url = "https://files.pythonhosted.org/packages/32/75/97f371add0a02e5e37156ac0fea908ab4a1160fdf716d0e6c257b6767122/uv-0.8.8-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:5d58d986c3b6a9ce0fb48cd48b3aee6cb1b1057f928d598432e75a4fcaa370f4", size = 18949133, upload-time = "2025-08-09T00:25:42.139Z" },
{ url = "https://files.pythonhosted.org/packages/1a/1b/ea988ae9d8c5531454ea6904290e229624c9ea830a5c37b91ec74ebde9a4/uv-0.8.8-py3-none-musllinux_1_1_armv7l.whl", hash = "sha256:e117e1230559058fd286292dd5839e8e82d1aaf05763bf4a496e91fe07b69fa1", size = 18080018, upload-time = "2025-08-09T00:25:44.645Z" },
{ url = "https://files.pythonhosted.org/packages/ff/14/3b16af331b79ae826d00a73e98f26f7f660dabedc0f82acb99069601b355/uv-0.8.8-py3-none-musllinux_1_1_i686.whl", hash = "sha256:372934fd94193c98dec59bd379cf39e73f906ae6162cbfb66686f32afd75fa0f", size = 18437896, upload-time = "2025-08-09T00:25:49.162Z" },
{ url = "https://files.pythonhosted.org/packages/1c/b6/c866684da5571dbf42e9a60b6587a62adc8a2eb592f07411d3b29cb09871/uv-0.8.8-py3-none-musllinux_1_1_x86_64.whl", hash = "sha256:9330c924faa9df00a5e78b54561ecf4e5eac1211066f027620dbe85bd6f479ce", size = 19341221, upload-time = "2025-08-09T00:25:51.444Z" },
{ url = "https://files.pythonhosted.org/packages/49/ea/55a0eff462b2ec5a6327dd87c401c53306406c830fa8f2cabd2af79dd97f/uv-0.8.8-py3-none-win32.whl", hash = "sha256:65113735aa3427d3897e2f537da1331d1391735c6eecb9b820da6a15fd2f6738", size = 18244601, upload-time = "2025-08-09T00:25:53.696Z" },
{ url = "https://files.pythonhosted.org/packages/bf/c0/f56ddb1b2276405618e3d2522018c962c010fc71f97f385d01b7e1dcd8df/uv-0.8.8-py3-none-win_amd64.whl", hash = "sha256:66189ca0b4051396aa19a6f036351477656073d0fd01618051faca699e1b3cdc", size = 20233481, upload-time = "2025-08-09T00:25:56.247Z" },
{ url = "https://files.pythonhosted.org/packages/ac/1a/70dc4c730c19f3af40be9450b98b801e03cd6d16609743013f7258f69a29/uv-0.8.8-py3-none-win_arm64.whl", hash = "sha256:1d829486e88ebbf7895306ff09a8b6014d3af7a18e27d751979ee37bf3a27832", size = 18786215, upload-time = "2025-08-09T00:25:58.941Z" },
]