Use proper binaries
parent d937f2d5fa
commit 633efdc305
13 changed files with 1709 additions and 163 deletions
.gitignore (vendored): 1 addition
@@ -58,3 +58,4 @@ venv.bak/
# Working directories
work/
quantisation_work/
.cache/
helpers/config/quantisation_configs.py
@@ -46,15 +46,15 @@ QUANTISATION_CONFIGS: dict[QuantisationType, QuantisationConfig] = {
        description="Bartowski Q3_K_L: Q3_K_M base with Q5_K output",
        base_type="Q3_K_M",
        base_precision=3,
        output_type="Q5_K",
        output_type="q5_k",
    ),
    QuantisationType.Q3_K_XL: QuantisationConfig(
        name="Q3_K_XL",
        description="Bartowski Q3_K_XL: Q3_K_M base with Q8_0 embeddings + Q6_K output",
        base_type="Q3_K_M",
        base_precision=3,
        embedding_type="Q8_0",
        output_type="Q6_K",
        embedding_type="q8_0",
        output_type="q6_k",
    ),
    QuantisationType.Q4_K_S: QuantisationConfig(
        name="Q4_K_S",
@@ -78,7 +78,7 @@ QUANTISATION_CONFIGS: dict[QuantisationType, QuantisationConfig] = {
        description="Bartowski Q4_K_L: Q4_K_M base with Q8_0 embeddings",
        base_type="Q4_K_M",
        base_precision=4,
        embedding_type="Q8_0",
        embedding_type="q8_0",
    ),
    # Additional standard quantisation profiles
    QuantisationType.Q5_K_S: QuantisationConfig(
@@ -103,7 +103,7 @@ QUANTISATION_CONFIGS: dict[QuantisationType, QuantisationConfig] = {
        description="Bartowski Q5_K_L: Q5_K_M base with Q8_0 embeddings",
        base_type="Q5_K_M",
        base_precision=5,
        embedding_type="Q8_0",
        embedding_type="q8_0",
    ),
    QuantisationType.Q6_K: QuantisationConfig(
        name="Q6_K",
@@ -121,7 +121,7 @@ QUANTISATION_CONFIGS: dict[QuantisationType, QuantisationConfig] = {
        description="Bartowski Q6_K_L: Q6_K base with Q8_0 output",
        base_type="Q6_K",
        base_precision=6,
        output_type="Q8_0",
        output_type="q8_0",
    ),
    QuantisationType.Q8_0: QuantisationConfig(
        name="Q8_0",
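The lowercase values match what llama-quantize expects on its tensor-override flags. As a rough illustration only, a profile such as Q3_K_XL might translate into a command along these lines; the flag names come from the QuantisationExecutor added later in this commit, the file names are placeholders, and the variable names below are not part of the real QuantisationConfig API:

# Hypothetical sketch: how the Q3_K_XL profile maps onto llama-quantize flags.
embedding_type = "q8_0"   # Q8_0 embeddings
output_type = "q6_k"      # Q6_K output tensor
base_type = "Q3_K_M"      # remaining layers follow the Q3_K_M recipe

cmd = [
    "llama-quantize",
    "--token-embedding-type", embedding_type,
    "--output-tensor-type", output_type,
    "model-f16.gguf", "model-Q3_K_XL.gguf", base_type,
]
print(" ".join(cmd))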
helpers/services/binary_manager.py (new file, 491 lines)
@@ -0,0 +1,491 @@
"""Binary manager for llama.cpp releases.

Downloads and manages llama.cpp binary releases from GitHub, handling
platform detection, version checking, and caching.
"""

from __future__ import annotations

import json
import os
import platform
import shutil
import subprocess
import tarfile
import time
import zipfile
from pathlib import Path
from typing import TYPE_CHECKING, ClassVar
from urllib.request import urlopen, urlretrieve

from helpers.logger import logger

if TYPE_CHECKING:
    from typing import Any


class BinaryManager:
    """Manages llama.cpp binary downloads and updates.

    Automatically downloads appropriate llama.cpp releases based on platform,
    caches binaries locally, and checks for updates from GitHub releases.
    """

    GITHUB_API = "https://api.github.com/repos/ggml-org/llama.cpp/releases/latest"
    # Use local .cache directory in project
    BINARY_DIR = Path(".cache") / "llm-gguf-tools" / "binaries"

    # Platform mappings to release asset patterns
    PLATFORM_PATTERNS: ClassVar[dict[tuple[str, str], list[str]]] = {
        ("Linux", "x86_64"): ["linux-x64", "ubuntu-x64", "linux-amd64"],
        ("Linux", "aarch64"): ["linux-arm64", "linux-aarch64"],
        ("Darwin", "x86_64"): ["macos-x64", "darwin-x64", "macos-amd64"],
        ("Darwin", "arm64"): ["macos-arm64", "darwin-arm64", "macos-aarch64"],
        ("Windows", "AMD64"): ["win-x64", "windows-x64", "win64"],
    }

    def __init__(self) -> None:
        """Initialise binary manager."""
        self.BINARY_DIR.mkdir(parents=True, exist_ok=True)
        self.version_file = self.BINARY_DIR / "version.json"
        self.quantize_binary_path = self._get_binary_path("llama-quantize")
        self.imatrix_binary_path = self._get_binary_path("llama-imatrix")

    def _get_binary_path(self, base_name: str) -> Path:
        """Get path to binary.

        Args:
            base_name: Base name of binary (without extension).

        Returns:
            Path where binary should be located.
        """
        binary_name = f"{base_name}.exe" if platform.system() == "Windows" else base_name
        return self.BINARY_DIR / binary_name

    def get_quantise_binary(self) -> Path | None:
        """Get llama-quantize binary, downloading if necessary.

        Returns:
            Path to binary if available, None if download fails.
        """
        return self._get_binary("llama-quantize", self.quantize_binary_path)

    def get_imatrix_binary(self) -> Path | None:
        """Get llama-imatrix binary, downloading if necessary.

        Returns:
            Path to binary if available, None if download fails.
        """
        return self._get_binary("llama-imatrix", self.imatrix_binary_path)

    def _get_binary(self, name: str, binary_path: Path) -> Path | None:
        """Get a specific binary, downloading if necessary.

        Args:
            name: Name of the binary.
            binary_path: Path where binary should be located.

        Returns:
            Path to binary if available, None if download fails.
        """
        # Check if we have a binary and if it needs updating
        if self._should_update():
            logger.info("🔄 Checking for llama.cpp updates...")
            if not self._download_latest():
                logger.warning("Failed to download latest llama.cpp release")
                # Fall back to existing binary if available
                if binary_path.exists():
                    logger.info(f"Using existing {name} binary")
                    return binary_path
                return None

        if binary_path.exists():
            return binary_path

        logger.info("📥 Downloading llama.cpp binaries...")
        if self._download_latest():
            return binary_path

        return None

    def _should_update(self) -> bool:
        """Check if binary needs updating.

        Returns:
            True if update needed, False otherwise.
        """
        # If no binaries exist, we need to download
        if not self.quantize_binary_path.exists() or not self.imatrix_binary_path.exists():
            return True

        # Check version file
        if not self.version_file.exists():
            return True

        try:
            with Path(self.version_file).open(encoding="utf-8") as f:
                cached_version = json.load(f)

            # Check if cached version is older than 7 days
            if time.time() - cached_version.get("timestamp", 0) > 7 * 24 * 3600:
                return True

        except Exception:
            return True

        return False

    def _download_latest(self) -> bool:
        """Download latest llama.cpp release.

        Returns:
            True if successful, False otherwise.
        """
        try:
            # Get latest release info
            release_info = self._get_latest_release()
            if not release_info:
                return False

            # Find appropriate asset for platform
            asset_url = self._find_platform_asset(release_info["assets"])
            if not asset_url:
                logger.warning("No suitable binary found for this platform")
                return False

            # Download and extract
            logger.info(f"📥 Downloading from: {asset_url}")
            if not self._download_and_extract(asset_url):
                return False

            # Save version info
            self._save_version_info(release_info)

            logger.info("✅ Successfully downloaded llama.cpp binary")
        except Exception as e:
            logger.error(f"Failed to download llama.cpp: {e}")
            return False
        else:
            return True

    def _get_latest_release(self) -> dict[str, Any] | None:
        """Get latest release info from GitHub API.

        Returns:
            Release info dict or None if failed.
        """
        try:
            with urlopen(self.GITHUB_API) as response:  # noqa: S310
                return json.loads(response.read())
        except Exception as e:
            logger.error(f"Failed to fetch release info: {e}")
            return None

    def _find_platform_asset(self, assets: list[dict[str, Any]]) -> str | None:
        """Find appropriate asset for current platform.

        Returns:
            Download URL for appropriate asset or None.
        """
        patterns = self._get_platform_patterns()
        if not patterns:
            return None

        return self._select_best_asset(assets, patterns)

    def _get_platform_patterns(self) -> list[str]:
        """Get platform patterns for current system.

        Returns:
            List of patterns to match in asset names.
        """
        system = platform.system()
        machine = platform.machine()

        # Get specific patterns for this platform
        patterns = self.PLATFORM_PATTERNS.get((system, machine), [])
        if patterns:
            return patterns

        # Fall back to generic patterns
        generic_patterns = {
            "Linux": ["linux", "ubuntu"],
            "Darwin": ["macos", "darwin"],
            "Windows": ["win", "windows"],
        }
        return generic_patterns.get(system, [])

    def _select_best_asset(self, assets: list[dict[str, Any]], patterns: list[str]) -> str | None:
        """Select the best asset from available options.

        Returns:
            Download URL for best matching asset or None.
        """
        avoid_patterns = ["cuda", "rocm", "hip", "metal", "sycl"]
        prefer_patterns = ["cpu", "vulkan", "avx2", "avx"]

        best_asset = None
        best_score = -1

        for asset in assets:
            name = asset["name"].lower()

            # Skip GPU-specific builds
            if any(pattern in name for pattern in avoid_patterns):
                continue

            # Check platform match
            if not any(pattern in name for pattern in patterns):
                continue

            score = self._score_asset(name, patterns, prefer_patterns)
            if score > best_score:
                best_score = score
                best_asset = asset

        return best_asset["browser_download_url"] if best_asset else None

    def _score_asset(self, name: str, patterns: list[str], prefer_patterns: list[str]) -> int:
        """Score an asset based on platform and preference matching.

        Returns:
            Numeric score for asset quality (higher is better).
        """
        score = 0

        # Platform match bonus
        if any(pattern in name for pattern in patterns):
            score += 10

        # Preference bonuses
        for pattern in prefer_patterns:
            if pattern in name:
                score += 5

        # Archive format preference
        system = platform.system()
        if (system == "Windows" and name.endswith(".zip")) or (
            system != "Windows" and name.endswith(".tar.gz")
        ):
            score += 2

        return score

    def _download_and_extract(self, url: str) -> bool:
        """Download and extract binary archive.

        Args:
            url: Download URL for archive.

        Returns:
            True if successful, False otherwise.
        """
        try:
            # Download to temp file
            temp_file = self.BINARY_DIR / "temp_download"
            logger.info("⬇️ Downloading archive...")
            urlretrieve(url, temp_file)  # noqa: S310

            # Extract based on file type
            if url.endswith(".zip"):
                with zipfile.ZipFile(temp_file, "r") as zf:
                    self._extract_binary_from_archive(zf)
            elif url.endswith((".tar.gz", ".tgz")):
                with tarfile.open(temp_file, "r:gz") as tf:
                    self._extract_binary_from_archive(tf)
            else:
                logger.error(f"Unknown archive format: {url}")
                return False

            # Clean up temp file
            temp_file.unlink()

            # Make binaries executable on Unix
            if platform.system() != "Windows":
                self.quantize_binary_path.chmod(0o755)
                self.imatrix_binary_path.chmod(0o755)

        except Exception as e:
            logger.error(f"Failed to download and extract: {e}")
            return False
        else:
            return True

    def _extract_binary_from_archive(self, archive: Any) -> None:
        """Extract llama binaries and their dependencies from archive."""
        target_binaries = {
            "llama-quantize": ["llama-quantize", "llama-quantize.exe", "quantize", "quantize.exe"],
            "llama-imatrix": ["llama-imatrix", "llama-imatrix.exe", "imatrix", "imatrix.exe"],
        }

        # Also extract shared libraries
        shared_libs = [
            "libllama.so",
            "libggml-base.so",
            "libggml.so",
            "libllama.dll",
            "libggml.dll",
        ]

        members = self._get_archive_members(archive)
        extracted = self._extract_matching_binaries(archive, members, target_binaries)
        self._extract_shared_libraries(archive, members, shared_libs)
        self._cleanup_extracted_directories()
        self._report_missing_binaries(extracted)

    def _get_archive_members(self, archive: Any) -> list[str]:
        """Get list of members from archive.

        Returns:
            List of member names in the archive.
        """
        if isinstance(archive, zipfile.ZipFile):
            return archive.namelist()
        return [m.name for m in archive.getmembers()]

    def _extract_matching_binaries(
        self,
        archive: Any,
        members: list[str],
        target_binaries: dict[str, list[str]],
    ) -> set[str]:
        """Extract binaries that match target patterns.

        Returns:
            Set of successfully extracted binary types.
        """
        extracted = set()
        for member in members:
            base_name = Path(member).name

            for binary_type, possible_names in target_binaries.items():
                if base_name in possible_names:
                    self._extract_single_binary(archive, member, binary_type)
                    extracted.add(binary_type)
                    break
        return extracted

    def _extract_single_binary(self, archive: Any, member: str, binary_type: str) -> None:
        """Extract a single binary from archive."""
        logger.info(f"📦 Extracting {Path(member).name} as {binary_type}...")
        target_path = self._get_binary_path(binary_type)

        if isinstance(archive, zipfile.ZipFile):
            self._extract_from_zip(archive, member, target_path)
        else:  # tarfile
            self._extract_from_tar(archive, member, target_path)

    def _extract_from_zip(self, archive: zipfile.ZipFile, member: str, target_path: Path) -> None:
        """Extract binary from zip archive."""
        temp_path = self.BINARY_DIR / "temp_binary"
        with archive.open(member) as source, temp_path.open("wb") as target:
            shutil.copyfileobj(source, target)
        shutil.move(str(temp_path), str(target_path))

    def _extract_from_tar(self, archive: tarfile.TarFile, member: str, target_path: Path) -> None:
        """Extract binary from tar archive."""
        archive.extract(member, self.BINARY_DIR)
        extracted_path = self.BINARY_DIR / member
        if extracted_path != target_path:
            shutil.move(str(extracted_path), str(target_path))

    def _cleanup_extracted_directories(self) -> None:
        """Clean up any extracted directories."""
        for item in self.BINARY_DIR.iterdir():
            if item.is_dir() and item.name != "binaries":
                shutil.rmtree(item)

    def _extract_shared_libraries(
        self, archive: Any, members: list[str], lib_patterns: list[str]
    ) -> None:
        """Extract shared libraries needed by the binaries.

        Args:
            archive: The archive object.
            members: List of all archive members.
            lib_patterns: Patterns to match for library files.
        """
        for member in members:
            base_name = Path(member).name
            if any(lib in base_name for lib in lib_patterns):
                logger.info(f"📚 Extracting library: {base_name}")
                target_path = self.BINARY_DIR / base_name

                if isinstance(archive, zipfile.ZipFile):
                    temp_path = self.BINARY_DIR / "temp_lib"
                    with archive.open(member) as source, temp_path.open("wb") as target:
                        shutil.copyfileobj(source, target)
                    shutil.move(str(temp_path), str(target_path))
                else:  # tarfile
                    archive.extract(member, self.BINARY_DIR)
                    extracted_path = self.BINARY_DIR / member
                    if extracted_path != target_path:
                        shutil.move(str(extracted_path), str(target_path))

                # Make libraries executable on Unix
                if platform.system() != "Windows":
                    target_path.chmod(0o755)

    def _report_missing_binaries(self, extracted: set[str]) -> None:
        """Report any missing binaries."""
        if "llama-quantize" not in extracted:
            logger.warning("llama-quantize binary not found in archive")
        if "llama-imatrix" not in extracted:
            logger.warning("llama-imatrix binary not found in archive")

    def _save_version_info(self, release_info: dict[str, Any]) -> None:
        """Save version information to cache.

        Args:
            release_info: GitHub release information.
        """
        version_data = {
            "version": release_info.get("tag_name", "unknown"),
            "timestamp": time.time(),
            "url": release_info.get("html_url", ""),
        }

        with Path(self.version_file).open("w", encoding="utf-8") as f:
            json.dump(version_data, f, indent=2)

        logger.info(f"📌 Cached version: {version_data['version']}")

    def check_binary_works(self, binary_path: Path | None = None) -> bool:
        """Check if the binary actually works.

        Args:
            binary_path: Path to binary to check. If None, checks quantize binary.

        Returns:
            True if binary executes successfully, False otherwise.
        """
        if binary_path is None:
            binary_path = self.quantize_binary_path

        if not binary_path.exists():
            return False

        try:
            # Set LD_LIBRARY_PATH to include binary directory for shared libraries
            env = os.environ.copy()
            if platform.system() != "Windows":
                lib_path = str(self.BINARY_DIR)
                if "LD_LIBRARY_PATH" in env:
                    env["LD_LIBRARY_PATH"] = f"{lib_path}:{env['LD_LIBRARY_PATH']}"
                else:
                    env["LD_LIBRARY_PATH"] = lib_path

            result = subprocess.run(
                [str(binary_path), "--help"],
                check=False,
                capture_output=True,
                text=True,
                timeout=5,
                env=env,
            )
        except Exception:
            return False
        else:
            # llama-quantize returns 1 for --help but shows usage, which means it works
            return result.returncode in {0, 1} and "usage:" in result.stdout.lower()
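A minimal usage sketch of the class above, using only the public methods it defines (the import path is the file added in this commit; this is an illustration, not code from the repository):

from helpers.services.binary_manager import BinaryManager

manager = BinaryManager()                      # creates .cache/llm-gguf-tools/binaries/
quantize = manager.get_quantise_binary()       # downloads a llama.cpp release if needed
if quantize and manager.check_binary_works(quantize):
    print(f"llama-quantize ready at {quantize}")
else:
    print("No working llama-quantize binary available")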
helpers/services/filesystem.py
@@ -34,7 +34,7 @@ class FilesystemService:
        size formatting across the toolset.

        Returns:
            Human-readable file size string (e.g., "1.5G", "750M").
            Human-readable file size string (e.g. "1.5G", "750M").
        """
        try:
            result = subprocess.run(
@@ -8,6 +8,9 @@ Uses UK English spelling conventions throughout.

from __future__ import annotations

import gc
import json
import traceback
from pathlib import Path
from typing import TYPE_CHECKING, Any, Protocol

import gguf

@@ -38,8 +41,6 @@ class TensorMapper(Protocol):


if TYPE_CHECKING:
    from pathlib import Path

    import numpy as np

    from helpers.models.conversion import ModelConfig

@@ -77,6 +78,11 @@ class GGUFWriter:
        self.writer.add_description(f"Converted from {model_config.architectures[0]}")
        self.writer.add_file_type(gguf.LlamaFileType.ALL_F32)

        # Log architecture being used
        logger.info(f"Setting GGUF architecture: {self.architecture}")
        if self.architecture not in {"llama", "qwen2", "gemma", "phi3", "falcon", "gpt2"}:
            logger.warning(f"Architecture '{self.architecture}' may not be supported by llama.cpp")

        # Model parameters from config
        params = model_config.to_gguf_params()
        self.writer.add_context_length(params.context_length)
@@ -122,10 +128,239 @@ class GGUFWriter:
        self.writer.add_eos_token_id(tokeniser_config.get("eos_token_id", 2))
        self.writer.add_unk_token_id(tokeniser_config.get("unk_token_id", 0))
        self.writer.add_pad_token_id(tokeniser_config.get("pad_token_id", 0))
        self.writer.add_tokenizer_model(tokeniser_config.get("model_type", "llama"))

        # Add BOS/EOS token addition flags if available
        if "add_bos_token" in tokeniser_config:
            self.writer.add_add_bos_token(tokeniser_config["add_bos_token"])
        if "add_eos_token" in tokeniser_config:
            self.writer.add_add_eos_token(tokeniser_config["add_eos_token"])

        # Note: tokenizer_model is set by add_tokeniser_vocabulary based on actual tokenizer type

        logger.info("Added tokeniser configuration")

    def add_tokeniser_vocabulary(self, model_path: Path) -> None:
        """Add full tokeniser vocabulary to GGUF file.

        Loads and embeds the complete tokeniser vocabulary including tokens,
        merges, and scores to enable standalone model usage without external
        tokeniser files. Supports BPE, Unigram, and WordPiece tokenizers.
        """
        tokenizer_path = model_path / "tokenizer.json"
        if not tokenizer_path.exists():
            logger.warning("tokenizer.json not found, skipping vocabulary embedding")
            return

        try:
            with Path(tokenizer_path).open(encoding="utf-8") as f:
                tokenizer_data = json.load(f)

            model_data = tokenizer_data.get("model", {})
            model_type = model_data.get("type", "")

            # Get pre-tokenizer information
            pre_tokenizer = tokenizer_data.get("pre_tokenizer", {})
            pre_tokenizer_type = self._get_pre_tokenizer_type(pre_tokenizer)

            # Get added tokens
            added_tokens = tokenizer_data.get("added_tokens", [])

            if model_type == "BPE":
                self._add_bpe_tokenizer(model_data, added_tokens, pre_tokenizer_type)
            elif model_type == "Unigram":
                self._add_unigram_tokenizer(model_data, added_tokens)
            elif model_type == "WordPiece":
                self._add_wordpiece_tokenizer(model_data, added_tokens)
            else:
                logger.warning(f"Unsupported tokenizer type: {model_type}")
                # Try to add as generic tokenizer
                self._add_generic_tokenizer(model_data, tokenizer_data)

        except Exception as e:
            logger.error(f"Failed to load tokeniser vocabulary: {e}")
            logger.error(traceback.format_exc())

    def _get_pre_tokenizer_type(self, pre_tokenizer: dict[str, Any]) -> str:
        """Determine pre-tokenizer type from configuration.

        Returns:
            Pre-tokenizer type.
        """
        if not pre_tokenizer:
            return "default"

        # Check for various pre-tokenizer types
        pre_type = pre_tokenizer.get("type", "")
        if "ByteLevel" in str(pre_type):
            return "llama3"
        if "Metaspace" in str(pre_type):
            return "default"

        return "default"

    def _add_bpe_tokenizer(
        self, model_data: dict[str, Any], added_tokens: list[dict[str, Any]], pre_type: str
    ) -> None:
        """Add BPE tokenizer vocabulary to GGUF."""
        vocab = model_data.get("vocab", {})
        merges = model_data.get("merges", [])

        if not vocab:
            logger.warning("No vocabulary found in BPE tokenizer")
            return

        # Create token list sorted by index
        max_idx = max(vocab.values()) if vocab else 0
        tokens = [""] * (max_idx + 1)

        for token, idx in vocab.items():
            if 0 <= idx < len(tokens):
                tokens[idx] = token

        # Handle added tokens
        for added_token in added_tokens:
            token_id = added_token.get("id")
            content = added_token.get("content")
            if token_id is not None and content is not None:
                if token_id >= len(tokens):
                    tokens.extend([""] * (token_id - len(tokens) + 1))
                tokens[token_id] = content

        # Prepare token types
        token_types = []
        for i, _token in enumerate(tokens):
            # Check if it's a special/control token
            is_special = any(
                added_token.get("id") == i and added_token.get("special", False)
                for added_token in added_tokens
            )
            if is_special:
                token_types.append(gguf.TokenType.CONTROL)
            else:
                token_types.append(gguf.TokenType.NORMAL)

        # Add to GGUF
        self.writer.add_tokenizer_model("gpt2")
        self.writer.add_tokenizer_pre(pre_type)
        self.writer.add_token_list(tokens)
        self.writer.add_token_scores([0.0] * len(tokens))
        self.writer.add_token_types(token_types)

        if merges:
            self.writer.add_token_merges(merges)
            logger.info(f"Added {len(merges)} BPE merges")

        logger.info(f"Successfully embedded BPE tokeniser ({len(tokens)} tokens)")

    def _add_unigram_tokenizer(
        self,
        model_data: dict[str, Any],
        added_tokens: list[dict[str, Any]],  # noqa: ARG002
    ) -> None:
        """Add Unigram/SentencePiece tokenizer to GGUF."""
        vocab = model_data.get("vocab", [])
        if not vocab:
            logger.warning("No vocabulary found in Unigram tokenizer")
            return

        tokens = []
        scores = []
        token_types = []

        # Process regular vocabulary
        for item in vocab:
            if isinstance(item, list) and len(item) >= 2:
                token = item[0]
                score = float(item[1]) if len(item) > 1 else 0.0
                tokens.append(token)
                scores.append(score)

                # Determine token type
                if token.startswith("<") and token.endswith(">"):
                    token_types.append(gguf.TokenType.CONTROL)
                elif len(token) == 6 and token.startswith("<0x") and token.endswith(">"):
                    token_types.append(gguf.TokenType.BYTE)
                else:
                    token_types.append(gguf.TokenType.NORMAL)

        # Add to GGUF
        self.writer.add_tokenizer_model("llama")
        self.writer.add_tokenizer_pre("default")
        self.writer.add_token_list(tokens)
        self.writer.add_token_scores(scores)
        self.writer.add_token_types(token_types)

        logger.info(f"Successfully embedded Unigram tokeniser ({len(tokens)} tokens)")

    def _add_wordpiece_tokenizer(
        self,
        model_data: dict[str, Any],
        added_tokens: list[dict[str, Any]],  # noqa: ARG002
    ) -> None:
        """Add WordPiece tokenizer to GGUF."""
        vocab = model_data.get("vocab", {})
        if not vocab:
            logger.warning("No vocabulary found in WordPiece tokenizer")
            return

        # Create token list sorted by index
        max_idx = max(vocab.values()) if vocab else 0
        tokens = [""] * (max_idx + 1)

        for token, idx in vocab.items():
            if 0 <= idx < len(tokens):
                tokens[idx] = token

        # Token types (all normal for WordPiece)
        token_types = [gguf.TokenType.NORMAL] * len(tokens)

        # Add to GGUF
        self.writer.add_tokenizer_model("bert")
        self.writer.add_tokenizer_pre("default")
        self.writer.add_token_list(tokens)
        self.writer.add_token_scores([0.0] * len(tokens))
        self.writer.add_token_types(token_types)

        logger.info(f"Successfully embedded WordPiece tokeniser ({len(tokens)} tokens)")

    def _add_generic_tokenizer(
        self,
        model_data: dict[str, Any],
        tokenizer_data: dict[str, Any],  # noqa: ARG002
    ) -> None:
        """Try to add a generic tokenizer based on available data."""
        vocab = model_data.get("vocab")
        if not vocab:
            logger.warning("Cannot extract vocabulary from unknown tokenizer type")
            return

        # Try to extract tokens in a generic way
        tokens = []
        if isinstance(vocab, dict):
            # Dictionary-style vocab
            max_idx = max(vocab.values()) if vocab else 0
            tokens = [""] * (max_idx + 1)
            for token, idx in vocab.items():
                if 0 <= idx < len(tokens):
                    tokens[idx] = token
        elif isinstance(vocab, list):
            # List-style vocab
            for item in vocab:
                if isinstance(item, str):
                    tokens.append(item)
                elif isinstance(item, list) and len(item) > 0:
                    tokens.append(item[0])

        if tokens:
            self.writer.add_tokenizer_model("llama")  # Default to llama
            self.writer.add_tokenizer_pre("default")
            self.writer.add_token_list(tokens)
            self.writer.add_token_scores([0.0] * len(tokens))
            self.writer.add_token_types([gguf.TokenType.NORMAL] * len(tokens))
            logger.info(f"Added generic tokeniser ({len(tokens)} tokens)")
        else:
            logger.warning("Could not extract tokens from unknown tokenizer format")

    def add_tensor(self, name: str, data: np.ndarray) -> None:
        """Add a tensor to the GGUF file.
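For reference, a heavily trimmed sketch of the tokenizer.json shape the BPE branch above reads; the field names match what the code accesses, but the token values are illustrative and not taken from any real model:

tokenizer_data = {
    "model": {
        "type": "BPE",
        "vocab": {"<s>": 0, "hello": 1, "world": 2},   # token -> id
        "merges": ["h e", "he llo"],                    # BPE merge rules
    },
    "added_tokens": [{"id": 0, "content": "<s>", "special": True}],
    "pre_tokenizer": {"type": "ByteLevel"},             # mapped to "llama3" by _get_pre_tokenizer_type
}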
@@ -219,13 +454,20 @@ class GGUFConverter:

        logger.info(f"Total tensors processed: {tensor_count}")

        # Add tokeniser
        # Add tokeniser configuration
        try:
            tok_config = ConfigParser.load_tokeniser_config(model_path)
            writer_wrapper.add_tokeniser(tok_config)
            logger.info("Tokeniser added")
            logger.info("Tokeniser configuration added")
        except Exception as e:
            logger.warning(f"Could not add tokeniser: {e}")
            logger.warning(f"Could not add tokeniser configuration: {e}")

        # Add tokeniser vocabulary (critical for standalone usage)
        try:
            writer_wrapper.add_tokeniser_vocabulary(model_path)
        except Exception as e:
            logger.error(f"Failed to embed tokeniser vocabulary: {e}")
            logger.error("Model will not work without external tokeniser files!")

        # Finalise file
        writer_wrapper.finalise()
@@ -7,6 +7,7 @@ spelling conventions throughout.

from __future__ import annotations

import json
import re
import shutil
import subprocess

@@ -17,6 +18,7 @@ from typing import TYPE_CHECKING

from helpers.config.quantisation_configs import QUANTISATION_CONFIGS
from helpers.logger import logger
from helpers.models.quantisation import QuantisationType
from helpers.utils.config_parser import ConfigParser

if TYPE_CHECKING:
    from helpers.models.quantisation import ModelSource, QuantisationResult
@@ -260,14 +262,47 @@ class ReadmeGenerator:
        # Get original README content
        original_content = self._get_original_readme(model_source, model_dir)

        # Get architecture from config.json
        architecture = self._get_architecture(model_dir)

        # Generate new README
        readme_content = self._generate_readme_content(
            model_source, results, original_content, output_repo
            model_source, results, original_content, output_repo, architecture, models_dir
        )

        readme_path.write_text(readme_content)
        return readme_path

    def _get_architecture(self, model_dir: Path) -> str | None:
        """Get the architecture from the model's config.json.

        Returns:
            Architecture name or None if not found.
        """
        config_path = model_dir / "config.json"
        if not config_path.exists():
            return None

        try:
            with config_path.open(encoding="utf-8") as f:
                config = json.load(f)

            # Get the architectures field - it's a list
            architectures = config.get("architectures", [])
            if architectures:
                arch_name = architectures[0]

                # Get the mapped architecture (what it will be converted to)
                parser = ConfigParser()
                mapped_arch = parser.get_architecture_mapping(arch_name)

                logger.info(f"Architecture: {arch_name} -> {mapped_arch}")
                return mapped_arch
        except Exception as e:
            logger.warning(f"Could not determine architecture: {e}")

        return None

    def _get_original_readme(self, model_source: ModelSource, model_dir: Path) -> dict[str, str]:
        """Extract original README and metadata.
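As an illustration of what _get_architecture reads, a model's config.json typically carries the architecture as a single-element list; the values here are hypothetical and the mapping table itself is not part of this diff:

import json

config = json.loads('{"architectures": ["Qwen2ForCausalLM"], "hidden_size": 4096}')
arch_name = config.get("architectures", [])[0]   # "Qwen2ForCausalLM"
# ConfigParser.get_architecture_mapping() would then map this HF class name to a
# llama.cpp architecture name such as "qwen2" (mapping not shown in this commit).
print(arch_name)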
@@ -427,6 +462,8 @@ class ReadmeGenerator:
        results: dict[QuantisationType, QuantisationResult],
        original_content: dict[str, str],
        output_repo: str | None = None,
        architecture: str | None = None,
        models_dir: Path | None = None,
    ) -> str:
        """Generate complete README content with quantisation details.
@@ -436,22 +473,27 @@ class ReadmeGenerator:
        Returns:
            Complete README markdown content.
        """
        # Build tags
        our_tags = [
            "quantised",
            "gguf",
            "q3_k_m",
            "q3_k_l",
            "q3_k_xl",
            "q4_k_m",
            "q4_k_l",
            "q5_k_m",
            "q5_k_l",
            "q6_k",
            "q6_k_l",
            "q8_0",
            "bartowski-method",
        ]
        # Build tags based on actual successful quantisations
        our_tags = ["gguf"]

        # Add tags for successful quantisations only
        for quant_type, result in results.items():
            if hasattr(result, "status") and result.status == "completed":
                if quant_type == "F16":
                    our_tags.append("f16")
                elif hasattr(result, "quantisation_type"):
                    # Convert to lowercase tag format (e.g., Q3_K_M -> q3_k_m)
                    our_tags.append(result.quantisation_type.value.lower())

        # If no quantisations succeeded but F16 is available, still add basic tags
        if (
            len(our_tags) == 1
            and "F16" in results
            and hasattr(results["F16"], "status")
            and results["F16"].status in {"completed", "uploading"}
        ):
            our_tags.append("f16")

        original_tags = original_content["tags"].split(",") if original_content["tags"] else []
        all_tags = sorted(set(our_tags + original_tags))
@@ -476,8 +518,8 @@ GGUF quantisations of [{model_source.source_model}]({hf_url}) using
[Bartowski](https://huggingface.co/bartowski)'s method. Created with [llm-gguf-tools](https://git.tomfos.tr/tom/llm-gguf-tools)
which replicates Bartowski's quantisation profiles.

| Variant | Configuration | File Size | Status |
|---|---|---|---|
| Variant | Configuration | Status |
|---|---|---|
"""

        # Add results table - group by layer config patterns
@@ -500,24 +542,91 @@ which replicates Bartowski's quantisation profiles.
                result = type("Result", (), {"status": "planned", "success": False})()

            config = QUANTISATION_CONFIGS.get(quant_type)
            file_size = self._format_file_size(result)
            status = self._format_status(result, model_source, quant_type, output_repo)

            # Get configuration description from the config itself
            config_desc = config.get_compact_config(QUANTISATION_CONFIGS) if config else f"{quant_type} all layers"
            config_desc = (
                config.get_compact_config(QUANTISATION_CONFIGS)
                if config
                else f"{quant_type} all layers"
            )

            content += f"| **{quant_type.value}** | {config_desc} | {file_size} | {status} |\n"
            content += f"| **{quant_type.value}** | {config_desc} | {status} |\n"

        # Add F16 row at the bottom if we converted from SafeTensors
        # Note: Named "f16" for compatibility, but contains mixed F16/F32 tensors
        # (BF16 source tensors are converted to F32 to preserve precision)
        if not model_source.is_gguf_repo and output_repo:
            f16_filename = f"{model_source.original_author}-{model_source.model_name}-f16.gguf"
            f16_url = f"https://huggingface.co/{output_repo}/blob/main/{f16_filename}"

            # Get F16 result from results dict (if tracking it)
            f16_result = results.get("F16")

            # Get file size
            f16_size = "-"
            if f16_result and hasattr(f16_result, "file_size"):
                f16_size = f16_result.file_size
            elif models_dir:
                # Try to get from actual file
                f16_path = models_dir / model_source.model_name / f16_filename
                if f16_path.exists():
                    size_bytes = f16_path.stat().st_size
                    size_gb = size_bytes / GIBIBYTE
                    f16_size = f"{size_gb:.1f}GB"

            # Format status based on upload state
            if f16_result and hasattr(f16_result, "status"):
                if f16_result.status == "uploading":
                    f16_status = f"⬆️ Uploading... ({f16_size})"
                elif f16_result.status == "completed":
                    f16_status = f"[✅ {f16_size}]({f16_url})"
                else:
                    f16_status = "⏳ Queued"
            else:
                # Default to available if no status tracking
                f16_status = f"[✅ {f16_size}]({f16_url})"

            content += f"| **F16** | Full precision GGUF (F16/F32 mixed) | {f16_status} |\n"

        content += """

**Key:** `E` = Embeddings, `O` = Output, `A` = Attention, `F` = FFN

See [Bartowski Analysis](https://git.tomfos.tr/tom/llm-gguf-tools/src/branch/main/docs/bartowski_analysis.md)
for detailed quantisation strategies and [Documentation](https://git.tomfos.tr/tom/llm-gguf-tools/src/branch/main/docs/)
for more on the tools and methods I use.

"""

        # Add warning for unsupported architectures
        if architecture:
            supported_archs = {
                "llama",
                "qwen2",
                "gemma",
                "phi3",
                "falcon",
                "gpt2",
                "gptj",
                "gptneox",
                "mpt",
                "baichuan",
                "stablelm",
            }
            if architecture not in supported_archs:
                content += (
                    f"⚠️ **Note:** This model uses the `{architecture}` architecture, which is not "
                    "yet supported by llama.cpp for quantisation. If quantisations failed, this is "
                    "why - llama.cpp cannot quantise architectures it doesn't recognise. The F16 "
                    "GGUF file is provided as a full-precision fallback (requires ~2x model size "
                    f"in VRAM). For `{architecture}` support, check with your inference software "
                    "or wait for llama.cpp updates.\n\n"
                )

        content += (
            "See [Bartowski Analysis](https://git.tomfos.tr/tom/llm-gguf-tools/src/branch/main/docs/"
            "bartowski_analysis.md) for detailed quantisation strategies and "
            "[Documentation](https://git.tomfos.tr/tom/llm-gguf-tools/src/branch/main/docs/) "
            "for more on the tools and methods I use.\n\n"
        )

        # Add original content
        if original_content["readme"]:
            content += "## Original Model Card\n\n---\n\n" + original_content["readme"]
@@ -570,6 +679,15 @@ for more on the tools and methods I use.
        if hasattr(result, "status") and result.status in status_map:
            base_status = status_map[result.status]

            # Check for architecture not supported error
            if (
                result.status == "failed"
                and hasattr(result, "error_message")
                and result.error_message
                and "architecture not supported" in str(result.error_message).lower()
            ):
                return "⚠️ Skipped"

            if result.status == "uploading" and hasattr(result, "file_size") and result.file_size:
                return f"{base_status} ({result.file_size})"
            if result.status == "completed" or (hasattr(result, "success") and result.success):
helpers/services/imatrix_generator.py (new file, 258 lines)
@@ -0,0 +1,258 @@
"""Importance matrix generation service.

Generates importance matrices using llama-imatrix binary with calibration
data for improved quantisation quality.
"""

from __future__ import annotations

import os
import platform
import subprocess
from pathlib import Path
from typing import TYPE_CHECKING

from helpers.logger import logger
from helpers.services.binary_manager import BinaryManager

if TYPE_CHECKING:
    from helpers.models.quantisation import ModelSource


class IMatrixGenerator:
    """Generates importance matrices for quantisation guidance.

    Uses llama-imatrix binary to compute importance matrices from
    calibration data, which helps preserve model quality during
    quantisation by identifying critical weights.
    """

    # Default calibration data location
    CALIBRATION_DATA = Path("resources") / "imatrix_data.txt"

    def __init__(self) -> None:
        """Initialise imatrix generator."""
        self.binary_manager = BinaryManager()
        self.imatrix_binary = self._get_imatrix_binary()

    def _get_imatrix_binary(self) -> Path | None:
        """Get llama-imatrix binary, downloading if necessary.

        Returns:
            Path to binary if found, None otherwise.
        """
        # First check local directory for manual placement
        local_binary = Path("./llama-imatrix")
        if local_binary.exists():
            logger.info(f"Using local llama-imatrix binary: {local_binary}")
            return local_binary

        # Download from GitHub releases
        binary_path = self.binary_manager.get_imatrix_binary()
        if binary_path and self.binary_manager.check_binary_works(binary_path):
            logger.info(f"Using llama-imatrix binary: {binary_path}")
            return binary_path

        logger.warning("llama-imatrix binary not available")
        return None

    def can_generate(self) -> bool:
        """Check if imatrix generation is available.

        Returns:
            True if binary and calibration data are available.
        """
        return self.imatrix_binary is not None and self.CALIBRATION_DATA.exists()

    def generate_imatrix(
        self,
        f16_model_path: Path,
        output_path: Path,
        calibration_data: Path | None = None,
    ) -> bool:
        """Generate importance matrix for a model.

        Returns:
            True if generation successful, False otherwise.
        """
        validation_error = self._validate_generation_inputs(f16_model_path, calibration_data)
        if validation_error:
            logger.error(validation_error)
            return False

        cal_data = calibration_data or self.CALIBRATION_DATA
        cmd = self._build_imatrix_command(f16_model_path, cal_data, output_path)

        self._log_generation_start(f16_model_path, cal_data, output_path)

        return self._execute_imatrix_generation(cmd, output_path)

    def _validate_generation_inputs(
        self,
        f16_model_path: Path,
        calibration_data: Path | None,
    ) -> str | None:
        """Validate inputs for imatrix generation.

        Returns:
            Error message if validation fails, None if valid.
        """
        if not self.imatrix_binary:
            return "llama-imatrix binary not available"

        if not f16_model_path.exists():
            return f"Model file not found: {f16_model_path}"

        cal_data = calibration_data or self.CALIBRATION_DATA
        if not cal_data.exists():
            return f"Calibration data not found: {cal_data}"

        return None

    def _build_imatrix_command(
        self,
        f16_model_path: Path,
        cal_data: Path,
        output_path: Path,
    ) -> list[str]:
        """Build command for imatrix generation.

        Returns:
            Command list ready for subprocess execution.
        """
        return [
            str(self.imatrix_binary),
            "-m",
            str(f16_model_path),
            "-f",
            str(cal_data),
            "-o",
            str(output_path),
            "--chunks",
            "128",  # Process in chunks for stability
        ]

    def _log_generation_start(
        self,
        f16_model_path: Path,
        cal_data: Path,
        output_path: Path,
    ) -> None:
        """Log the start of imatrix generation."""
        logger.info("🧮 Generating importance matrix...")
        logger.info(f"📊 Model: {f16_model_path.name}")
        logger.info(f"📝 Calibration data: {cal_data.name}")
        logger.info(f"💾 Output: {output_path.name}")

    def _execute_imatrix_generation(self, cmd: list[str], output_path: Path) -> bool:
        """Execute the imatrix generation process.

        Returns:
            True if generation completed successfully, False otherwise.
        """
        # Set LD_LIBRARY_PATH for shared libraries
        env = os.environ.copy()
        if platform.system() != "Windows":
            lib_path = str(self.binary_manager.BINARY_DIR)
            if "LD_LIBRARY_PATH" in env:
                env["LD_LIBRARY_PATH"] = f"{lib_path}:{env['LD_LIBRARY_PATH']}"
            else:
                env["LD_LIBRARY_PATH"] = lib_path

        try:
            process = subprocess.Popen(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                universal_newlines=True,
                bufsize=1,
                env=env,
            )

            self._stream_process_output(process)
            return self._handle_process_completion(process, output_path)

        except Exception as e:
            logger.error(f"❌ Imatrix generation failed: {e}")
            return False

    def _stream_process_output(self, process: subprocess.Popen[str]) -> None:
        """Stream output from the running process."""
        while True:
            if process.stdout is not None:
                output = process.stdout.readline()
            else:
                break
            if not output and process.poll() is not None:
                break
            if output:
                # Filter progress updates for cleaner output
                line = output.strip()
                if line and not line.startswith("["):
                    logger.info(f"  {line}")

    def _handle_process_completion(self, process: subprocess.Popen[str], output_path: Path) -> bool:
        """Handle completion of the imatrix generation process.

        Returns:
            True if process completed successfully and output exists, False otherwise.
        """
        return_code = process.poll()
        if return_code != 0:
            logger.error(f"❌ Imatrix generation failed with return code {return_code}")
            return False

        if not output_path.exists():
            logger.error("Generation completed but output file not found")
            return False

        size_mb = output_path.stat().st_size / (1024 * 1024)
        logger.info(f"✅ Generated imatrix: {output_path.name} ({size_mb:.1f} MB)")
        return True

    def prompt_for_generation(
        self,
        model_source: ModelSource,
        model_dir: Path,
        f16_model_path: Path,
    ) -> Path | None:
        """Prompt user to generate imatrix.

        Args:
            model_source: Model source information.
            model_dir: Model directory.
            f16_model_path: Path to F16 model.

        Returns:
            Path to generated imatrix or None if skipped.
        """
        if not self.can_generate():
            logger.info("⚠️ Imatrix generation not available (missing binary or calibration data)")
            return None

        logger.info("\n" + "=" * 70)
        logger.info("📊 Importance Matrix Generation")
        logger.info("=" * 70)
        logger.info(
            "\nImportance matrices improve quantisation quality by identifying"
            "\ncritical weights in the model. This process takes 5-10 minutes"
            "\nbut significantly improves the quality of smaller quantisations."
        )
        logger.info(f"\nModel: {model_source.model_name}")
        logger.info(f"Calibration data: {self.CALIBRATION_DATA.name}")

        response = input("\n❓ Generate importance matrix? (Y/n): ").strip().lower()

        if response == "n":
            logger.info("Skipping imatrix generation")
            return None

        # Generate imatrix
        output_path = model_dir / "imatrix.dat"
        logger.info("\n⏳ Generating importance matrix (this may take 5-10 minutes)...")

        if self.generate_imatrix(f16_model_path, output_path):
            return output_path

        logger.warning("Failed to generate imatrix, continuing without it")
        return None
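Put together, _build_imatrix_command produces an invocation along these lines; the paths are placeholders and this sketch simply drives the class defined above:

# Roughly equivalent shell command (placeholder paths):
#   llama-imatrix -m model-f16.gguf -f resources/imatrix_data.txt -o imatrix.dat --chunks 128
from pathlib import Path

from helpers.services.imatrix_generator import IMatrixGenerator

gen = IMatrixGenerator()
if gen.can_generate():
    gen.generate_imatrix(
        f16_model_path=Path("model-f16.gguf"),
        output_path=Path("imatrix.dat"),
    )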
@@ -1,82 +1,294 @@
"""Importance matrix (imatrix) management service.
"""Direct llama.cpp binary execution service.

Manages detection and use of existing importance matrix files for
quantisation guidance. Provides user prompts for supplying pre-computed
imatrix files from external sources.
Provides direct execution of llama.cpp quantisation binary with proper
tensor-specific override support for L and XL variants.
"""

from __future__ import annotations

import os
import platform
import subprocess
from pathlib import Path
from typing import TYPE_CHECKING

from helpers.logger import logger
from helpers.services.binary_manager import BinaryManager
from helpers.services.filesystem import FilesystemService

if TYPE_CHECKING:
    from pathlib import Path
    from helpers.models.quantisation import QuantisationConfig


class IMatrixManager:
    """Handles importance matrix file management for quantisation.
class QuantisationExecutor:
    """Executes llama.cpp quantisation with tensor overrides.

    Locates existing importance matrix files or prompts users to provide
    pre-computed matrices from external sources. These matrices guide
    quantisation decisions to preserve model quality.
    Provides direct binary execution with proper command-line flags for
    tensor-specific overrides, supporting Bartowski-style L and XL variants.
    """

    def __init__(self) -> None:
        """Initialise IMatrixManager."""
        """Initialise quantisation executor."""
        self.fs = FilesystemService()
        self.binary_manager = BinaryManager()
        self.quantise_binary = self._get_quantise_binary()
        self.last_error: str | None = None  # Track last error type

    def _get_quantise_binary(self) -> Path | None:
        """Get llama-quantize binary, downloading if necessary.

        Returns:
            Path to binary if found, None otherwise.
        """
        # First check local directory for manual placement
        local_binary = Path("./llama-quantize")
        if local_binary.exists():
            logger.info(f"Using local llama-quantize binary: {local_binary}")
            return local_binary

        # Download from GitHub releases
        binary_path = self.binary_manager.get_quantise_binary()
        if binary_path and self.binary_manager.check_binary_works(binary_path):
            logger.info(f"Using llama-quantize binary: {binary_path}")
            return binary_path

        logger.error("Failed to obtain llama-quantize binary")
        logger.info(
            "You can manually download it from: https://github.com/ggml-org/llama.cpp/releases"
        )
        return None

    def execute_quantisation(
        self,
        input_path: Path,
        output_path: Path,
        config: QuantisationConfig,
        imatrix_path: Path | None = None,
    ) -> bool:
        """Execute quantisation using llama.cpp binary.

        Builds and executes llama-quantize command with proper tensor override
        flags for L and XL variants.

        Returns:
            True if quantisation successful, False otherwise.
        """
        if not self.quantise_binary:
            logger.error("llama-quantize binary not available")
            return False

        # Build command
        cmd = self._build_quantisation_command(input_path, output_path, config, imatrix_path)

        # Execute with real-time output
        return self._execute_command(cmd)

    def _build_quantisation_command(
        self,
        input_path: Path,
        output_path: Path,
        config: QuantisationConfig,
        imatrix_path: Path | None,
    ) -> list[str]:
        """Build llama-quantize command with tensor overrides.

        Returns:
            Command arguments as list.
        """
        cmd = [str(self.quantise_binary)]

        # Add imatrix if available
        if imatrix_path:
            cmd.extend(["--imatrix", str(imatrix_path)])
            if imatrix_path.exists():
                logger.info(f"🧮 Using imatrix: {imatrix_path.name}")

        # Add tensor-specific overrides for L and XL variants
        if config.embedding_type:
            # Use directly from config - already in correct format
            cmd.extend(["--token-embedding-type", config.embedding_type.lower()])
            logger.info(f"⚙️ Token embedding type: {config.embedding_type}")

        if config.output_type:
            # Use directly from config - already in correct format
            cmd.extend(["--output-tensor-type", config.output_type.lower()])
            logger.info(f"⚙️ Output tensor type: {config.output_type}")

        # Note: Per-layer tensor overrides could be added here if needed in future
        # For now, embedding and output overrides handle the L/XL variants

        # Get base quantisation type
        base_quant = self._get_base_quantisation_type(config.name)

        # Add input, output, and base quantisation type
        cmd.extend([str(input_path), str(output_path), base_quant])

        return cmd

    def _get_base_quantisation_type(self, config_name: str) -> str:
        """Get base quantisation type for a config.

        Maps custom variants (Q3_K_L, Q3_K_XL) to their base types (Q3_K_M).

        Returns:
            Base quantisation type string.
        """
        # Mapping of custom variants to base types
        variant_mapping = {
            "Q3_K_L": "Q3_K_M",
            "Q3_K_XL": "Q3_K_M",
            "Q4_K_L": "Q4_K_M",
            "Q4_K_XL": "Q4_K_M",
            "Q5_K_L": "Q5_K_M",
            "Q5_K_XL": "Q5_K_M",
            "Q6_K_L": "Q6_K",
            "Q6_K_XL": "Q6_K",
        }

        return variant_mapping.get(config_name, config_name)

    def _execute_command(self, cmd: list[str]) -> bool:
        """Execute command with real-time output streaming.

        Returns:
            True if successful, False otherwise.
        """
        logger.info(f"💻 Running: {' '.join(cmd)}")
        logger.info("⏳ Quantisation in progress... (this may take several minutes)")

        # Set LD_LIBRARY_PATH for shared libraries
        env = os.environ.copy()
        if platform.system() != "Windows":
            lib_path = str(self.binary_manager.BINARY_DIR)
            if "LD_LIBRARY_PATH" in env:
                env["LD_LIBRARY_PATH"] = f"{lib_path}:{env['LD_LIBRARY_PATH']}"
            else:
                env["LD_LIBRARY_PATH"] = lib_path

        # Track output for architecture detection
        output_lines = []
        architecture_error = False

        try:
            process = subprocess.Popen(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                universal_newlines=True,
                bufsize=1,
                env=env,
            )

            # Stream output
            while True:
                if process.stdout is not None:
                    output = process.stdout.readline()
                else:
                    break
                if not output and process.poll() is not None:
                    break
                if output:
                    output_stripped = output.strip()
                    logger.info(f"📊 {output_stripped}")
                    output_lines.append(output_stripped)

                    # Check for architecture-related errors
                    if any(
                        phrase in output_stripped.lower()
                        for phrase in [
                            "unsupported architecture",
                            "unknown architecture",
                            "architecture not supported",
                            "model architecture",
                            "llama_model_load: error loading model",
                        ]
                    ):
                        architecture_error = True

            return_code = process.poll()
            if return_code == 0:
                logger.info("✅ Quantisation successful!")
                return True

            # Check if this was an architecture error
            if architecture_error or return_code == 1:
                # Look for architecture info in recent output
                for line in output_lines[-10:]:  # Check last 10 lines
                    if "architecture" in line.lower():
                        logger.error("❌ Architecture not supported by llama.cpp")
                        logger.error(" so cannot be quantised with current llama.cpp but")
                        logger.error(" F16 GGUF file can be used for inference if supported")
                        # Store this for the orchestrator to detect
                        self.last_error = "unsupported_architecture"
                        return False

            logger.error(f"❌ Quantisation failed with return code {return_code}")

        except Exception as e:
            logger.error(f"❌ Quantisation failed with exception: {e}")
            return False
        else:
            return False
class IMatrixHandler:
"""Handles importance matrix file management.

Manages detection and use of existing importance matrix files for
quantisation guidance.
"""

def __init__(self) -> None:
"""Initialise IMatrixHandler."""
self.fs = FilesystemService()

def find_imatrix(self, model_dir: Path) -> Path | None:
"""Find or prompt for importance matrix file.

Searches for existing imatrix files first, then provides interactive
prompts for user-supplied matrices. See docs/imatrix_data.md for
instructions on generating imatrix files.
"""Find existing imatrix file in model directory.

Returns:
Path to imatrix file, or None if not available.
Path to imatrix file if found, None otherwise.
"""
imatrix_path = model_dir / "imatrix.dat"

# Check for existing imatrix
if imatrix_path.exists():
logger.info(f"Found existing imatrix: {imatrix_path.name}")
file_size = self.fs.get_file_size(imatrix_path)
logger.info(f"Found existing imatrix: {imatrix_path.name} ({file_size})")
return imatrix_path

# Try user-provided imatrix
return self._prompt_for_user_imatrix(model_dir, imatrix_path)
return None

def _prompt_for_user_imatrix(self, model_dir: Path, imatrix_path: Path) -> Path | None:
|
||||
def prompt_for_user_imatrix(self, model_dir: Path) -> Path | None:
|
||||
"""Prompt user for existing imatrix file.
|
||||
|
||||
Returns:
|
||||
Path to user-provided imatrix, or None if not available.
|
||||
"""
|
||||
imatrix_path = model_dir / "imatrix.dat"
|
||||
|
||||
logger.info(f"Model directory: {model_dir}")
|
||||
logger.info(f"Looking for imatrix file at: {imatrix_path}")
|
||||
logger.info("\n" + "=" * 70)
|
||||
logger.info("📊 No existing imatrix file found")
|
||||
logger.info("\nYou have two options:")
|
||||
logger.info(" 1. Provide a pre-computed imatrix file")
|
||||
logger.info(" (💡 see docs/imatrix_data.md to generate your own)")
|
||||
logger.info(" 2. Skip imatrix usage (lower quality quantisation)")
|
||||
logger.info("=" * 70)
|
||||
logger.info(
|
||||
"Tip: You can download pre-computed imatrix files from Bartowski's repositories!"
|
||||
)
|
||||
logger.info(
|
||||
" Example: https://huggingface.co/bartowski/MODEL-NAME-GGUF/resolve/main/MODEL-NAME.imatrix"
|
||||
)
|
||||
|
||||
response = input("\n❓ Do you have an imatrix file to provide? (y/N): ").strip().lower()
|
||||
response = (
|
||||
input("\n❓ Do you have an imatrix file to place in the model directory? (y/N): ")
|
||||
.strip()
|
||||
.lower()
|
||||
)
|
||||
|
||||
if response != "y":
|
||||
logger.info("Continuing without imatrix (quantisation quality may be lower)")
|
||||
logger.info("ℹ️ See docs/imatrix_data.md for instructions on generating imatrix files") # noqa: RUF001
|
||||
return None
|
||||
|
||||
logger.info(f"\nPlease place your imatrix.dat file in: {model_dir}")
|
||||
input("⏳ Press Enter when you've placed the file (or Ctrl+C to cancel)...")
|
||||
logger.info(f"Please place your imatrix.dat file in: {model_dir}")
|
||||
input("⏳ Press Enter when you've placed the imatrix.dat file (or Ctrl+C to cancel)...")
|
||||
|
||||
if imatrix_path.exists():
|
||||
file_size = self.fs.get_file_size(imatrix_path)
|
||||
logger.info(f"✅ Found imatrix file! ({file_size})")
|
||||
logger.info(f"Found imatrix file! ({file_size})")
|
||||
return imatrix_path
|
||||
|
||||
logger.warning("No imatrix.dat file found - continuing without imatrix")
@ -86,8 +86,8 @@ class LlamaCppPythonAPI:
|
|||
raise RuntimeError(msg)
|
||||
|
||||
# Normalise the config name to extract base type
|
||||
# E.g., "Q4_K_L" or "Q4_K_XL" -> "Q4_K_M" (default for Q4_K)
|
||||
# E.g., "Q4_K_M_XXL" -> "Q4_K_M"
|
||||
# e.g. "Q4_K_L" or "Q4_K_XL" -> "Q4_K_M" (default for Q4_K)
|
||||
# e.g. "Q4_K_M_XXL" -> "Q4_K_M"
|
||||
config_upper = config_name.upper()
|
||||
|
||||
# Direct mapping for exact matches
|
||||
|
@ -224,7 +224,7 @@ class LlamaCppPythonAPI:
|
|||
Args:
|
||||
input_path: Path to input GGUF model.
|
||||
output_path: Path for output quantised model.
|
||||
base_type: Base quantisation type (e.g., "Q4_K_M", "Q6_K").
|
||||
base_type: Base quantisation type (e.g. "Q4_K_M", "Q6_K").
|
||||
embedding_type: Override for token embeddings (None = use base).
|
||||
output_type: Override for output/lm_head layers (None = use base).
|
||||
imatrix_path: Optional importance matrix file.
|
||||
|
@ -470,7 +470,7 @@ class LlamaCppPythonAPI:
|
|||
"""Log current resource usage state.
|
||||
|
||||
Args:
|
||||
phase: Description of current phase (e.g., "before", "after").
|
||||
phase: Description of current phase (e.g. "before", "after").
|
||||
|
||||
Returns:
|
||||
Current memory usage in GB.
|
||||
|
|
|
@ -31,12 +31,14 @@ from helpers.models.quantisation import (
|
|||
QuantisationType,
|
||||
)
|
||||
from helpers.services.huggingface import ReadmeGenerator
|
||||
from helpers.services.llama_cpp import IMatrixManager
|
||||
from helpers.services.imatrix_generator import IMatrixGenerator
|
||||
from helpers.services.llama_cpp import IMatrixHandler
|
||||
from helpers.services.quantisation import HuggingFaceUploader, ModelManager, QuantisationEngine
|
||||
from helpers.utils.tensor_mapping import URLParser
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from types import FrameType
|
||||
from typing import Any
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
|
@ -55,7 +57,8 @@ class QuantisationOrchestrator:
|
|||
# Service dependencies with factory defaults
|
||||
url_parser: URLParser = field(default_factory=URLParser)
|
||||
quantisation_engine: QuantisationEngine = field(default_factory=QuantisationEngine)
|
||||
imatrix_manager: IMatrixManager = field(default_factory=IMatrixManager)
|
||||
imatrix_handler: IMatrixHandler = field(default_factory=IMatrixHandler)
|
||||
imatrix_generator: IMatrixGenerator = field(default_factory=IMatrixGenerator)
|
||||
readme_generator: ReadmeGenerator = field(default_factory=ReadmeGenerator)
|
||||
uploader: HuggingFaceUploader = field(default_factory=HuggingFaceUploader)
|
||||
|
||||
|
@ -172,18 +175,28 @@ class QuantisationOrchestrator:
|
|||
self.models_dir.mkdir(parents=True, exist_ok=True)
|
||||
f16_model_path = self.model_manager.prepare_model(model_source)
|
||||
|
||||
imatrix_path = None
|
||||
if self.use_imatrix:
|
||||
logger.info("Checking for importance matrix (imatrix)...")
|
||||
imatrix_path = self.imatrix_manager.find_imatrix(
|
||||
self.models_dir / model_source.model_name
|
||||
)
|
||||
|
||||
output_repo = (
|
||||
f"{self.uploader.get_username()}/"
|
||||
f"{model_source.original_author}-{model_source.model_name}-GGUF"
|
||||
)
|
||||
|
||||
imatrix_path = None
|
||||
if self.use_imatrix:
|
||||
logger.info("Checking for importance matrix (imatrix)...")
|
||||
model_dir = self.models_dir / model_source.model_name
|
||||
imatrix_path = self.imatrix_handler.find_imatrix(model_dir)
|
||||
|
||||
# If no imatrix found, offer to generate or provide one
|
||||
if not imatrix_path:
|
||||
# First offer to generate
|
||||
imatrix_path = self.imatrix_generator.prompt_for_generation(
|
||||
model_source, model_dir, f16_model_path
|
||||
)
|
||||
|
||||
# If generation was skipped, offer to provide existing one
|
||||
if not imatrix_path:
|
||||
imatrix_path = self.imatrix_handler.prompt_for_user_imatrix(model_dir)
|
||||
|
||||
return model_source, f16_model_path, imatrix_path, output_repo
|
||||
|
||||
def _create_initial_repository(self, model_source: ModelSource, output_repo: str) -> None:
|
||||
|
@ -222,10 +235,63 @@ class QuantisationOrchestrator:
|
|||
types_list = [qt.value for qt in quantisation_types]
|
||||
logger.info(f"Processing {len(quantisation_types)} quantisation types: {types_list}")
|
||||
|
||||
# Track F16 in results for status display (if we converted from SafeTensors)
|
||||
if not model_source.is_gguf_repo:
|
||||
# Get F16 file size
|
||||
f16_size = "-"
|
||||
if f16_model_path.exists():
|
||||
size_bytes = f16_model_path.stat().st_size
|
||||
size_gb = size_bytes / (1024**3)
|
||||
f16_size = f"{size_gb:.1f}GB"
|
||||
|
||||
# Create a simple object for F16 tracking (not a QuantisationResult)
|
||||
# since F16 isn't a quantisation type in our enum
|
||||
f16_result = type(
|
||||
"F16Result",
|
||||
(),
|
||||
{
|
||||
"quantisation_type": "F16",
|
||||
"success": True,
|
||||
"status": "planned",
|
||||
"file_path": f16_model_path,
|
||||
"file_size": f16_size,
|
||||
},
|
||||
)()
|
||||
results["F16"] = f16_result
|
||||
|
||||
# Process with parallel uploads - quantise sequentially but upload in background
|
||||
upload_futures = []
|
||||
upload_futures: list[Any] = []
|
||||
architecture_unsupported = False
|
||||
|
||||
with ThreadPoolExecutor(max_workers=2, thread_name_prefix="upload") as upload_executor:
|
||||
# Start F16 upload first if we have one
|
||||
if not model_source.is_gguf_repo and not self.no_upload and "F16" in results:
|
||||
f16_result = results["F16"]
|
||||
if f16_result.file_path and f16_result.file_path.exists():
|
||||
logger.info("Starting parallel upload of F16 GGUF...")
|
||||
f16_result.status = "uploading"
|
||||
self._update_readme_status(model_source, results, output_repo)
|
||||
|
||||
upload_future = upload_executor.submit(
|
||||
self._upload_f16_and_cleanup,
|
||||
output_repo,
|
||||
f16_result.file_path,
|
||||
model_source,
|
||||
results,
|
||||
)
|
||||
upload_futures.append(upload_future)
|
||||
for i, quant_type in enumerate(quantisation_types, 1):
|
||||
# Skip remaining quantisations if architecture is unsupported
|
||||
if architecture_unsupported:
|
||||
logger.info(f"Skipping {quant_type.value} - architecture not supported")
|
||||
results[quant_type] = QuantisationResult(
|
||||
quantisation_type=quant_type,
|
||||
success=False,
|
||||
status="failed",
|
||||
error_message="Architecture not supported by llama.cpp",
|
||||
)
|
||||
continue
|
||||
|
||||
logger.info(
|
||||
f"Processing quantisation {i}/{len(quantisation_types)}: {quant_type.value}"
|
||||
)
|
||||
|
@ -247,6 +313,30 @@ class QuantisationOrchestrator:
|
|||
results[quant_type] = result
|
||||
logger.debug(f"DEBUG: Quantisation {quant_type.value} completed")
|
||||
|
||||
# Check if this failed due to unsupported architecture
|
||||
if (
|
||||
not result.success
|
||||
and hasattr(self.quantisation_engine.executor, "last_error")
|
||||
and self.quantisation_engine.executor.last_error
|
||||
== "unsupported_architecture"
|
||||
):
|
||||
logger.warning(
|
||||
"Architecture not supported - skipping remaining quantisations"
|
||||
)
|
||||
architecture_unsupported = True
|
||||
# Update the current result to also show as skipped
|
||||
result.error_message = "Architecture not supported by llama.cpp"
|
||||
# Update README immediately to show remaining quantisations as skipped
|
||||
for remaining_quant_type in quantisation_types[i:]:
|
||||
if remaining_quant_type not in results:
|
||||
results[remaining_quant_type] = QuantisationResult(
|
||||
quantisation_type=remaining_quant_type,
|
||||
success=False,
|
||||
status="failed",
|
||||
error_message="Architecture not supported by llama.cpp",
|
||||
)
|
||||
self._update_readme_status(model_source, results, output_repo)
|
||||
|
||||
# Force cleanup between quantisations
|
||||
gc.collect()
|
||||
logger.debug("DEBUG: Garbage collection completed")
|
||||
|
@ -269,6 +359,14 @@ class QuantisationOrchestrator:
|
|||
# Wait for all uploads to complete before returning
|
||||
self._wait_for_uploads(upload_futures)
|
||||
|
||||
# Final README update to ensure all statuses are accurate
|
||||
if not self.no_upload and upload_futures:
|
||||
logger.info("Updating README with final status...")
|
||||
final_readme = self.readme_generator.generate(
|
||||
model_source, results, self.models_dir, output_repo
|
||||
)
|
||||
self.uploader.upload_readme(output_repo, final_readme)
|
||||
|
||||
return results
|
||||
|
||||
def _process_single_quantisation(
|
||||
|
@ -505,12 +603,26 @@ class QuantisationOrchestrator:
|
|||
|
||||
def _wait_for_uploads(self, upload_futures: list) -> None:
|
||||
"""Wait for all parallel uploads to complete."""
|
||||
logger.info("Waiting for any remaining uploads to complete...")
|
||||
if not upload_futures:
|
||||
return
|
||||
|
||||
logger.info(f"Waiting for {len(upload_futures)} uploads to complete...")
|
||||
completed = 0
|
||||
failed = 0
|
||||
|
||||
for future in upload_futures:
|
||||
try:
|
||||
future.result(timeout=300) # 5 minute timeout per upload
|
||||
completed += 1
|
||||
logger.info(f"Upload progress: {completed}/{len(upload_futures)} completed")
|
||||
except Exception as e:
|
||||
logger.warning(f"Upload error: {e}")
|
||||
failed += 1
|
||||
logger.warning(f"Upload error ({completed + failed}/{len(upload_futures)}): {e}")
|
||||
|
||||
if failed > 0:
|
||||
logger.warning(f"Upload summary: {completed} succeeded, {failed} failed")
|
||||
else:
|
||||
logger.info(f"All {completed} uploads completed successfully")
|
||||
|
||||
def _cleanup_files(self, f16_model_path: Path, model_source: ModelSource) -> None:
|
||||
"""Clean up temporary files after processing."""
|
||||
|
@ -573,6 +685,45 @@ class QuantisationOrchestrator:
|
|||
)
|
||||
# Don't re-raise - let other uploads continue
|
||||
|
||||
def _upload_f16_and_cleanup(
|
||||
self,
|
||||
output_repo: str,
|
||||
file_path: Path,
|
||||
model_source: ModelSource,
|
||||
results: dict[str, QuantisationResult],
|
||||
) -> None:
|
||||
"""Upload F16 file and clean up (runs in background thread)."""
|
||||
try:
|
||||
logger.info(f"[PARALLEL] Starting upload of F16 GGUF ({file_path.name})")
|
||||
self.uploader.upload_model_file(output_repo, file_path)
|
||||
logger.info("[PARALLEL] Upload of F16 GGUF completed successfully")
|
||||
|
||||
# Don't delete F16 yet - we still need it for quantisations
|
||||
# It will be deleted in _cleanup_files after all quantisations complete
|
||||
|
||||
results["F16"].status = "completed"
|
||||
updated_readme_path = self.readme_generator.generate(
|
||||
model_source, results, self.models_dir, output_repo
|
||||
)
|
||||
self.uploader.upload_readme(output_repo, updated_readme_path)
|
||||
|
||||
logger.info("[PARALLEL] F16 upload complete")
|
||||
except Exception as e:
|
||||
logger.error(f"[PARALLEL] Failed to upload F16: {e}")
|
||||
results["F16"].status = "failed"
|
||||
results["F16"].error_message = str(e)
|
||||
|
||||
try:
|
||||
updated_readme_path = self.readme_generator.generate(
|
||||
model_source, results, self.models_dir, output_repo
|
||||
)
|
||||
self.uploader.upload_readme(output_repo, updated_readme_path)
|
||||
except Exception as readme_error:
|
||||
logger.error(
|
||||
f"[PARALLEL] Failed to update README after F16 upload error: {readme_error}"
|
||||
)
|
||||
# Don't re-raise - let other uploads continue
|
||||
|
||||
def _print_model_info(self, model_source: ModelSource) -> None:
|
||||
"""Print model information."""
|
||||
logger.info(f"Source URL: {model_source.url}")
|
||||
|
|
|
@ -22,7 +22,7 @@ from helpers.models.quantisation import (
|
|||
)
|
||||
from helpers.services.filesystem import FilesystemService
|
||||
from helpers.services.gguf import GGUFConverter
|
||||
from helpers.services.llama_python import LlamaCppPythonAPI
|
||||
from helpers.services.llama_cpp import QuantisationExecutor
|
||||
from helpers.utils.config_parser import ConfigParser
|
||||
from helpers.utils.tensor_mapping import TensorMapper
|
||||
|
||||
|
@ -32,30 +32,28 @@ class QuantisationEngine:
|
|||
|
||||
Provides flexible quantisation execution supporting multiple tensor
|
||||
precision configurations, importance matrices, and fallback strategies.
|
||||
Uses llama-cpp-python API for direct quantisation without subprocess overhead.
|
||||
Uses direct llama.cpp binary execution with proper tensor overrides.
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
"""Initialise quantisation engine."""
|
||||
self.fs = FilesystemService()
|
||||
self.python_api = LlamaCppPythonAPI()
|
||||
self.executor = QuantisationExecutor()
|
||||
|
||||
def quantise(self, context: QuantisationContext) -> QuantisationResult:
|
||||
"""Perform quantisation using the specified configuration.
|
||||
|
||||
Executes quantisation using Python API. Since llama-cpp-python is a
|
||||
required dependency, we can rely on it being available.
|
||||
Executes quantisation using direct llama.cpp binary with proper
|
||||
tensor override flags for L and XL variants.
|
||||
|
||||
Returns:
|
||||
QuantisationResult with success status and file information.
|
||||
"""
|
||||
logger.debug(f"DEBUG: Starting quantisation for {context.config.name}")
|
||||
logger.info(
|
||||
f"⚙️ Creating {context.config.name} quantisation ({context.config.description})..."
|
||||
)
|
||||
|
||||
output_path = context.get_output_path()
|
||||
logger.debug(f"DEBUG: Output path: {output_path}")
|
||||
|
||||
# Check input file exists and is readable
|
||||
if not context.f16_model_path.exists():
|
||||
|
@ -67,34 +65,20 @@ class QuantisationEngine:
|
|||
error_message=error_msg,
|
||||
)
|
||||
|
||||
# Check if we have enough disk space (rough estimate)
|
||||
try:
|
||||
input_size = context.f16_model_path.stat().st_size
|
||||
logger.debug(f"DEBUG: Input file size: {input_size / (1024**3):.2f} GB")
|
||||
# This is a rough check - actual available space calculation is more complex
|
||||
logger.debug(f"DEBUG: Output directory: {output_path.parent}")
|
||||
except Exception as e:
|
||||
logger.warning(f"⚠️ Could not check disk space: {e}")
|
||||
|
||||
logger.info(f"🎯 Attempting {context.config.name} quantisation...")
|
||||
logger.debug(f"DEBUG: Source: {context.f16_model_path}")
|
||||
logger.debug(f"DEBUG: Target: {output_path}")
|
||||
logger.debug(f"DEBUG: imatrix: {context.imatrix_path}")
|
||||
logger.info(f"📝 Source: {context.f16_model_path}")
|
||||
logger.info(f"📝 Target: {output_path}")
|
||||
|
||||
try:
|
||||
# Use Python API for quantisation
|
||||
logger.info("🐍 Using Python API for quantisation...")
|
||||
logger.debug("DEBUG: Calling python_api.quantise_model...")
|
||||
# Use direct binary execution for quantisation
|
||||
logger.info("🔧 Using llama.cpp binary for quantisation...")
|
||||
|
||||
success = self.python_api.quantise_model(
|
||||
success = self.executor.execute_quantisation(
|
||||
context.f16_model_path, output_path, context.config, context.imatrix_path
|
||||
)
|
||||
|
||||
logger.debug(f"DEBUG: Python API returned: {success}")
|
||||
|
||||
if success:
|
||||
logger.debug("DEBUG: Quantisation successful, creating success result")
|
||||
return self._create_success_result(context.config.name, output_path, "Python API")
|
||||
return self._create_success_result(context.config.name, output_path, "llama.cpp")
|
||||
|
||||
logger.error(f"❌ {context.config.name} quantisation failed")
|
||||
return QuantisationResult(
|
||||
|
@ -175,7 +159,7 @@ class ModelManager:
|
|||
logger.info(f"⬇️ Downloading GGUF file from repository: {model_source.source_model}")
|
||||
logger.info(f"🔍 Looking for file pattern: *{model_source.gguf_file_pattern}*")
|
||||
|
||||
f16_model = model_dir / f"{model_source.model_name}-f16.gguf"
|
||||
f16_model = model_dir / f"{model_source.original_author}-{model_source.model_name}-f16.gguf"
|
||||
|
||||
if f16_model.exists():
|
||||
logger.info(f"✅ Found existing F16 file: {f16_model.name}")
|
||||
|
@ -339,9 +323,17 @@ class ModelManager:
|
|||
Raises:
|
||||
RuntimeError: If download fails.
|
||||
"""
|
||||
# Ensure the model directory and .huggingface subdirectory exist
|
||||
model_dir.mkdir(parents=True, exist_ok=True)
|
||||
huggingface_dir = model_dir / ".huggingface"
|
||||
huggingface_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
try:
|
||||
logger.debug(f"DEBUG: Downloading full repository: {source_model}")
|
||||
result = subprocess.run(
|
||||
logger.info(f"⬇️ Downloading full repository: {source_model}")
|
||||
logger.info("📊 Progress will be shown below...")
|
||||
|
||||
# Use subprocess.Popen to stream output in real-time
|
||||
process = subprocess.Popen(
|
||||
[
|
||||
"huggingface-cli",
|
||||
"download",
|
||||
|
@ -349,13 +341,34 @@ class ModelManager:
|
|||
"--local-dir",
|
||||
str(model_dir),
|
||||
],
|
||||
check=True,
|
||||
capture_output=True,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.STDOUT,
|
||||
text=True,
|
||||
bufsize=1, # Line buffered
|
||||
universal_newlines=True,
|
||||
)
|
||||
logger.debug(
|
||||
f"DEBUG: Repository download completed with return code {result.returncode}"
|
||||
)
|
||||
|
||||
# Stream output line by line
|
||||
for line in process.stdout:
|
||||
# Log download progress lines
|
||||
if line.strip():
|
||||
# Check if it's a progress line (contains %)
|
||||
if "%" in line or "Downloading" in line or "Fetching" in line:
|
||||
# Use info level for progress lines
|
||||
logger.info(f" {line.strip()}")
|
||||
else:
|
||||
# Use debug for other output
|
||||
logger.debug(f" {line.strip()}")
|
||||
|
||||
# Wait for process to complete
|
||||
return_code = process.wait()
|
||||
|
||||
if return_code != 0:
|
||||
msg = f"Repository download failed with return code {return_code}"
|
||||
raise RuntimeError(msg)
|
||||
|
||||
logger.info("✅ Repository download completed successfully")
|
||||
|
||||
except subprocess.CalledProcessError as e:
|
||||
logger.error(f"❌ Failed to download repository {source_model}")
|
||||
logger.error(f"Return code: {e.returncode}")
|
||||
|
@ -386,7 +399,7 @@ class ModelManager:
|
|||
RuntimeError: If conversion fails.
|
||||
"""
|
||||
logger.info("🔄 Converting to GGUF F16 format...")
|
||||
f16_model = model_dir / f"{model_source.model_name}-f16.gguf"
|
||||
f16_model = model_dir / f"{model_source.original_author}-{model_source.model_name}-f16.gguf"
|
||||
|
||||
if f16_model.exists():
|
||||
logger.info("✅ F16 model already exists")
|
||||
|
@ -414,6 +427,28 @@ class ModelManager:
|
|||
if arch != arch_name:
|
||||
logger.info(f"📝 Architecture mapping: {arch_name} → {arch}")
|
||||
|
||||
# Check if architecture is supported by llama.cpp
|
||||
supported_archs = {
|
||||
"llama",
|
||||
"qwen2",
|
||||
"gemma",
|
||||
"phi3",
|
||||
"falcon",
|
||||
"gpt2",
|
||||
"gptj",
|
||||
"gptneox",
|
||||
"mpt",
|
||||
"baichuan",
|
||||
"stablelm",
|
||||
}
|
||||
|
||||
if arch not in supported_archs:
|
||||
logger.warning("=" * 70)
|
||||
logger.warning(f"⚠️ Architecture '{arch_name}' may not be supported by llama.cpp")
|
||||
logger.warning(f"⚠️ The GGUF will be created with architecture: '{arch}'")
|
||||
logger.warning("⚠️ Check if your inference software supports this architecture.")
|
||||
logger.warning("=" * 70)
|
||||
|
||||
# Convert using GGUFConverter
|
||||
tensor_mapper = TensorMapper()
|
||||
success = GGUFConverter.convert_safetensors(
|
||||
|
|
|
@ -107,28 +107,44 @@ class ConfigParser:
|
|||
|
||||
@staticmethod
|
||||
def get_architecture_mapping(architecture: str) -> str:
|
||||
"""Map architecture names to known GGUF architectures.
|
||||
"""Get the GGUF architecture name for a model.
|
||||
|
||||
Provides fallback mappings for architectures not directly supported
|
||||
by GGUF format, translating them to similar known architectures. This
|
||||
enables broader model compatibility whilst maintaining GGUF standards.
|
||||
Returns the original architecture name to preserve model identity.
|
||||
Only maps architectures that are truly compatible.
|
||||
|
||||
Returns:
|
||||
GGUF-compatible architecture name with appropriate fallback to llama.
|
||||
Architecture name for GGUF, preserving original when possible.
|
||||
"""
|
||||
# Architecture mappings to known GGUF types
|
||||
mappings = {
|
||||
"DotsOCRForCausalLM": "qwen2", # Similar architecture
|
||||
"GptOssForCausalLM": "llama", # Use llama as fallback
|
||||
"MistralForCausalLM": "llama", # Mistral is llama-like
|
||||
"Qwen2ForCausalLM": "qwen2",
|
||||
# Only map architectures that are ACTUALLY the same
|
||||
# DO NOT map incompatible architectures
|
||||
known_compatible = {
|
||||
"LlamaForCausalLM": "llama",
|
||||
"MistralForCausalLM": "llama", # Mistral IS llama-compatible
|
||||
"Qwen2ForCausalLM": "qwen2",
|
||||
"GemmaForCausalLM": "gemma",
|
||||
"Phi3ForCausalLM": "phi3",
|
||||
# Add more mappings as needed
|
||||
"FalconForCausalLM": "falcon",
|
||||
"GPT2LMHeadModel": "gpt2",
|
||||
"GPTJForCausalLM": "gptj",
|
||||
"GPTNeoXForCausalLM": "gptneox",
|
||||
"MPTForCausalLM": "mpt",
|
||||
"BaichuanForCausalLM": "baichuan",
|
||||
"StableLMEpochForCausalLM": "stablelm",
|
||||
}
|
||||
|
||||
return mappings.get(architecture, "llama") # Default to llama
|
||||
if architecture in known_compatible:
|
||||
return known_compatible[architecture]
|
||||
|
||||
# For unknown architectures, preserve the original name
|
||||
# This will make it clear the model needs proper support
|
||||
# Remove common suffixes to get cleaner architecture name
|
||||
arch_name = architecture
|
||||
for suffix in ["ForCausalLM", "LMHeadModel", "ForConditionalGeneration"]:
|
||||
if arch_name.endswith(suffix):
|
||||
arch_name = arch_name[: -len(suffix)]
|
||||
break
|
||||
|
||||
return arch_name.lower()
|
||||
|
||||
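# Behaviour sketch (illustrative): known-compatible architectures still map to
# their GGUF names, while unknown ones now keep a lowercased, suffix-stripped
# version of their own name instead of silently falling back to "llama".
from helpers.utils.config_parser import ConfigParser

print(ConfigParser.get_architecture_mapping("MistralForCausalLM"))  # llama
print(ConfigParser.get_architecture_mapping("DotsOCRForCausalLM"))  # dotsocr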
@staticmethod
|
||||
def load_tokeniser_config(model_path: Path) -> dict[str, Any]:
|
||||
|
@ -155,11 +171,33 @@ class ConfigParser:
|
|||
|
||||
config = fs.load_json_config(tokeniser_config_path)
|
||||
|
||||
# Extract token IDs with defaults
|
||||
# Try to find special token IDs from added_tokens_decoder
|
||||
added_tokens = config.get("added_tokens_decoder", {})
|
||||
eos_token_id = config.get("eos_token_id")
|
||||
bos_token_id = config.get("bos_token_id")
|
||||
|
||||
# If not directly specified, search in added_tokens_decoder
|
||||
if eos_token_id is None:
|
||||
for token_id, token_info in added_tokens.items():
|
||||
if token_info.get("content") == "<|endoftext|>":
|
||||
eos_token_id = int(token_id)
|
||||
break
|
||||
|
||||
if bos_token_id is None:
|
||||
for token_id, token_info in added_tokens.items():
|
||||
if token_info.get("content") in {"<|im_start|>", "<s>", "<|startoftext|>"}:
|
||||
bos_token_id = int(token_id)
|
||||
break
|
||||
|
||||
# Extract token IDs with better defaults
|
||||
return {
|
||||
"bos_token_id": config.get("bos_token_id", 1),
|
||||
"eos_token_id": config.get("eos_token_id", 2),
|
||||
"bos_token_id": bos_token_id if bos_token_id is not None else 1,
|
||||
"eos_token_id": eos_token_id if eos_token_id is not None else 2,
|
||||
"unk_token_id": config.get("unk_token_id", 0),
|
||||
"pad_token_id": config.get("pad_token_id", 0),
|
||||
"pad_token_id": config.get(
|
||||
"pad_token_id", eos_token_id if eos_token_id is not None else 0
|
||||
),
|
||||
"model_type": config.get("model_type", "llama"),
|
||||
"add_bos_token": config.get("add_bos_token", True),
|
||||
"add_eos_token": config.get("add_eos_token", False),
|
||||
}
|
||||
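# Rough illustration of the new fallback (hypothetical, trimmed tokeniser
# config; the token IDs are made up for the example): when bos/eos IDs are not
# set at the top level, they are recovered from added_tokens_decoder by content.
example_config = {
    "added_tokens_decoder": {
        "50000": {"content": "<|endoftext|>"},
        "50001": {"content": "<|im_start|>"},
    },
}

eos_token_id = example_config.get("eos_token_id")
if eos_token_id is None:
    for token_id, token_info in example_config["added_tokens_decoder"].items():
        if token_info.get("content") == "<|endoftext|>":
            eos_token_id = int(token_id)  # -> 50000
            break
print(eos_token_id)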
|
|
40
uv.lock
generated
@ -496,26 +496,26 @@ wheels = [
|
|||
|
||||
[[package]]
|
||||
name = "uv"
|
||||
version = "0.8.6"
|
||||
version = "0.8.8"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/b5/3b/1140dbbca9fb3ca32be38e01c670a5980a4ee4874366d70438317876d40a/uv-0.8.6.tar.gz", hash = "sha256:4d4e042f6bd9f143094051a05de758684028f451e563846cbc0c6f505b530cca", size = 3463644, upload-time = "2025-08-07T15:43:34.206Z" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/9c/d0/4cd8ac2c7938da78c8e9ca791205f80e74b0f5a680f2a2d50323d54961d0/uv-0.8.8.tar.gz", hash = "sha256:6880e96cd994e53445d364206ddb4b2fff89fd2fbc74a74bef4a6f86384b07d9", size = 3477036, upload-time = "2025-08-09T00:26:00.883Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/71/64/a96f40f95626c6e353e66f6bc5a5ca7c1399e95caf0dcb56cae38754e073/uv-0.8.6-py3-none-linux_armv6l.whl", hash = "sha256:d96ff3a1d06a6a00ed94dfb2996228153b3b5bfc892174b7556216ab872a91b1", size = 18437310, upload-time = "2025-08-07T15:42:49.611Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/41/30/b2fed99d5a6b16410669f223767f6d65bc6595858622f5f36386892ed963/uv-0.8.6-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:fdceb1ef554df0ddc620bfe83fdcf740829e489c62f78ba1f089abd62c71c63e", size = 18615884, upload-time = "2025-08-07T15:42:53.452Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/d7/82/a53684eadb9cb169eab32ab71f2bdaf7c382819d6de44d4e8df91ca14a00/uv-0.8.6-py3-none-macosx_11_0_arm64.whl", hash = "sha256:7c1f48279ff61940143c78b969094e13324988eabcfcd4799f4350d9d36c1d48", size = 17173005, upload-time = "2025-08-07T15:42:55.571Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/e7/4a/2890d9ccaf4b383fea43ae6362252870dcd97dda7412f34f20d80ccf7a39/uv-0.8.6-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.musllinux_1_1_aarch64.whl", hash = "sha256:1913f5627c57076c88dd38b0173bdb006ae9b8dbd92b1798a1acc9d744c1a7cc", size = 17813305, upload-time = "2025-08-07T15:42:57.998Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/9b/c3/33a10049728ffbcde673b75b9a73cd61bfab5e1598d935d1f1b2556b07a4/uv-0.8.6-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:7796acc3c5b84d5ee5e10cc6cf92eb61c19f6551855d0aa89ef5925e4a371fbf", size = 18159834, upload-time = "2025-08-07T15:43:00.207Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/81/28/ff884f7007a6b9d0e3368dbe4ae7d28acacbaaf1b3a583640e5af6dc5360/uv-0.8.6-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:57a98367bfad38e870e1a8a6626464796ffcee6e937d429fbd7b25ddf46bb36f", size = 18954223, upload-time = "2025-08-07T15:43:03.577Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/78/1d/a4ed2da913ecacc1c976e97dff905979c13359834eeeac8bbaf5ed0b2fca/uv-0.8.6-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:2ac28509db2e52613a59264bdb150d13274ed13e5b305f7e274da8cd83033985", size = 20215802, upload-time = "2025-08-07T15:43:06.181Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/2c/12/c9ca1cc8bdbecd54db4a7c1a44808f15271da60838dfa9f180ce8171407a/uv-0.8.6-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:deab2ce32d2dd7a1c0de459aa23470c60feb0ea24e67c9c5c5988d8bf4eb4a09", size = 19898210, upload-time = "2025-08-07T15:43:09.008Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/c0/15/e10347768b2929ae9c65abbfd0867a736e6227f6d63da1f86fe6bdcbcdca/uv-0.8.6-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5b201ebc1c5c76c3a415fa4edcb25a0e06263d2255319d6d52275c775e926e23", size = 19247208, upload-time = "2025-08-07T15:43:11.578Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/62/8d/dc290df05d1820d003f30e2fb7853496eec43bcb986c5e35aaea2f5343d3/uv-0.8.6-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6acdc77099906ba64bc1b725bef973c10905d7e9596d1b25f271db772bc9e8e4", size = 19261881, upload-time = "2025-08-07T15:43:13.815Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/20/bd/6c3b9c87e4ed323f72de6ece7d51a6179091f0ff6e0c9c6ed29e28efe17c/uv-0.8.6-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:4e81380549151e34ae96d56499438444ba58591ca9f2fc6ba0a867152601849e", size = 18037135, upload-time = "2025-08-07T15:43:15.941Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/7d/e1/b3e825ad9cc3f03f0f3e232286f91aef985d8029db69fd7091c2f332212b/uv-0.8.6-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:c9de4adac36a62e4bddd959ce65fb4bb09b0cbfd95946d50390f2a9c186ecb9c", size = 19040739, upload-time = "2025-08-07T15:43:18.092Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/c5/14/921e2e7b2a4be0bac17f9d04a126546b89828bb33aa56368af7f00538fe3/uv-0.8.6-py3-none-musllinux_1_1_armv7l.whl", hash = "sha256:993af2c295856c5ca053678a8dadc11ce2f85485513ed1568c16e98d5dfa88bf", size = 18060742, upload-time = "2025-08-07T15:43:20.39Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/81/54/0b1ecc64353725b62f02d3739a67a567faa70c76c4ea19a21253df1c4d99/uv-0.8.6-py3-none-musllinux_1_1_i686.whl", hash = "sha256:132e73f1e9fe05edc6c06c00416f7c721c48298786fd7293be6c584793170bbc", size = 18430300, upload-time = "2025-08-07T15:43:22.797Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/da/be/a1a249eacb9b1e397292106250490ec1546a90c0e19de19f0b36f52aecea/uv-0.8.6-py3-none-musllinux_1_1_x86_64.whl", hash = "sha256:ee67acf1b211be2cfbeaec16cde13c8325810d32ff85963a9dedd1f9d7c61ef7", size = 19407124, upload-time = "2025-08-07T15:43:25.915Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/11/18/552bb94bb931ea9d09a0e98e5c3d8cefc8c8db25549af88d1484e52d6cdd/uv-0.8.6-py3-none-win32.whl", hash = "sha256:e35cc1ef79d3dce2b6aeffbfb280d02d5ad741d4ca07874bdf0a4d85c841d9de", size = 18324229, upload-time = "2025-08-07T15:43:28.029Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/fd/df/b7d1171579e2cc821aafc38a86393104e5426ac1ebc4e95be79ac705a11f/uv-0.8.6-py3-none-win_amd64.whl", hash = "sha256:37227aaf1e41c7eda3d7f0028e747a2a2eed3f3506b0adc121a4366e8281115b", size = 20279856, upload-time = "2025-08-07T15:43:30.07Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/09/1b/2629d605e101db6a52397e6ea8859a51af0207cf254051b2a621c683ee07/uv-0.8.6-py3-none-win_arm64.whl", hash = "sha256:0b524de39f317bd8733c38cf100b6f8091d44e06b23f7752523ad1ad1454ede3", size = 18839643, upload-time = "2025-08-07T15:43:32.332Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/08/d5/49e188db80f3d8b1969bdbcb8a5468a3796827f15d773241204f206a9ff6/uv-0.8.8-py3-none-linux_armv6l.whl", hash = "sha256:fcdbee030de120478db1a4bb3e3bbf04eec572527ea9107ecf064a808259b6c9", size = 18470316, upload-time = "2025-08-09T00:25:11.956Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/01/50/add1afadccd141d0d72b54e5146f8181fcc6efd1567a17c5b1edec444010/uv-0.8.8-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:461e8fb83931755cf0596bf1b8ccbfe02765e81a0d392c495c07685d6b6591f9", size = 18468770, upload-time = "2025-08-09T00:25:15.391Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/8c/ac/3c6dc8781d37ef9854f412322caffac2978dd3fa1bf806f7daebcfebf2be/uv-0.8.8-py3-none-macosx_11_0_arm64.whl", hash = "sha256:58056e5ccebb0a1aad27bd89d0ccc5b65c086d5a7f6b0ac16a9dde030b63cf14", size = 17200419, upload-time = "2025-08-09T00:25:18.264Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/a1/9e/c30ea1f634673d234999985984afbe96c3d2a4381986e36df0bb46c0f21b/uv-0.8.8-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.musllinux_1_1_aarch64.whl", hash = "sha256:5b4c56a620137f562e1d7b09eac6c9d4adeb876aefc51be27973257fcb426c9d", size = 17779351, upload-time = "2025-08-09T00:25:20.891Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/2f/89/f2885c6e97a265b4b18050df6285f56c81b603a867a63fcd8f2caa04d95c/uv-0.8.8-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5fc33adb91c4e3db550648aa30c2b97e8e4d8b8842ead7784a9e76dae3cb14dc", size = 18139292, upload-time = "2025-08-09T00:25:23.352Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/38/5f/98dad16987919e7dc02f2566026a263ea6307bf57e8de0008dde4717d9cf/uv-0.8.8-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:19a82d6738d3aa58e6646b9d6c343d103abf0c4caf97a68d16a8cab55282e4be", size = 18932468, upload-time = "2025-08-09T00:25:25.691Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/56/99/52d0d9f53cc5df11b1a459e743bd7b2f4660d49f125a63640eb85ce993e0/uv-0.8.8-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:9dce4de70098cb5b98feea9ef0b8f7db5d6b9deea003a926bc044a793872d719", size = 20251614, upload-time = "2025-08-09T00:25:28.122Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/9e/b1/0698099a905b4a07b8fa9d6838e0680de707216ccf003433ca1b4afff224/uv-0.8.8-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1038324c178d2d7407a4005c4c3294cbad6a02368ba5a85242308de62a6f4e12", size = 19916222, upload-time = "2025-08-09T00:25:30.732Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/7f/29/8384e0f3f3536ef376d94b7ab177753179906a6c2f5bab893e3fb9525b45/uv-0.8.8-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3bd016beea3935f9148b3d2482e3d60dee36f0260f9e99d4f57acfd978c1142a", size = 19238516, upload-time = "2025-08-09T00:25:33.637Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/0e/f1/6c107deccd6e66eb1c46776d8cef4ca9274aac73cec1b14453fe85e18a54/uv-0.8.8-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d0a2b5ebc96aba2b0bf54283d2906b40f32949298cbc6ec48648097ddeac5c5d", size = 19232295, upload-time = "2025-08-09T00:25:37.154Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/c5/96/9f5e935cd970102c67ce2a753ac721665fb4477c262e86afa0ab385cefff/uv-0.8.8-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:e529dc0a1be5e896d299e4eae4599fa68909f8cb3e6c5ee1a46f66c9048e3334", size = 18046917, upload-time = "2025-08-09T00:25:39.72Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/32/75/97f371add0a02e5e37156ac0fea908ab4a1160fdf716d0e6c257b6767122/uv-0.8.8-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:5d58d986c3b6a9ce0fb48cd48b3aee6cb1b1057f928d598432e75a4fcaa370f4", size = 18949133, upload-time = "2025-08-09T00:25:42.139Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/1a/1b/ea988ae9d8c5531454ea6904290e229624c9ea830a5c37b91ec74ebde9a4/uv-0.8.8-py3-none-musllinux_1_1_armv7l.whl", hash = "sha256:e117e1230559058fd286292dd5839e8e82d1aaf05763bf4a496e91fe07b69fa1", size = 18080018, upload-time = "2025-08-09T00:25:44.645Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/ff/14/3b16af331b79ae826d00a73e98f26f7f660dabedc0f82acb99069601b355/uv-0.8.8-py3-none-musllinux_1_1_i686.whl", hash = "sha256:372934fd94193c98dec59bd379cf39e73f906ae6162cbfb66686f32afd75fa0f", size = 18437896, upload-time = "2025-08-09T00:25:49.162Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/1c/b6/c866684da5571dbf42e9a60b6587a62adc8a2eb592f07411d3b29cb09871/uv-0.8.8-py3-none-musllinux_1_1_x86_64.whl", hash = "sha256:9330c924faa9df00a5e78b54561ecf4e5eac1211066f027620dbe85bd6f479ce", size = 19341221, upload-time = "2025-08-09T00:25:51.444Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/49/ea/55a0eff462b2ec5a6327dd87c401c53306406c830fa8f2cabd2af79dd97f/uv-0.8.8-py3-none-win32.whl", hash = "sha256:65113735aa3427d3897e2f537da1331d1391735c6eecb9b820da6a15fd2f6738", size = 18244601, upload-time = "2025-08-09T00:25:53.696Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/bf/c0/f56ddb1b2276405618e3d2522018c962c010fc71f97f385d01b7e1dcd8df/uv-0.8.8-py3-none-win_amd64.whl", hash = "sha256:66189ca0b4051396aa19a6f036351477656073d0fd01618051faca699e1b3cdc", size = 20233481, upload-time = "2025-08-09T00:25:56.247Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/ac/1a/70dc4c730c19f3af40be9450b98b801e03cd6d16609743013f7258f69a29/uv-0.8.8-py3-none-win_arm64.whl", hash = "sha256:1d829486e88ebbf7895306ff09a8b6014d3af7a18e27d751979ee37bf3a27832", size = 18786215, upload-time = "2025-08-09T00:25:58.941Z" },
|
||||
]